Enable FlashInfer support for encoder models and add head_dim padding workaround (#6230)
@@ -27,9 +27,9 @@ from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci

 MODELS = [("BAAI/bge-small-en", 1, 1e-5), ("BAAI/bge-m3", 1, 1e-5)]
-ATTENTION_BACKEND = ["torch_native", "triton"]
+ATTENTION_BACKEND = ["torch_native", "triton", "flashinfer"]
 BATCH_SIZE = [1, 2]
-TORCH_DTYPES = [torch.float32]
+TORCH_DTYPES = [torch.float32, torch.float16]
 sgl_to_st_ratio = []

@@ -126,6 +126,19 @@ class TestEncoderEmbeddingModels(CustomTestCase):
         for attention_backend in ATTENTION_BACKEND:
             for batch_size in BATCH_SIZE:
                 for torch_dtype in TORCH_DTYPES:
+                    # NOTE: FlashInfer currently has limitations with head_dim = 32 or
+                    # other dimensions.
+                    # The FlashInfer head_dim limitation itself is tracked here:
+                    # https://github.com/flashinfer-ai/flashinfer/issues/1048
+                    #
+                    # Flashinfer does not support torch.float32 for dtype_q, so skip it
+                    if attention_backend == "flashinfer":
+                        if (
+                            model == "BAAI/bge-small-en"
+                            or torch_dtype == torch.float32
+                        ):
+                            continue
+
                     self.assert_close_prefill_logits(
                         DEFAULT_PROMPTS,
                         model,
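The diff above only covers the test matrix; the head_dim padding workaround named in the commit title lives in the FlashInfer attention backend and is not shown here. As a rough illustration of the idea (not the code from this commit), the sketch below zero-pads Q/K/V from an unsupported head_dim such as 32 up to 64, runs attention, and slices the extra channels back off. The helper name attention_with_padded_head_dim is made up for this example, and torch.nn.functional.scaled_dot_product_attention (PyTorch >= 2.1 for the explicit scale argument) stands in for the FlashInfer kernel. The key points are that the softmax scale must still come from the original head_dim, and that zero-padding K and V leaves the attended output for the real channels unchanged.

import torch
import torch.nn.functional as F


def attention_with_padded_head_dim(q, k, v, attn_fn, supported_head_dim=64):
    # q, k, v: [batch, num_heads, seq_len, head_dim]
    head_dim = q.shape[-1]
    # The softmax scale must use the original head_dim, not the padded one.
    scale = head_dim ** -0.5
    if head_dim >= supported_head_dim:
        return attn_fn(q, k, v, scale=scale)
    pad = supported_head_dim - head_dim
    # Zero-pad the last (head) dimension; zeros contribute nothing to the
    # Q.K^T scores, and the padded V channels produce zero outputs that are
    # sliced off below.
    out = attn_fn(
        F.pad(q, (0, pad)),
        F.pad(k, (0, pad)),
        F.pad(v, (0, pad)),
        scale=scale,
    )
    return out[..., :head_dim]


if __name__ == "__main__":
    # head_dim = 32, as in BAAI/bge-small-en (hidden size 384 / 12 heads).
    q = torch.randn(2, 12, 16, 32, dtype=torch.float32)
    k = torch.randn(2, 12, 16, 32, dtype=torch.float32)
    v = torch.randn(2, 12, 16, 32, dtype=torch.float32)
    ref = F.scaled_dot_product_attention(q, k, v)
    padded = attention_with_padded_head_dim(
        q, k, v, F.scaled_dot_product_attention, supported_head_dim=64
    )
    torch.testing.assert_close(ref, padded, rtol=1e-4, atol=1e-5)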