Enable FlashInfer support for encoder models and add head_dim padding workaround (#6230)
@@ -27,9 +27,9 @@ from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci

 MODELS = [("BAAI/bge-small-en", 1, 1e-5), ("BAAI/bge-m3", 1, 1e-5)]
-ATTENTION_BACKEND = ["torch_native", "triton"]
+ATTENTION_BACKEND = ["torch_native", "triton", "flashinfer"]
 BATCH_SIZE = [1, 2]
-TORCH_DTYPES = [torch.float32]
+TORCH_DTYPES = [torch.float32, torch.float16]
 sgl_to_st_ratio = []

@@ -126,6 +126,19 @@ class TestEncoderEmbeddingModels(CustomTestCase):
         for attention_backend in ATTENTION_BACKEND:
             for batch_size in BATCH_SIZE:
                 for torch_dtype in TORCH_DTYPES:
+                    # NOTE: FlashInfer currently has limitations with head_dim = 32 or
+                    # other dimensions.
+                    # The FlashInfer head_dim limitation itself is tracked here:
+                    # https://github.com/flashinfer-ai/flashinfer/issues/1048
+                    #
+                    # Flashinfer does not support torch.float32 for dtype_q, so skip it
+                    if attention_backend == "flashinfer":
+                        if (
+                            model == "BAAI/bge-small-en"
+                            or torch_dtype == torch.float32
+                        ):
+                            continue
+
                     self.assert_close_prefill_logits(
                         DEFAULT_PROMPTS,
                         model,
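The diff above only covers the test matrix; the head_dim padding workaround named in the commit title lives in the FlashInfer attention backend and is not shown here. As a rough illustration of the idea (not the code from this commit), the sketch below zero-pads Q/K/V from an unsupported head_dim such as 32 up to 64, runs attention, and slices the extra channels back off. The helper name attention_with_padded_head_dim is made up for this example, and torch.nn.functional.scaled_dot_product_attention (PyTorch >= 2.1 for the explicit scale argument) stands in for the FlashInfer kernel. The key points are that the softmax scale must still come from the original head_dim, and that zero-padding K and V leaves the attended output for the real channels unchanged.

import torch
import torch.nn.functional as F


def attention_with_padded_head_dim(q, k, v, attn_fn, supported_head_dim=64):
    # q, k, v: [batch, num_heads, seq_len, head_dim]
    head_dim = q.shape[-1]
    # The softmax scale must use the original head_dim, not the padded one.
    scale = head_dim ** -0.5
    if head_dim >= supported_head_dim:
        return attn_fn(q, k, v, scale=scale)
    pad = supported_head_dim - head_dim
    # Zero-pad the last (head) dimension; zeros contribute nothing to the
    # Q.K^T scores, and the padded V channels produce zero outputs that are
    # sliced off below.
    out = attn_fn(
        F.pad(q, (0, pad)),
        F.pad(k, (0, pad)),
        F.pad(v, (0, pad)),
        scale=scale,
    )
    return out[..., :head_dim]


if __name__ == "__main__":
    # head_dim = 32, as in BAAI/bge-small-en (hidden size 384 / 12 heads).
    q = torch.randn(2, 12, 16, 32, dtype=torch.float32)
    k = torch.randn(2, 12, 16, 32, dtype=torch.float32)
    v = torch.randn(2, 12, 16, 32, dtype=torch.float32)
    ref = F.scaled_dot_product_attention(q, k, v)
    padded = attention_with_padded_head_dim(
        q, k, v, F.scaled_dot_product_attention, supported_head_dim=64
    )
    torch.testing.assert_close(ref, padded, rtol=1e-4, atol=1e-5)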