[VLM] Support chunk prefill for VLM (#6355)

Co-authored-by: yizhang2077 <1109276519@qq.com>
This commit is contained in:
Chang Su
2025-05-22 20:32:41 -07:00
committed by GitHub
parent 0a4fc73b48
commit 4685fbb888
20 changed files with 510 additions and 184 deletions

View File

@@ -116,6 +116,10 @@ class ModelConfig:
self.is_audio_model = enable_multimodal and is_audio_model(
self.hf_config.architectures
)
self.is_multimodal_chunked_prefill_supported = (
enable_multimodal
and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
)
self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
@@ -574,6 +578,21 @@ def is_encoder_decoder_model(model_architectures: List[str]):
return "MllamaForConditionalGeneration" in model_architectures
def is_multimodal_chunked_prefill_supported(model_architectures: List[str]):
    """Check if chunked prefill is supported for a MultiModal model.

    Args:
        model_architectures: Architecture class names taken from the model's
            HF config (``hf_config.architectures``).

    Returns:
        ``False`` if any of the given architectures is on the deny-list of
        models known not to support multimodal chunked prefill; ``True``
        otherwise (including for an empty list).
    """
    # Architectures known to be incompatible with multimodal chunked prefill.
    # Kept as a set for O(1) membership tests.
    unsupported = {
        "Grok1VForCausalLM",
        "Grok1AForCausalLM",
        "LlavaLlamaForCausalLM",
        "MllamaForConditionalGeneration",
        "CLIPModel",
    }
    # Supported iff none of the model's architectures is on the deny-list;
    # return the boolean directly instead of branching to True/False.
    return unsupported.isdisjoint(model_architectures)
def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
if scale <= 1:
return 1.0