[VLM] Support chunk prefill for VLM (#6355)

Co-authored-by: yizhang2077 <1109276519@qq.com>
This commit is contained in:
Chang Su
2025-05-22 20:32:41 -07:00
committed by GitHub
parent 0a4fc73b48
commit 4685fbb888
20 changed files with 510 additions and 184 deletions

View File

@@ -166,6 +166,9 @@ class ModelRunner:
self.is_draft_worker = is_draft_worker
self.is_generation = model_config.is_generation
self.is_multimodal = model_config.is_multimodal
self.is_multimodal_chunked_prefill_supported = (
model_config.is_multimodal_chunked_prefill_supported
)
self.spec_algorithm = SpeculativeAlgorithm.from_string(
server_args.speculative_algorithm
)
@@ -389,12 +392,15 @@ class ModelRunner:
if self.is_multimodal:
self.mem_fraction_static *= 0.90
logger.info(
f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} because this is a multimodal model."
)
server_args.chunked_prefill_size = -1
logger.info(
"Automatically turn off --chunked-prefill-size for multimodal model."
f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
f"because this is a multimodal model."
)
if not self.is_multimodal_chunked_prefill_supported:
server_args.chunked_prefill_size = -1
logger.info(
f"Automatically turn off --chunked-prefill-size as it is not supported for "
f"{self.model_config.hf_config.model_type}"
)
if not self.use_mla_backend:
server_args.disable_chunked_prefix_cache = True