vlm: enable radix cache for qwen-vl models (#5349)

Co-authored-by: Xinyuan Tong <justinning0323@outlook.com>

Author: Mick
Date: 2025-04-24 12:35:05 +09:00
Committed by: GitHub
Parent: 7d0edf3cae
Commit: c998d04b46

26 changed files with 429 additions and 331 deletions

python/sglang/srt/server_args.py

@@ -153,8 +153,7 @@ class ServerArgs:
     enable_nccl_nvls: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
-    enable_llama4_multimodal: Optional[bool] = None
-    enable_gemma3_multimodal: Optional[bool] = None
+    enable_multimodal: Optional[bool] = None
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
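The unified field keeps `Optional[bool] = None` as its default rather than a plain `False`, so downstream code can distinguish "flag not given" (None, presumably left for auto-detection based on the model) from an explicit True. A minimal sketch of that tri-state pattern; the `resolve_multimodal` helper below is illustrative, not part of this commit:

```python
import argparse
from typing import Optional

parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable-multimodal",
    default=None,  # mirrors ServerArgs.enable_multimodal: Optional[bool] = None
    action="store_true",  # passing the flag switches the value to True
)

def resolve_multimodal(flag: Optional[bool], model_is_multimodal: bool) -> bool:
    # Hypothetical resolution: an explicit flag wins; None falls back to the model.
    return model_is_multimodal if flag is None else flag

args = parser.parse_args([])  # flag omitted -> value stays None, not False
assert args.enable_multimodal is None
assert resolve_multimodal(args.enable_multimodal, model_is_multimodal=True) is True
```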
@@ -286,10 +285,6 @@
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"

-        self.enable_multimodal: Optional[bool] = (
-            self.enable_llama4_multimodal or self.enable_gemma3_multimodal
-        )
-
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
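This `__post_init__` hunk drops the derivation that OR-ed the two per-model flags into `enable_multimodal`; with the unified flag now parsed directly, the step is redundant. For reference, a standalone restatement of the removed expression, which preserved `None` when neither flag was given (since `None or None` evaluates to `None` in Python):

```python
from typing import Optional

def old_derivation(
    llama4: Optional[bool], gemma3: Optional[bool]
) -> Optional[bool]:
    # Restates the removed __post_init__ line: OR the per-model flags together.
    return llama4 or gemma3

assert old_derivation(None, None) is None  # unspecified stays unspecified
assert old_derivation(True, None) is True  # either flag enabled multimodal
```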
@@ -982,16 +977,10 @@ class ServerArgs:
         help="Disable the custom all-reduce kernel and fall back to NCCL.",
     )
     parser.add_argument(
-        "--enable-llama4-multimodal",
-        default=ServerArgs.enable_llama4_multimodal,
+        "--enable-multimodal",
+        default=ServerArgs.enable_multimodal,
         action="store_true",
-        help="Enable the multimodal functionality for Llama-4.",
-    )
-    parser.add_argument(
-        "--enable-gemma3-multimodal",
-        default=ServerArgs.enable_gemma3_multimodal,
-        action="store_true",
-        help="Enable the multimodal functionality for Gemma-3.",
+        help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
     )
     parser.add_argument(
         "--disable-overlap-schedule",