vlm: remove redundant d2h movement of mm feature tensors (#9987)

Co-authored-by: Xiang (Kevin) Li <lik@nvidia.com>
2025-09-17 15:00:39 -07:00
parent 564050766d
commit de28f8e741
2 changed files with 13 additions and 6 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -381,6 +381,7 @@ class ServerArgs:
    disable_shared_experts_fusion: bool = False
    disable_chunked_prefix_cache: bool = False
    disable_fast_image_processor: bool = False
+    keep_mm_feature_on_device: bool = False
    enable_return_hidden_states: bool = False
    scheduler_recv_interval: int = 1
    numa_node: Optional[List[int]] = None
@@ -2213,6 +2214,11 @@ class ServerArgs:
            action="store_true",
            help="Adopt base image processor instead of fast image processor.",
        )
+        parser.add_argument(
+            "--keep-mm-feature-on-device",
+            action="store_true",
+            help="Keep multimodal feature tensors on device after processing to save D2H copy.",
+        )
        parser.add_argument(
            "--enable-return-hidden-states",
            action="store_true",