vlm: remove redundant d2h movement of mm feature tensors (#9987)

Co-authored-by: Xiang (Kevin) Li <lik@nvidia.com>
2025-09-17 15:00:39 -07:00
parent 564050766d
commit de28f8e741
2 changed files with 13 additions and 6 deletions
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -241,12 +241,13 @@ class BaseMultimodalProcessor(ABC):
            return_tensors="pt",
            **kwargs,
        )
-        # move feature tensors to cpu
-        for feature_name in self.FEATURE_NAMES:
-            if feature_name in result and isinstance(
-                result[feature_name], torch.Tensor
-            ):
-                result[feature_name] = result[feature_name].to("cpu")
+        if not self.server_args.keep_mm_feature_on_device:
+            # move feature tensors to cpu
+            for feature_name in self.FEATURE_NAMES:
+                if feature_name in result and isinstance(
+                    result[feature_name], torch.Tensor
+                ):
+                    result[feature_name] = result[feature_name].to("cpu")

        return result