vlm: remove redundant d2h movement of mm feature tensors (#9987)

Co-authored-by: Xiang (Kevin) Li <lik@nvidia.com>
This commit is contained in:
Kevin Xiang Li
2025-09-17 15:00:39 -07:00
committed by GitHub
parent 564050766d
commit de28f8e741
2 changed files with 13 additions and 6 deletions

View File

@@ -241,12 +241,13 @@ class BaseMultimodalProcessor(ABC):
return_tensors="pt",
**kwargs,
)
# move feature tensors to cpu
for feature_name in self.FEATURE_NAMES:
if feature_name in result and isinstance(
result[feature_name], torch.Tensor
):
result[feature_name] = result[feature_name].to("cpu")
if not self.server_args.keep_mm_feature_on_device:
# move feature tensors to cpu
for feature_name in self.FEATURE_NAMES:
if feature_name in result and isinstance(
result[feature_name], torch.Tensor
):
result[feature_name] = result[feature_name].to("cpu")
return result

View File

@@ -381,6 +381,7 @@ class ServerArgs:
disable_shared_experts_fusion: bool = False
disable_chunked_prefix_cache: bool = False
disable_fast_image_processor: bool = False
keep_mm_feature_on_device: bool = False
enable_return_hidden_states: bool = False
scheduler_recv_interval: int = 1
numa_node: Optional[List[int]] = None
@@ -2213,6 +2214,11 @@ class ServerArgs:
action="store_true",
help="Adopt base image processor instead of fast image processor.",
)
parser.add_argument(
"--keep-mm-feature-on-device",
action="store_true",
help="Keep multimodal feature tensors on device after processing to save D2H copy.",
)
parser.add_argument(
"--enable-return-hidden-states",
action="store_true",