[Bugfix] Implement multimodal_cpu_fields in model runner (#5196)

### What this PR does / why we need it?

Related to https://github.com/vllm-project/vllm-ascend/issues/4084

Implement `multimodal_cpu_fields` in the model runner.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-12-22 18:39:45 +08:00
parent 052e472453
commit 61efaffcaf
3 changed files with 29 additions and 86 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -792,6 +792,8 @@ class NPUModelRunner(GPUModelRunner):
        # _prepare_inputs may reorder the batch, so we must gather
        # multi-modal outputs after that to ensure the correct order
        if self.is_multimodal_model:
+            self.multimodal_cpu_fields = ["grid_thw"]
+            self._prepare_multimodal_fields()
            with self.maybe_get_ec_connector_output(
                    scheduler_output,
                    encoder_cache=self.encoder_cache,
@@ -3396,6 +3398,33 @@ class NPUModelRunner(GPUModelRunner):
            mtp_slot_pad[unpad_mask] = mtp_slot_ori
            self.mtp_slot_pad = mtp_slot_pad.to(self.device, non_blocking=True)

+    def _prepare_multimodal_fields(self):
+        """
+        Move tensors named in ``self.multimodal_cpu_fields`` to CPU for every
+        request id in ``self.input_batch``, replacing each entry in place.
+        Needed for fields like 'grid_thw', converted to numpy in model forward.
+        """
+        if not self.multimodal_cpu_fields:
+            return
+
+        req_ids = self.input_batch.req_ids
+        for req_id in req_ids:
+            req = self.requests.get(req_id)
+            if req is None:  # no entry in self.requests for this id
+                continue
+
+            mm_data = getattr(req, 'multimodal_data', None)
+            if not mm_data:  # attribute missing or empty
+                continue
+
+            for field in self.multimodal_cpu_fields:
+                if field in mm_data:
+                    tensor = mm_data[field]
+                    if isinstance(
+                            tensor,
+                            torch.Tensor) and tensor.device.type != 'cpu':
+                        mm_data[field] = tensor.cpu()  # store the CPU copy
+
+

@contextmanager
 def _torch_cuda_wrapper():