From 61efaffcaf7225b54c79dd5103bd24835eeb52e0 Mon Sep 17 00:00:00 2001
From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com>
Date: Mon, 22 Dec 2025 18:39:45 +0800
Subject: [PATCH] [Bugfix] Implement multimodal_cpu_fields in model runner
 (#5196)

### What this PR does / why we need it?
Related to https://github.com/vllm-project/vllm-ascend/issues/4084

Implement `multimodal_cpu_fields` in the model runner so that multimodal
tensors such as `grid_thw` are moved back to CPU before the model's forward
pass converts them to numpy. This makes the `patch_qwen3_vl` worker patch
unnecessary, so it is deleted.

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: hfadzxy
---
 vllm_ascend/patch/worker/__init__.py       |  1 -
 vllm_ascend/patch/worker/patch_qwen3_vl.py | 85 ----------------------
 vllm_ascend/worker/model_runner_v1.py      | 29 ++++++++
 3 files changed, 29 insertions(+), 86 deletions(-)
 delete mode 100644 vllm_ascend/patch/worker/patch_qwen3_vl.py

diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index d6e9c049..ec135d2b 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -30,7 +30,6 @@ import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
 import vllm_ascend.patch.worker.patch_minicpm  # noqa
 import vllm_ascend.patch.worker.patch_qwen2_5_vl  # noqa
 import vllm_ascend.patch.worker.patch_qwen2_5_omni  # noqa
-import vllm_ascend.patch.worker.patch_qwen3_vl  # noqa
 import vllm_ascend.patch.worker.patch_rope  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next_mtp  # noqa
diff --git a/vllm_ascend/patch/worker/patch_qwen3_vl.py b/vllm_ascend/patch/worker/patch_qwen3_vl.py
deleted file mode 100644
index 26d94850..00000000
--- a/vllm_ascend/patch/worker/patch_qwen3_vl.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import numpy as np
-import torch
-import torch.nn as nn
-from vllm.model_executor.models.qwen3_vl import Qwen3_VisionTransformer
-
-
-class AscendQwen3_VisionTransformer(nn.Module):
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        grid_thw: torch.Tensor | list[list[int]],
-    ) -> torch.Tensor:
-        hidden_states = x.to(device=self.device,
-                             dtype=self.dtype,
-                             non_blocking=True)
-        hidden_states = self.patch_embed(hidden_states)
-
-        if isinstance(grid_thw, list):
-            grid_thw_list = grid_thw
-            grid_thw = np.array(grid_thw, dtype=np.int32)
-        else:
-            grid_thw = grid_thw.to("cpu")
-            grid_thw_list = grid_thw.tolist()
-            grid_thw = grid_thw.numpy()
-
-        pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list)
-        hidden_states = hidden_states + pos_embeds
-        rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(
-            grid_thw_list)
-        rotary_pos_emb_cos = rotary_pos_emb_cos.to(hidden_states.device,
-                                                   non_blocking=True)
-        rotary_pos_emb_sin = rotary_pos_emb_sin.to(hidden_states.device,
-                                                   non_blocking=True)
-
-        cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2],
-                               grid_thw[:, 0]).cumsum(axis=0, dtype=np.int32)
-        cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens])
-        cu_seqlens = torch.from_numpy(cu_seqlens)
-
-        hidden_states = hidden_states.unsqueeze(1)
-        max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
-        cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
-
-        deepstack_feature_lists = []
-        for layer_num, blk in enumerate(self.blocks):
-            hidden_states = blk(
-                hidden_states,
-                cu_seqlens=cu_seqlens,
-                rotary_pos_emb_cos=rotary_pos_emb_cos,
-                rotary_pos_emb_sin=rotary_pos_emb_sin,
-                max_seqlen=max_seqlen,
-            )
-            if layer_num in self.deepstack_visual_indexes:
-                deepstack_merger_idx = self.deepstack_visual_indexes.index(
-                    layer_num)
-                deepstack_feature = self.deepstack_merger_list[
-                    deepstack_merger_idx](hidden_states)
-                deepstack_feature_lists.append(deepstack_feature)
-        hidden_states = self.merger(hidden_states)
-        hidden_states = torch.cat(
-            [hidden_states] + deepstack_feature_lists,
-            dim=1)  # [seq_len, hidden_size * (1 + depth_of_deepstack)]
-        return hidden_states
-
-
-# NOTE: This will be removed after implementing multimodal_cpu_fields in vllm-ascend model_runner.
-Qwen3_VisionTransformer.forward = AscendQwen3_VisionTransformer.forward
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 570d8a3e..b2c48978 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -792,6 +792,8 @@ class NPUModelRunner(GPUModelRunner):
         # _prepare_inputs may reorder the batch, so we must gather
         # multi-modal outputs after that to ensure the correct order
         if self.is_multimodal_model:
+            self.multimodal_cpu_fields = ["grid_thw"]
+            self._prepare_multimodal_fields()
             with self.maybe_get_ec_connector_output(
                 scheduler_output,
                 encoder_cache=self.encoder_cache,
@@ -3396,6 +3398,33 @@ class NPUModelRunner(GPUModelRunner):
         mtp_slot_pad[unpad_mask] = mtp_slot_ori
         self.mtp_slot_pad = mtp_slot_pad.to(self.device, non_blocking=True)
 
+    def _prepare_multimodal_fields(self):
+        """
+        Ensures specific multimodal tensors are on CPU.
+        This is necessary for fields like 'grid_thw' which are converted to numpy
+        inside the model's forward pass.
+        """
+        if not self.multimodal_cpu_fields:
+            return
+
+        req_ids = self.input_batch.req_ids
+        for req_id in req_ids:
+            req = self.requests.get(req_id)
+            if req is None:
+                continue
+
+            mm_data = getattr(req, 'multimodal_data', None)
+            if not mm_data:
+                continue
+
+            for field in self.multimodal_cpu_fields:
+                if field in mm_data:
+                    tensor = mm_data[field]
+                    if isinstance(
+                            tensor,
+                            torch.Tensor) and tensor.device.type != 'cpu':
+                        mm_data[field] = tensor.cpu()
+
 
 @contextmanager
 def _torch_cuda_wrapper():
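
---

Reviewer note: the snippet below is a self-contained sketch of the invariant the new `_prepare_multimodal_fields` hook enforces: every field listed in `multimodal_cpu_fields` (here only `grid_thw`) must be a CPU tensor before the forward pass, because `torch.Tensor.numpy()` only works on CPU tensors. The standalone `move_cpu_fields` helper and the bare `mm_data` dict are illustrative stand-ins for the runner's per-request state, not part of this patch.

```python
# Hypothetical stand-alone sketch; `move_cpu_fields` and the plain `mm_data`
# dict are illustrative only. The real logic lives in
# NPUModelRunner._prepare_multimodal_fields in the diff above.
import torch

MULTIMODAL_CPU_FIELDS = ["grid_thw"]


def move_cpu_fields(mm_data: dict) -> dict:
    """Copy listed tensors back to CPU so later .numpy() calls succeed."""
    for field in MULTIMODAL_CPU_FIELDS:
        tensor = mm_data.get(field)
        if isinstance(tensor, torch.Tensor) and tensor.device.type != "cpu":
            # .numpy() raises on device tensors, so the copy must happen
            # before the vision tower converts grid_thw.
            mm_data[field] = tensor.cpu()
    return mm_data


if __name__ == "__main__":
    # One image with a (t, h, w) patch grid of 4 x 16 x 16.
    mm_data = {"grid_thw": torch.tensor([[4, 16, 16]])}
    mm_data = move_cpu_fields(mm_data)
    assert mm_data["grid_thw"].device.type == "cpu"
    print(mm_data["grid_thw"].numpy())  # safe: tensor is on CPU
```

With the hook in place, the model-side workaround (`grid_thw.to("cpu")` inside the patched Qwen3-VL forward) is no longer needed, which is why `patch_qwen3_vl.py` can be deleted.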