From 61efaffcaf7225b54c79dd5103bd24835eeb52e0 Mon Sep 17 00:00:00 2001
From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com>
Date: Mon, 22 Dec 2025 18:39:45 +0800
Subject: [PATCH] [Bugfix] Implement multimodal_cpu_fields in model runner
 (#5196)

### What this PR does / why we need it?
Related to https://github.com/vllm-project/vllm-ascend/issues/4084

Implement `multimodal_cpu_fields` in the model runner so that multimodal
tensors such as `grid_thw` are moved back to CPU before the model's forward
pass converts them to numpy. This makes the `patch_qwen3_vl` worker patch
unnecessary, so it is deleted.

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: hfadzxy
---
 vllm_ascend/patch/worker/__init__.py       |  1 -
 vllm_ascend/patch/worker/patch_qwen3_vl.py | 85 ----------------------
 vllm_ascend/worker/model_runner_v1.py      | 29 ++++++++
 3 files changed, 29 insertions(+), 86 deletions(-)
 delete mode 100644 vllm_ascend/patch/worker/patch_qwen3_vl.py

diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index d6e9c049..ec135d2b 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -30,7 +30,6 @@ import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
 import vllm_ascend.patch.worker.patch_minicpm  # noqa
 import vllm_ascend.patch.worker.patch_qwen2_5_vl  # noqa
 import vllm_ascend.patch.worker.patch_qwen2_5_omni  # noqa
-import vllm_ascend.patch.worker.patch_qwen3_vl  # noqa
 import vllm_ascend.patch.worker.patch_rope  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next_mtp  # noqa
diff --git a/vllm_ascend/patch/worker/patch_qwen3_vl.py b/vllm_ascend/patch/worker/patch_qwen3_vl.py
deleted file mode 100644
index 26d94850..00000000
--- a/vllm_ascend/patch/worker/patch_qwen3_vl.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import numpy as np
-import torch
-import torch.nn as nn
-from vllm.model_executor.models.qwen3_vl import Qwen3_VisionTransformer
-
-
-class AscendQwen3_VisionTransformer(nn.Module):
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        grid_thw: torch.Tensor | list[list[int]],
-    ) -> torch.Tensor:
-        hidden_states = x.to(device=self.device,
-                             dtype=self.dtype,
-                             non_blocking=True)
-        hidden_states = self.patch_embed(hidden_states)
-
-        if isinstance(grid_thw, list):
-            grid_thw_list = grid_thw
-            grid_thw = np.array(grid_thw, dtype=np.int32)
-        else:
-            grid_thw = grid_thw.to("cpu")
-            grid_thw_list = grid_thw.tolist()
-            grid_thw = grid_thw.numpy()
-
-        pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list)
-        hidden_states = hidden_states + pos_embeds
-        rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(
-            grid_thw_list)
-        rotary_pos_emb_cos = rotary_pos_emb_cos.to(hidden_states.device,
-                                                   non_blocking=True)
-        rotary_pos_emb_sin = rotary_pos_emb_sin.to(hidden_states.device,
-                                                   non_blocking=True)
-
-        cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2],
-                               grid_thw[:, 0]).cumsum(axis=0, dtype=np.int32)
-        cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens])
-        cu_seqlens = torch.from_numpy(cu_seqlens)
-
-        hidden_states = hidden_states.unsqueeze(1)
-        max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
-        cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
-
-        deepstack_feature_lists = []
-        for layer_num, blk in enumerate(self.blocks):
-            hidden_states = blk(
-                hidden_states,
-                cu_seqlens=cu_seqlens,
-                rotary_pos_emb_cos=rotary_pos_emb_cos,
-                rotary_pos_emb_sin=rotary_pos_emb_sin,
-                max_seqlen=max_seqlen,
-            )
-            if layer_num in self.deepstack_visual_indexes:
-                deepstack_merger_idx = self.deepstack_visual_indexes.index(
-                    layer_num)
-                deepstack_feature = self.deepstack_merger_list[
-                    deepstack_merger_idx](hidden_states)
-                deepstack_feature_lists.append(deepstack_feature)
-        hidden_states = self.merger(hidden_states)
-        hidden_states = torch.cat(
-            [hidden_states] + deepstack_feature_lists,
-            dim=1)  # [seq_len, hidden_size * (1 + depth_of_deepstack)]
-        return hidden_states
-
-
-# NOTE: This will be removed after implementing multimodal_cpu_fields in vllm-ascend model_runner.
-Qwen3_VisionTransformer.forward = AscendQwen3_VisionTransformer.forward
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 570d8a3e..b2c48978 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -792,6 +792,8 @@ class NPUModelRunner(GPUModelRunner):
         # _prepare_inputs may reorder the batch, so we must gather
         # multi-modal outputs after that to ensure the correct order
         if self.is_multimodal_model:
+            self.multimodal_cpu_fields = ["grid_thw"]
+            self._prepare_multimodal_fields()
             with self.maybe_get_ec_connector_output(
                 scheduler_output,
                 encoder_cache=self.encoder_cache,
@@ -3396,6 +3398,33 @@ class NPUModelRunner(GPUModelRunner):
         mtp_slot_pad[unpad_mask] = mtp_slot_ori
         self.mtp_slot_pad = mtp_slot_pad.to(self.device, non_blocking=True)
 
+    def _prepare_multimodal_fields(self):
+        """
+        Ensures specific multimodal tensors are on CPU.
+        This is necessary for fields like 'grid_thw' which are converted to numpy
+        inside the model's forward pass.
+        """
+        if not self.multimodal_cpu_fields:
+            return
+
+        req_ids = self.input_batch.req_ids
+        for req_id in req_ids:
+            req = self.requests.get(req_id)
+            if req is None:
+                continue
+
+            mm_data = getattr(req, 'multimodal_data', None)
+            if not mm_data:
+                continue
+
+            for field in self.multimodal_cpu_fields:
+                if field in mm_data:
+                    tensor = mm_data[field]
+                    if isinstance(
+                            tensor,
+                            torch.Tensor) and tensor.device.type != 'cpu':
+                        mm_data[field] = tensor.cpu()
+
 
 @contextmanager
 def _torch_cuda_wrapper():
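
---

Reviewer note: the snippet below is a self-contained sketch of the invariant the new `_prepare_multimodal_fields` hook enforces: every field listed in `multimodal_cpu_fields` (here only `grid_thw`) must be a CPU tensor before the forward pass, because `torch.Tensor.numpy()` only works on CPU tensors. The standalone `move_cpu_fields` helper and the bare `mm_data` dict are illustrative stand-ins for the runner's per-request state, not part of this patch.

```python
# Hypothetical stand-alone sketch; `move_cpu_fields` and the plain `mm_data`
# dict are illustrative only. The real logic lives in
# NPUModelRunner._prepare_multimodal_fields in the diff above.
import torch

MULTIMODAL_CPU_FIELDS = ["grid_thw"]


def move_cpu_fields(mm_data: dict) -> dict:
    """Copy listed tensors back to CPU so later .numpy() calls succeed."""
    for field in MULTIMODAL_CPU_FIELDS:
        tensor = mm_data.get(field)
        if isinstance(tensor, torch.Tensor) and tensor.device.type != "cpu":
            # .numpy() raises on device tensors, so the copy must happen
            # before the vision tower converts grid_thw.
            mm_data[field] = tensor.cpu()
    return mm_data


if __name__ == "__main__":
    # One image with a (t, h, w) patch grid of 4 x 16 x 16.
    mm_data = {"grid_thw": torch.tensor([[4, 16, 16]])}
    mm_data = move_cpu_fields(mm_data)
    assert mm_data["grid_thw"].device.type == "cpu"
    print(mm_data["grid_thw"].numpy())  # safe: tensor is on CPU
```

With the hook in place, the model-side workaround (`grid_thw.to("cpu")` inside the patched Qwen3-VL forward) is no longer needed, which is why `patch_qwen3_vl.py` can be deleted.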