From 434059e4179371f196671ba878e56115e0c790df Mon Sep 17 00:00:00 2001 From: fan2956 Date: Tue, 14 Oct 2025 21:51:09 +0800 Subject: [PATCH] [BugFix] Fix multimodal model support fullgraph error (#3425) ### What this PR does / why we need it? Because the update_attn_params function requires passing the num_tokens parameter, and num_tokens is obtained via positions.shape[0]. However, the multimodal model uses mrope (Multimodal Rotary Position Embedding), which results in the positions having a shape of 2. Consequently, positions.shape[0] retrieves an incorrect value. We resolve this issue by replacing positions.shape[0] with maybe_padded_num_tokens. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: fan2956 --- vllm_ascend/worker/model_runner_v1.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 04682a2..b432db0 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1520,13 +1520,14 @@ class NPUModelRunner(LoRAModelRunnerMixin): forward_context = get_forward_context() if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL: + # TODO: maybe_padded_num_tokens will be removed, use num_input_tokens instead if self.vllm_config.model_config.use_mla: # FIXME: Try using `auto_dispatch_capture=True` update_mla_attn_params(self.update_stream, forward_context, - positions.shape[0]) + maybe_padded_num_tokens) else: update_attn_params(self.update_stream, forward_context, - positions.shape[0]) + maybe_padded_num_tokens) if get_forward_context().sp_enabled: hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)