From 70cc5f7969a3a99a713a4b5628325644c6981070 Mon Sep 17 00:00:00 2001
From: Wang Kunpeng <1289706727@qq.com>
Date: Fri, 30 Jan 2026 14:09:00 +0800
Subject: [PATCH] [Bugfix] Fix rope_forward_triton error (#6404)

### What this PR does / why we need it?
The rope_forward_triton method raises an error. For example:
```
(Worker_DP0_TP1_EP1 pid=5298) ERROR 01-29 02:01:11 [multiproc_executor.py:822]     q, k = rope_forward_triton(q, k, cos, sin, rope_dim=self.qk_rope_head_dim, is_neox_style=True)
(Worker_DP0_TP1_EP1 pid=5298) ERROR 01-29 02:01:11 [multiproc_executor.py:822]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_DP0_TP1_EP1 pid=5298) ERROR 01-29 02:01:11 [multiproc_executor.py:822]   File "/vllm-workspace/vllm-ascend/vllm_ascend/ops/triton/rope.py", line 155, in rope_forward_triton
(Worker_DP0_TP1_EP1 pid=5298) ERROR 01-29 02:01:11 [multiproc_executor.py:822]     cos = cos.view(num_tokens, -1)
(Worker_DP0_TP1_EP1 pid=5298) ERROR 01-29 02:01:11 [multiproc_executor.py:822]           ^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_DP0_TP1_EP1 pid=5298) ERROR 01-29 02:01:11 [multiproc_executor.py:822] RuntimeError: shape '[14, -1]' is invalid for input of size 768
```
This is because an incorrect num_tokens_padded was passed in.

Related-RFC: https://github.com/vllm-project/vllm-ascend/issues/5449

Co-authored-by: @zhenwenqi2024

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.14.1
- vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

Signed-off-by: Wang Kunpeng <1289706727@qq.com>
---
 vllm_ascend/worker/model_runner_v1.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 6848a697..90f453a5 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1210,9 +1210,9 @@ class NPUModelRunner(GPUModelRunner):
         (attn_metadata, spec_decode_common_attn_metadata) = (
             self._build_attention_metadata(
                 num_tokens=num_tokens_unpadded,
-                num_tokens_padded=num_tokens_padded if pad_attn else None,
+                num_tokens_padded=num_tokens_padded,
                 num_reqs=num_reqs,
-                num_reqs_padded=num_reqs_padded if pad_attn else None,
+                num_reqs_padded=num_reqs_padded,
                 max_query_len=max_num_scheduled_tokens,
                 ubatch_slices=ubatch_slices_attn,
                 logits_indices=logits_indices,
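
For context, a minimal sketch of the failure mode described above (not part of the patch): the `14` and `768` come from the error log; the 12-token / 64-dim split of 768 and the variable names are assumptions for illustration.

```python
import torch

# Assumption: cos holds one rope vector per *unpadded* token,
# e.g. 12 tokens x 64 rope dims = 768 elements, matching the
# "input of size 768" in the log above.
rope_dim = 64
cos = torch.randn(12, rope_dim)

# A mismatched (padded) token count makes view() fail, because
# 768 is not divisible by 14:
# RuntimeError: shape '[14, -1]' is invalid for input of size 768
num_tokens_padded = 14
try:
    cos.view(num_tokens_padded, -1)
except RuntimeError as e:
    print(e)

# A token count consistent with how cos was built succeeds:
print(cos.view(12, -1).shape)  # torch.Size([12, 64])
```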