From f2956ce94427f6a3af44f8028b7b0fb3f7c087de Mon Sep 17 00:00:00 2001 From: wangbj127 <256472688+wangbj127@users.noreply.github.com> Date: Fri, 17 Apr 2026 22:50:22 +0800 Subject: [PATCH] [v0.18.0][BugFix] Fix dimension mismatch error when SP padding causes num_tokens_padded != num_tokens_unpadded (#8133) Cherry-picked from https://github.com/vllm-project/vllm-ascend/pull/7858 ### What this PR does / why we need it? This PR fixes a `RuntimeError` (dimension mismatch) that occurs when Sequence Parallelism (SP) is enabled and the padding added for SP causes `num_tokens_padded` to differ from `num_tokens_unpadded`. In such cases, `_pad_query_start_loc_for_fia` adds a dummy request, increasing `num_reqs_padded`. This mismatch between the actual number of requests and the padded number of requests leads to errors in downstream token count computations (e.g., `compute_num_computed_tokens`). The fix relaxes the overly restrictive condition `num_tokens_padded == num_tokens_unpadded` that guards reverting the dummy-request padding when SP is enabled: the SP padding is stripped after communication, so it should not be treated as an additional request in the attention metadata. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? 
vLLM version: v0.18.0 vLLM-Ascend version: releases/v0.18.0 Signed-off-by: Wangbj127 --- vllm_ascend/worker/model_runner_v1.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 206f40df..357ee039 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1257,10 +1257,22 @@ class NPUModelRunner(GPUModelRunner): num_reqs_padded = self._pad_query_start_loc_for_fia( num_tokens_padded, num_reqs_padded, num_reqs, cudagraph_mode, batch_desc.num_reqs ) - if enable_sp() and num_tokens_padded == num_tokens_unpadded: - if num_reqs_padded > old_num_reqs_padded: + + + # FIA may add a virtual request in Mixed Batch scenarios. + # here we revert the request added by _pad_query_start_loc_for_fia if SP is enabled. + # RELAXED CONDITION: Check if num_reqs_padded was actually increased, rather than + # strictly checking token equality. This handles cases where num_tokens_padded + # != num_tokens_unpadded due to SP alignment (e.g., 29292 vs 29290). + if enable_sp() and num_reqs_padded > old_num_reqs_padded: + if num_tokens_padded == num_tokens_unpadded: num_reqs_padded = old_num_reqs_padded self.query_start_loc.np[num_reqs_padded + 1] = 0 + if num_tokens_padded != num_tokens_unpadded and not self.speculative_config: + num_reqs_padded = old_num_reqs_padded + self.query_start_loc.np[num_reqs_padded + 1] = 0 + self.query_start_loc.np[num_reqs_padded] = num_tokens_padded + self.query_start_loc.gpu[num_reqs_padded] = num_tokens_padded (attn_metadata, spec_decode_common_attn_metadata) = self._build_attention_metadata( num_tokens=num_tokens_unpadded