From 6bdc72949b2682a7f1c7804aadf10b34f37ed127 Mon Sep 17 00:00:00 2001
From: wangbj127 <256472688+wangbj127@users.noreply.github.com>
Date: Sat, 18 Apr 2026 20:43:42 +0800
Subject: [PATCH] Revert "[v0.18.0][BugFix] Fix dimension mismatch error when
 SP padding causes num_tokens_padded != num_tokens_unpadded" (#8413)

Reverts vllm-project/vllm-ascend#8133

- Reversion of Logic: This pull request reverts the changes introduced in
  a previous commit that attempted to handle dimension mismatches during
  SP padding.

Signed-off-by: Wangbingjie
---
 vllm_ascend/worker/model_runner_v1.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 48d24a07..0a9c1986 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1263,22 +1263,10 @@ class NPUModelRunner(GPUModelRunner):
             num_reqs_padded = self._pad_query_start_loc_for_fia(
                 num_tokens_padded, num_reqs_padded, num_reqs, cudagraph_mode,
                 batch_desc.num_reqs)
-
-
-        # FIA may add a virtual request in Mixed Batch scenarios.
-        # here we revert the request added by _pad_query_start_loc_for_fia if SP is enabled.
-        # RELAXED CONDITION: Check if num_reqs_padded was actually increased, rather than
-        # strictly checking token equality. This handles cases where num_tokens_padded
-        # != num_tokens_unpadded due to SP alignment (e.g., 29292 vs 29290).
-        if enable_sp() and num_reqs_padded > old_num_reqs_padded:
-            if num_tokens_padded == num_tokens_unpadded:
+        if enable_sp() and num_tokens_padded == num_tokens_unpadded:
+            if num_reqs_padded > old_num_reqs_padded:
                 num_reqs_padded = old_num_reqs_padded
                 self.query_start_loc.np[num_reqs_padded + 1] = 0
-            if num_tokens_padded != num_tokens_unpadded and not self.speculative_config:
-                num_reqs_padded = old_num_reqs_padded
-                self.query_start_loc.np[num_reqs_padded + 1] = 0
-                self.query_start_loc.np[num_reqs_padded] = num_tokens_padded
-                self.query_start_loc.gpu[num_reqs_padded] = num_tokens_padded
         (attn_metadata,
          spec_decode_common_attn_metadata) = self._build_attention_metadata(
             num_tokens=num_tokens_unpadded
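
Note for reviewers: the sketch below is a minimal, standalone distillation of
the condition ordering this revert restores, not the actual NPUModelRunner
code. The name revert_fia_virtual_request is hypothetical, enable_sp() is
replaced by a plain boolean parameter, and a bare numpy array stands in for
self.query_start_loc.np / .gpu. It only illustrates that, after the revert,
the token-equality check guards the request-count check again.

import numpy as np


def revert_fia_virtual_request(
    query_start_loc: np.ndarray,
    num_reqs_padded: int,
    old_num_reqs_padded: int,
    num_tokens_padded: int,
    num_tokens_unpadded: int,
    sp_enabled: bool,
) -> int:
    """Hypothetical helper mirroring the restored logic.

    FIA may append a virtual request while padding query_start_loc. With
    the reverted code, that virtual request is undone only when SP is
    enabled AND no token padding occurred. (The reverted-away "relaxed"
    variant keyed on num_reqs_padded > old_num_reqs_padded first, so it
    also fired when the token counts differed.)
    """
    if sp_enabled and num_tokens_padded == num_tokens_unpadded:
        if num_reqs_padded > old_num_reqs_padded:
            num_reqs_padded = old_num_reqs_padded
            # Zero the slot just past the last real request boundary.
            query_start_loc[num_reqs_padded + 1] = 0
    return num_reqs_padded


# Tiny usage example: SP on, no token padding, FIA added one virtual
# request (two real requests ending at tokens 4 and 9).
qsl = np.array([0, 4, 9, 9, 0], dtype=np.int32)
assert revert_fia_virtual_request(qsl, 3, 2, 9, 9, True) == 2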