Revert "[v0.18.0][BugFix] Fix dimension mismatch error when SP padding causes num_tokens_padded != num_tokens_unpadded" (#8413)
Reverts vllm-project/vllm-ascend#8133.

Reversion of logic: this pull request reverts the changes introduced in a previous commit that attempted to handle dimension mismatches during SP padding.

Signed-off-by: Wangbingjie <wangbj1207@126.com>
This commit is contained in:
@@ -1263,22 +1263,10 @@ class NPUModelRunner(GPUModelRunner):
|
||||
num_reqs_padded = self._pad_query_start_loc_for_fia(
|
||||
num_tokens_padded, num_reqs_padded, num_reqs, cudagraph_mode, batch_desc.num_reqs
|
||||
)
|
||||
|
||||
|
||||
# FIA may add a virtual request in Mixed Batch scenarios.
|
||||
# here we revert the request added by _pad_query_start_loc_for_fia if SP is enabled.
|
||||
# RELAXED CONDITION: Check if num_reqs_padded was actually increased, rather than
|
||||
# strictly checking token equality. This handles cases where num_tokens_padded
|
||||
# != num_tokens_unpadded due to SP alignment (e.g., 29292 vs 29290).
|
||||
if enable_sp() and num_reqs_padded > old_num_reqs_padded:
|
||||
if num_tokens_padded == num_tokens_unpadded:
|
||||
if enable_sp() and num_tokens_padded == num_tokens_unpadded:
|
||||
if num_reqs_padded > old_num_reqs_padded:
|
||||
num_reqs_padded = old_num_reqs_padded
|
||||
self.query_start_loc.np[num_reqs_padded + 1] = 0
|
||||
if num_tokens_padded != num_tokens_unpadded and not self.speculative_config:
|
||||
num_reqs_padded = old_num_reqs_padded
|
||||
self.query_start_loc.np[num_reqs_padded + 1] = 0
|
||||
self.query_start_loc.np[num_reqs_padded] = num_tokens_padded
|
||||
self.query_start_loc.gpu[num_reqs_padded] = num_tokens_padded
|
||||
|
||||
(attn_metadata, spec_decode_common_attn_metadata) = self._build_attention_metadata(
|
||||
num_tokens=num_tokens_unpadded
|
||||
|
||||
Reference in New Issue
Block a user