bugfix for mtp>1 (#3174)
### What this PR does / why we need it?
Fix bugs when mtp > 1, and reorder the input batch when mtp is not accepted.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
By CI.
- vLLM version: v0.10.2
- vLLM main: 52d0cb8458
---------
Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
This commit is contained in:
@@ -202,6 +202,8 @@ class AscendMLAMetadataBuilder:
|
|||||||
npu_fused_infer_attention_score TND layout's limit of 16, \
|
npu_fused_infer_attention_score TND layout's limit of 16, \
|
||||||
got {self.decode_threshold}"
|
got {self.decode_threshold}"
|
||||||
|
|
||||||
|
self.reorder_batch_threshold = self.decode_threshold
|
||||||
|
|
||||||
if self.chunked_prefill_enabled:
|
if self.chunked_prefill_enabled:
|
||||||
self.chunked_prefill_workspace_size = min(
|
self.chunked_prefill_workspace_size = min(
|
||||||
# Make sure there is enough for 8 full length requests or at least
|
# Make sure there is enough for 8 full length requests or at least
|
||||||
|
|||||||
@@ -555,7 +555,7 @@ class MtpProposer(Proposer):
|
|||||||
# copy inputs to buffer for cudagraph
|
# copy inputs to buffer for cudagraph
|
||||||
self.input_ids[:batch_size] = input_ids
|
self.input_ids[:batch_size] = input_ids
|
||||||
self.positions[:batch_size] = clamped_positions
|
self.positions[:batch_size] = clamped_positions
|
||||||
self.hidden_states[:batch_size] = hidden_states
|
self.hidden_states[:hidden_states.shape[0]] = hidden_states
|
||||||
attn_metadata_i.slot_mapping[:batch_size] = slot_mapping
|
attn_metadata_i.slot_mapping[:batch_size] = slot_mapping
|
||||||
|
|
||||||
if attn_metadata_i.prefill is not None:
|
if attn_metadata_i.prefill is not None:
|
||||||
|
|||||||
Reference in New Issue
Block a user