bugfix for mtp>1 (#3174)

### What this PR does / why we need it?
Fixes bugs that occur when mtp > 1, and reorders the input batch when MTP draft tokens are not accepted.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
By CI.

- vLLM version: v0.10.2
- vLLM main:
52d0cb8458

---------

Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
This commit is contained in:
zouyida2052
2025-09-26 09:04:16 +08:00
committed by GitHub
parent 69509bcdd6
commit b72e3327a6
2 changed files with 3 additions and 1 deletion


```diff
@@ -202,6 +202,8 @@ class AscendMLAMetadataBuilder:
            npu_fused_infer_attention_score TND layout's limit of 16, \
            got {self.decode_threshold}"
        self.reorder_batch_threshold = self.decode_threshold
        if self.chunked_prefill_enabled:
            self.chunked_prefill_workspace_size = min(
                # Max sure there is enough for 8 full length request or at least
```


```diff
@@ -555,7 +555,7 @@ class MtpProposer(Proposer):
         # copy inputs to buffer for cudagraph
         self.input_ids[:batch_size] = input_ids
         self.positions[:batch_size] = clamped_positions
-        self.hidden_states[:batch_size] = hidden_states
+        self.hidden_states[:hidden_states.shape[0]] = hidden_states
         attn_metadata_i.slot_mapping[:batch_size] = slot_mapping
         if attn_metadata_i.prefill is not None:
```
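The one-line change above slices the persistent buffer by the actual number of rows in `hidden_states` rather than by `batch_size`. A minimal NumPy sketch (hypothetical shapes and names, not the real vLLM tensors) of why this matters when rejected MTP draft tokens shrink the tensor below the padded batch size:

```python
import numpy as np

# Hypothetical setup: a persistent buffer sized for the padded batch,
# and a hidden_states tensor that ends up SHORTER than batch_size
# because some speculative (MTP) draft tokens were rejected.
batch_size = 8
hidden_dim = 4
buffer = np.zeros((batch_size, hidden_dim))
hidden_states = np.ones((6, hidden_dim))  # only 6 rows survived rejection

# Slicing by batch_size fails: shapes (8, 4) and (6, 4) don't broadcast.
try:
    buffer[:batch_size] = hidden_states
except ValueError:
    pass  # "could not broadcast input array from shape (6,4) into (8,4)"

# Slicing by the source tensor's own length always matches.
buffer[:hidden_states.shape[0]] = hidden_states
```

The same pattern applies to any fixed-size CUDA-graph buffer fed by a variable-length tensor: index with the source's length, not the padded batch size.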