From b72e3327a61f495ae3df9440b08efa90d4928aac Mon Sep 17 00:00:00 2001 From: zouyida2052 Date: Fri, 26 Sep 2025 09:04:16 +0800 Subject: [PATCH] bugfix for mtp>1 (#3174) ### What this PR does / why we need it? Fix bugs when mtp>1, and reorder the input batch when mtp is not accepted. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By CI. - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/52d0cb845866869d587fc013a7c59e60a86ebcf2 --------- Signed-off-by: zouyida2052 --- vllm_ascend/attention/mla_v1.py | 2 ++ vllm_ascend/spec_decode/mtp_proposer.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index cb15bd1..73cbae6 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -202,6 +202,8 @@ class AscendMLAMetadataBuilder: npu_fused_infer_attention_score TND layout's limit of 16, \ got {self.decode_threshold}" + self.reorder_batch_threshold = self.decode_threshold + if self.chunked_prefill_enabled: self.chunked_prefill_workspace_size = min( # Max sure there is enough for 8 full length request or at least diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index ac0b3c5..ed4e887 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -555,7 +555,7 @@ class MtpProposer(Proposer): # copy inputs to buffer for cudagraph self.input_ids[:batch_size] = input_ids self.positions[:batch_size] = clamped_positions - self.hidden_states[:batch_size] = hidden_states + self.hidden_states[:hidden_states.shape[0]] = hidden_states attn_metadata_i.slot_mapping[:batch_size] = slot_mapping if attn_metadata_i.prefill is not None: