From 0686b32d82de042b07a136548cdaacfcc3a9c5e9 Mon Sep 17 00:00:00 2001
From: Yizhou <136800916+yiz-liu@users.noreply.github.com>
Date: Sun, 14 Dec 2025 00:10:11 +0800
Subject: [PATCH] [Fix] Fix issues in MTP with async scheduling and ACL graph
 (#4963)

### What this PR does / why we need it?

Corrects the attention metadata size for MTP when asynchronous scheduling and full ACL graph mode are enabled together: the decode metadata (`seq_lens_list` and `block_table`) is truncated to the actual number of queries, preventing size mismatches during execution.

Additionally, makes the calculation of token sample indices more robust by explicitly slicing `query_start_loc` to match the shape of `num_rejected_tokens_gpu` (see the illustrative sketch appended after the diff).

Finally, skips graph padding when the number of input tokens exceeds the maximum captured ACL graph batch size, avoiding out-of-bounds errors.

### Does this PR introduce _any_ user-facing change?

None.

### How was this patch tested?

Corresponding test cases need to be added ASAP.

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: Yizhou Liu
Signed-off-by: Yizhou <136800916+yiz-liu@users.noreply.github.com>
Co-authored-by: Jade Zheng
---
 vllm_ascend/spec_decode/mtp_proposer.py | 17 +++++++++++++++--
 vllm_ascend/worker/model_runner_v1.py   |  2 +-
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index 85bbad89..14db8976 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -748,6 +748,7 @@ class MtpProposer(Proposer):
         has_lora = len(self.runner.input_batch.lora_id_to_lora_request) > 0
         aclgraph_runtime_mode, batch_descriptor = \
             self.runner.cudagraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
+        original_aclgraph_runtime_mode = aclgraph_runtime_mode
         if self.use_async_scheduling:
             # there is synchronization between mtp steps when enabling aclgraph,
             # disable aclgraph when use async scheduling to avoid the
@@ -802,6 +803,17 @@ class MtpProposer(Proposer):
                 hidden_states = torch.ops.vllm.maybe_pad_and_reduce(
                     hidden_states)
 
+            if original_aclgraph_runtime_mode == CUDAGraphMode.FULL and \
+                    self.use_async_scheduling:
+                for layer_name in self.attn_layer_name:
+                    if attn_metadata[layer_name].decode is None:
+                        continue
+                    actual_size = len(
+                        attn_metadata[layer_name].decode.actual_seq_lengths_q)
+                    attn_metadata[layer_name].decode.seq_lens_list = \
+                        attn_metadata[layer_name].decode.seq_lens_list[:actual_size]
+                    attn_metadata[layer_name].decode.block_table = \
+                        attn_metadata[layer_name].decode.block_table[:actual_size]
+
             hidden_states = self.model(input_ids=input_ids,
                                        positions=positions,
                                        hidden_states=hidden_states)
@@ -1133,8 +1145,9 @@ class MtpProposer(Proposer):
             num_computed_tokens_cpu,
             seq_lens=common_attn_metadata.seq_lens)
 
-        token_indices_to_sample = (common_attn_metadata.query_start_loc[1:] -
-                                   1 - num_rejected_tokens_gpu)
+        query_start_loc = common_attn_metadata.query_start_loc[
+            1:1 + num_rejected_tokens_gpu.shape[0]]
+        token_indices_to_sample = query_start_loc - 1 - num_rejected_tokens_gpu
 
         return spec_common_attn_metadata, token_indices, token_indices_to_sample
 
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 091f4e59..0b514e60 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1019,7 +1019,7 @@ class NPUModelRunner(GPUModelRunner):
         # TODO: We should make this official ASAP. Also note that if we pad here,
         # the builders won't need to add any extra padding.
         if self.compilation_config.cudagraph_mode.decode_mode() == CUDAGraphMode.FULL and \
-                uniform_decode:
+                uniform_decode and num_input_tokens <= self.cudagraph_batch_sizes[-1]:
             num_reqs_padded = num_input_tokens // self.uniform_decode_query_len
             pad_size = num_reqs_padded - num_reqs
             if pad_size > 0:
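
For context on the token-sample-index fix, here is a minimal, self-contained sketch of the shape-alignment problem. The tensor values are invented for illustration and nothing below ships with this patch; it assumes only PyTorch:

```python
# Minimal sketch of the query_start_loc alignment fix (toy values only;
# this snippet is illustrative and is not part of the patch).
import torch

# query_start_loc holds num_reqs + 1 cumulative offsets, but with graph
# padding it may carry extra slots beyond the live requests.
query_start_loc = torch.tensor([0, 3, 7, 12, 12, 12])  # 2 padded slots
num_rejected_tokens = torch.tensor([1, 0, 2])          # 3 live requests

# Unaligned (old) form: query_start_loc[1:] has 5 elements against 3,
# so the subtraction raises a size-mismatch error.
# Aligned (new) form: slice to the live-request count first.
aligned = query_start_loc[1:1 + num_rejected_tokens.shape[0]]
token_indices_to_sample = aligned - 1 - num_rejected_tokens
print(token_indices_to_sample)  # tensor([1, 6, 9])
```

The model-runner guard follows the same defensive idea: `num_input_tokens <= self.cudagraph_batch_sizes[-1]` ensures padding only targets a batch size that was actually captured as a graph.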