From 291c216898c989efb4b59a64dad818d2dd48a71c Mon Sep 17 00:00:00 2001 From: Pleaplusone <38376071+ganyi1996ppo@users.noreply.github.com> Date: Tue, 10 Jun 2025 22:20:40 +0800 Subject: [PATCH] fix torchair execute issue on padding data, and mtp padding logic (#1160) ### What this PR does / why we need it? The former PR https://github.com/vllm-project/vllm-ascend/pull/736 selects the valid tokens inside the `input_ids` and `position_ids`, which breaks the necessary padding required by torchair. In this PR, we move the pad logic to after the multimodal part. Signed-off-by: ganyi --- vllm_ascend/attention/mla_v1.py | 5 ++++- vllm_ascend/worker/model_runner_v1.py | 10 +++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 226a570..4a8b590 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -376,7 +376,10 @@ class AscendMLAMetadataBuilder: seq_lens = seq_lens[:self._num_decode_tokens] input_positions = input_positions[:self._num_decode_tokens] block_table = block_table[:self._num_decode_tokens, ...] 
- if use_torchair_graph and self.runner.attn_state == AscendAttentionState.DecodeOnly: + if use_torchair_graph and self.runner.attn_state in [ + AscendAttentionState.DecodeOnly, + AscendAttentionState.SpecDecoding + ]: num_seqs = len(seq_lens) if graph_pad_size != 0: pad_value = 1 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 2b343d7..51ffe1e 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -943,11 +943,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) input_ids = self.input_ids[:num_input_tokens] - if (envs_ascend.VLLM_ENABLE_MC2 - or self.torchair_graph_enabled) and not with_prefill: - input_ids = self.input_ids[:padded_batch_size] - positions = self.positions[:padded_batch_size] - # prepare the MRoPE for mllm if using multimodal num_input_tokens = total_num_scheduled_tokens # _prepare_inputs may reorder the batch, so we must gather multi @@ -985,6 +980,11 @@ class NPUModelRunner(LoRAModelRunnerMixin): else: positions = self.positions[:num_input_tokens] + if (envs_ascend.VLLM_ENABLE_MC2 + or self.torchair_graph_enabled) and not with_prefill: + input_ids = self.input_ids[:padded_batch_size] + positions = self.positions[:padded_batch_size] + # Run forward pass with set_forward_context(attn_metadata, self.vllm_config,