From 291c216898c989efb4b59a64dad818d2dd48a71c Mon Sep 17 00:00:00 2001 From: Pleaplusone <38376071+ganyi1996ppo@users.noreply.github.com> Date: Tue, 10 Jun 2025 22:20:40 +0800 Subject: [PATCH] fix torchair execute issue on padding data, and mtp padding logic (#1160) ### What this PR does / why we need it? The former PR https://github.com/vllm-project/vllm-ascend/pull/736 selects the valid tokens inside the `input_ids` and `position_ids`, which breaks the necessary padding required by torchair. In this PR, we move the pad logic to after the multimodal part. Signed-off-by: ganyi --- vllm_ascend/attention/mla_v1.py | 5 ++++- vllm_ascend/worker/model_runner_v1.py | 10 +++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 226a570..4a8b590 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -376,7 +376,10 @@ class AscendMLAMetadataBuilder: seq_lens = seq_lens[:self._num_decode_tokens] input_positions = input_positions[:self._num_decode_tokens] block_table = block_table[:self._num_decode_tokens, ...] 
- if use_torchair_graph and self.runner.attn_state == AscendAttentionState.DecodeOnly: + if use_torchair_graph and self.runner.attn_state in [ + AscendAttentionState.DecodeOnly, + AscendAttentionState.SpecDecoding + ]: num_seqs = len(seq_lens) if graph_pad_size != 0: pad_value = 1 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 2b343d7..51ffe1e 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -943,11 +943,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) input_ids = self.input_ids[:num_input_tokens] - if (envs_ascend.VLLM_ENABLE_MC2 - or self.torchair_graph_enabled) and not with_prefill: - input_ids = self.input_ids[:padded_batch_size] - positions = self.positions[:padded_batch_size] - # prepare the MRoPE for mllm if using multimodal num_input_tokens = total_num_scheduled_tokens # _prepare_inputs may reorder the batch, so we must gather multi @@ -985,6 +980,11 @@ class NPUModelRunner(LoRAModelRunnerMixin): else: positions = self.positions[:num_input_tokens] + if (envs_ascend.VLLM_ENABLE_MC2 + or self.torchair_graph_enabled) and not with_prefill: + input_ids = self.input_ids[:padded_batch_size] + positions = self.positions[:padded_batch_size] + # Run forward pass with set_forward_context(attn_metadata, self.vllm_config,