From 2c7dd85fd8ce4a0951fc441d9c9c432d47c98d32 Mon Sep 17 00:00:00 2001 From: yiz-liu <136800916+yiz-liu@users.noreply.github.com> Date: Fri, 20 Jun 2025 14:46:17 +0800 Subject: [PATCH] [Fix] Fix the token-wise padding mechanism (#1300) ### What this PR does / why we need it? Fix the token-wise padding mechanism. Signed-off-by: Yizhou Liu --- vllm_ascend/worker/model_runner_v1.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 89f30bc..4febfcb 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -956,10 +956,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): # Copy the tensors to the NPU. self.input_ids[:total_num_scheduled_tokens].copy_( self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) - input_ids = self.input_ids[:num_input_tokens] - # prepare the MRoPE for mllm if using multimodal - num_input_tokens = total_num_scheduled_tokens # _prepare_inputs may reorder the batch, so we must gather multi # modal outputs after that to ensure the correct order if self.is_multimodal_model: @@ -973,27 +970,26 @@ class NPUModelRunner(LoRAModelRunnerMixin): # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - input_ids = self.input_ids[:num_input_tokens] + input_ids = self.input_ids[:total_num_scheduled_tokens] if mm_embeds: inputs_embeds = self.model.get_input_embeddings( input_ids, mm_embeds) else: inputs_embeds = self.model.get_input_embeddings(input_ids) # TODO(woosuk): Avoid the copy. Optimize. - self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds) + self.inputs_embeds[:total_num_scheduled_tokens].copy_( + inputs_embeds) inputs_embeds = self.inputs_embeds[:num_input_tokens] input_ids = None else: # For text-only models, we use token ids as input. # While it is possible to use embeddings as input just like the # multimodal models, it is not desirable for performance since - # then the embedding layer is not included in the CUDA graph. + # then the embedding layer is not included in the ACL graph. input_ids = self.input_ids[:num_input_tokens] inputs_embeds = None if self.uses_mrope: positions = self.mrope_positions[:, :num_input_tokens] - else: - positions = self.positions[:num_input_tokens] if self.torchair_graph_enabled and not with_prefill: input_ids = self.input_ids[:padded_batch_size]