[Fix] Fix the token-wise padding mechanism (#1300)

### What this PR does / why we need it?
Fix the token-wise padding mechanism. Previously, `num_input_tokens` (the token count padded up for graph-mode execution) was overwritten with `total_num_scheduled_tokens`, which effectively dropped the padding. This change removes that overwrite: copies into the persistent input buffers consistently use `total_num_scheduled_tokens` (the number of tokens actually scheduled), while the slices handed to the model keep the padded `num_input_tokens` length. A stale comment referring to the CUDA graph is also updated to say ACL graph.
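
For context, a minimal, self-contained sketch of the padding idea this PR restores; the buffer names, `GRAPH_CAPTURE_SIZES`, `pad_to_graph_size`, and `prepare_inputs` are illustrative stand-ins, not the runner's actual API:

```python
import torch

# Illustrative sizes only; the real runner derives these from its config.
GRAPH_CAPTURE_SIZES = [8, 16, 32, 64]   # token counts a graph was captured for
MAX_NUM_TOKENS = 64

input_ids_cpu = torch.zeros(MAX_NUM_TOKENS, dtype=torch.int64)  # host staging buffer
input_ids_npu = torch.zeros(MAX_NUM_TOKENS, dtype=torch.int64)  # persistent device buffer


def pad_to_graph_size(n: int) -> int:
    # Smallest captured size that can hold n tokens; fall back to n itself.
    return next((s for s in GRAPH_CAPTURE_SIZES if s >= n), n)


def prepare_inputs(token_ids: list) -> torch.Tensor:
    total_num_scheduled_tokens = len(token_ids)
    num_input_tokens = pad_to_graph_size(total_num_scheduled_tokens)

    # Copy only the tokens that were actually scheduled this step ...
    input_ids_cpu[:total_num_scheduled_tokens] = torch.tensor(token_ids)
    input_ids_npu[:total_num_scheduled_tokens].copy_(
        input_ids_cpu[:total_num_scheduled_tokens])

    # ... but hand the model the padded slice, so its shape matches a
    # captured graph; outputs at the padded tail positions are discarded.
    return input_ids_npu[:num_input_tokens]


print(prepare_inputs([101, 102, 103]).shape)  # torch.Size([8])
```

Only the first `total_num_scheduled_tokens` entries of the buffers hold valid data for the current step, yet the model must see a slice whose length matches a captured graph size; that is why the copy length and the model-input length must stay distinct.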

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Author: yiz-liu
Date: 2025-06-20 14:46:17 +08:00
Committed by: GitHub
Parent: b350edae9a
Commit: 2c7dd85fd8

@@ -956,10 +956,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # Copy the tensors to the NPU.
         self.input_ids[:total_num_scheduled_tokens].copy_(
             self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
-        input_ids = self.input_ids[:num_input_tokens]
-        # prepare the MRoPE for mllm if using multimodal
-        num_input_tokens = total_num_scheduled_tokens
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
         if self.is_multimodal_model:
@@ -973,27 +970,26 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
-            input_ids = self.input_ids[:num_input_tokens]
+            input_ids = self.input_ids[:total_num_scheduled_tokens]
             if mm_embeds:
                 inputs_embeds = self.model.get_input_embeddings(
                     input_ids, mm_embeds)
             else:
                 inputs_embeds = self.model.get_input_embeddings(input_ids)
             # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds)
+            self.inputs_embeds[:total_num_scheduled_tokens].copy_(
+                inputs_embeds)
             inputs_embeds = self.inputs_embeds[:num_input_tokens]
             input_ids = None
         else:
             # For text-only models, we use token ids as input.
             # While it is possible to use embeddings as input just like the
             # multimodal models, it is not desirable for performance since
-            # then the embedding layer is not included in the CUDA graph.
+            # then the embedding layer is not included in the ACL graph.
             input_ids = self.input_ids[:num_input_tokens]
             inputs_embeds = None
         if self.uses_mrope:
             positions = self.mrope_positions[:, :num_input_tokens]
         else:
             positions = self.positions[:num_input_tokens]
         if self.torchair_graph_enabled and not with_prefill:
             input_ids = self.input_ids[:padded_batch_size]
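
To make the resulting slicing easier to follow, here is a rough, self-contained illustration of the `+` side of the multimodal branch above, with toy sizes and an `nn.Embedding` standing in for `model.get_input_embeddings`; names and numbers are illustrative only:

```python
import torch
import torch.nn as nn

# Toy sizes, mirroring the two slice lengths used in the multimodal branch.
total_num_scheduled_tokens = 5   # tokens actually scheduled this step
num_input_tokens = 8             # padded length expected by the captured graph
vocab_size, hidden_size = 1000, 16

embedding = nn.Embedding(vocab_size, hidden_size)    # stand-in for get_input_embeddings
input_ids_buf = torch.zeros(64, dtype=torch.int64)   # persistent token-id buffer
inputs_embeds_buf = torch.zeros(64, hidden_size)     # persistent embedding buffer

input_ids_buf[:total_num_scheduled_tokens] = torch.arange(total_num_scheduled_tokens)

with torch.no_grad():
    # Embed only the scheduled tokens and stage them in the persistent buffer ...
    input_ids = input_ids_buf[:total_num_scheduled_tokens]
    inputs_embeds_buf[:total_num_scheduled_tokens].copy_(embedding(input_ids))

# ... while the slice handed to the model keeps the padded length.
inputs_embeds = inputs_embeds_buf[:num_input_tokens]
print(inputs_embeds.shape)  # torch.Size([8, 16])
```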