[Fix] Fix the token-wise padding mechanism (#1300)
### What this PR does / why we need it?

Fix the token-wise padding mechanism. Previously, `num_input_tokens` was unconditionally overwritten with `total_num_scheduled_tokens`, which discarded the padding needed for the torchair/ACL graph path. After this change, buffer copies use the real token count (`total_num_scheduled_tokens`), while the slices handed to the model keep the padded length (`num_input_tokens`).

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
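For context, here is a minimal sketch of the padding scheme this fix restores, reusing the buffer names from `NPUModelRunner`; the `pad_to_graph_size` helper and the captured sizes are hypothetical stand-ins, not the repo's actual API:

```python
import torch

def pad_to_graph_size(n: int, graph_sizes: list[int]) -> int:
    # Hypothetical helper: pad the real token count up to the
    # smallest captured graph size that can hold it.
    return next((s for s in sorted(graph_sizes) if s >= n), n)

total_num_scheduled_tokens = 13                 # real tokens this step
num_input_tokens = pad_to_graph_size(           # padded length for the graph
    total_num_scheduled_tokens, graph_sizes=[8, 16, 32])

input_ids_cpu = torch.arange(32)                # persistent CPU staging buffer
input_ids = torch.zeros(32, dtype=torch.int64)  # persistent device-side buffer

# Copy only the tokens that were actually scheduled ...
input_ids[:total_num_scheduled_tokens].copy_(
    input_ids_cpu[:total_num_scheduled_tokens])

# ... but slice the padded length for the model, so the shape matches a
# captured graph; entries [13, 16) are stale-but-unused padding.
model_input = input_ids[:num_input_tokens]
assert model_input.shape[0] == 16
```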
```diff
@@ -956,10 +956,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # Copy the tensors to the NPU.
         self.input_ids[:total_num_scheduled_tokens].copy_(
             self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
-        input_ids = self.input_ids[:num_input_tokens]
 
-        # prepare the MRoPE for mllm if using multimodal
-        num_input_tokens = total_num_scheduled_tokens
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
         if self.is_multimodal_model:
@@ -973,27 +970,26 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
-            input_ids = self.input_ids[:num_input_tokens]
+            input_ids = self.input_ids[:total_num_scheduled_tokens]
             if mm_embeds:
                 inputs_embeds = self.model.get_input_embeddings(
                     input_ids, mm_embeds)
             else:
                 inputs_embeds = self.model.get_input_embeddings(input_ids)
             # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds)
+            self.inputs_embeds[:total_num_scheduled_tokens].copy_(
+                inputs_embeds)
             inputs_embeds = self.inputs_embeds[:num_input_tokens]
             input_ids = None
         else:
             # For text-only models, we use token ids as input.
             # While it is possible to use embeddings as input just like the
             # multimodal models, it is not desirable for performance since
-            # then the embedding layer is not included in the CUDA graph.
+            # then the embedding layer is not included in the ACL graph.
             input_ids = self.input_ids[:num_input_tokens]
             inputs_embeds = None
         if self.uses_mrope:
             positions = self.mrope_positions[:, :num_input_tokens]
         else:
             positions = self.positions[:num_input_tokens]
 
-        if self.torchair_graph_enabled and not with_prefill:
-            input_ids = self.input_ids[:padded_batch_size]
```
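Why the `.copy_` destinations switched to `total_num_scheduled_tokens`: with real padding restored, `num_input_tokens` can exceed the number of scheduled tokens, and the embedding lookup only produces rows for the real tokens, so a padded destination slice would no longer match the source. A small illustration with made-up shapes (names mirror the diff; sizes are arbitrary):

```python
import torch

total_num_scheduled_tokens, num_input_tokens, hidden = 13, 16, 128
inputs_embeds_buf = torch.zeros(32, hidden)  # persistent embeddings buffer

# The embedding lookup runs on the real tokens only, so its output
# length is total_num_scheduled_tokens, not the padded length.
inputs_embeds = torch.randn(total_num_scheduled_tokens, hidden)

# Post-fix: the destination slice matches the source length.
inputs_embeds_buf[:total_num_scheduled_tokens].copy_(inputs_embeds)

# Copying into the padded slice would fail once padding is real:
try:
    inputs_embeds_buf[:num_input_tokens].copy_(inputs_embeds)
except RuntimeError as err:
    print(f"shape mismatch as expected: {err}")
```

The model input is then re-sliced to the padded length (`self.inputs_embeds[:num_input_tokens]` in the diff), so its shape matches a captured graph, which also appears to be why the torchair-only re-slice at the end of the diff is no longer needed.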