[Bugfix] Sync MRotaryEmbedding interface change to recover CI (#1399)
### What this PR does / why we need it? Sync MRotaryEmbedding interface change to recover main CI (https://github.com/vllm-project/vllm/pull/19939) ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed --------- Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -274,6 +274,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
dtype=torch.int64,
|
||||
device="cpu",
|
||||
pin_memory=True)
|
||||
self.mrope_positions_np = self.mrope_positions_cpu.numpy()
|
||||
|
||||
if self.is_multimodal_model:
|
||||
self.inputs_embeds = torch.zeros(
|
||||
@@ -793,14 +794,23 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
dst_start = mrope_pos_ptr
|
||||
dst_end = mrope_pos_ptr + completion_part_len
|
||||
|
||||
self.mrope_positions_cpu[:, dst_start:dst_end] = \
|
||||
if vllm_version_is("0.9.1"):
|
||||
self.mrope_positions_cpu[:, dst_start:dst_end] = \
|
||||
MRotaryEmbedding.get_next_input_positions_tensor(
|
||||
req.mrope_position_delta,
|
||||
context_len=num_computed_tokens +
|
||||
prompt_part_len,
|
||||
seq_len=num_computed_tokens +
|
||||
prompt_part_len +
|
||||
completion_part_len,
|
||||
)
|
||||
else:
|
||||
MRotaryEmbedding.get_next_input_positions_tensor(
|
||||
req.mrope_position_delta,
|
||||
context_len=num_computed_tokens +
|
||||
prompt_part_len,
|
||||
seq_len=num_computed_tokens +
|
||||
prompt_part_len +
|
||||
completion_part_len,
|
||||
out=self.mrope_positions_np,
|
||||
out_offset=dst_start,
|
||||
mrope_position_delta=req.mrope_position_delta,
|
||||
context_len=num_computed_tokens + prompt_part_len,
|
||||
num_new_tokens=completion_part_len,
|
||||
)
|
||||
|
||||
mrope_pos_ptr += completion_part_len
|
||||
|
||||
Reference in New Issue
Block a user