[Bugfix] Sync MRotaryEmbedding interface change to recover CI (#1399)

### What this PR does / why we need it?

Sync the `MRotaryEmbedding` interface change to recover the main CI
(https://github.com/vllm-project/vllm/pull/19939): upstream `get_next_input_positions_tensor` now writes into a caller-provided NumPy array instead of returning a tensor, so the call site is gated on the installed vLLM version.
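
For context, a before/after sketch of the two call styles, using the names from the diff below (the surrounding variables are illustrative, not the exact runner code):

```python
# vLLM 0.9.1 interface: returns a tensor that the caller copies into place.
self.mrope_positions_cpu[:, dst_start:dst_end] = \
    MRotaryEmbedding.get_next_input_positions_tensor(
        req.mrope_position_delta,
        context_len=context_len,
        seq_len=seq_len,
    )

# Post-#19939 interface: writes directly into a NumPy view of the pinned
# CPU tensor, avoiding the intermediate tensor allocation and copy.
MRotaryEmbedding.get_next_input_positions_tensor(
    out=self.mrope_positions_np,
    out_offset=dst_start,
    mrope_position_delta=req.mrope_position_delta,
    context_len=context_len,
    num_new_tokens=num_new_tokens,
)
```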

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Author: Li Wang
Date: 2025-06-24 22:56:39 +08:00
Committed by: GitHub
Parent: 6ed3f00427
Commit: 5f5800ba42


@@ -274,6 +274,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             dtype=torch.int64,
             device="cpu",
             pin_memory=True)
+        self.mrope_positions_np = self.mrope_positions_cpu.numpy()
         if self.is_multimodal_model:
             self.inputs_embeds = torch.zeros(
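
The new `mrope_positions_np` is not a copy: for CPU tensors, `torch.Tensor.numpy()` returns a NumPy view that shares memory with the tensor, so positions written into the array by the new interface land directly in the pinned `mrope_positions_cpu` buffer. A standalone illustration of that sharing (shapes are placeholders, not the runner's actual sizes):

```python
import torch

# Pinned CPU tensor plus a NumPy view over the same memory, mirroring
# mrope_positions_cpu / mrope_positions_np above.
positions_cpu = torch.zeros((3, 8), dtype=torch.int64, device="cpu",
                            pin_memory=True)
positions_np = positions_cpu.numpy()

# Writing through the NumPy view updates the torch tensor in place.
positions_np[:, 2:5] = 7
assert torch.equal(positions_cpu[:, 2:5],
                   torch.full((3, 3), 7, dtype=torch.int64))
```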
@@ -793,14 +794,23 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 dst_start = mrope_pos_ptr
                 dst_end = mrope_pos_ptr + completion_part_len
-                self.mrope_positions_cpu[:, dst_start:dst_end] = \
-                    MRotaryEmbedding.get_next_input_positions_tensor(
-                        req.mrope_position_delta,
-                        context_len=num_computed_tokens +
-                        prompt_part_len,
-                        seq_len=num_computed_tokens +
-                        prompt_part_len +
-                        completion_part_len,
-                    )
+                if vllm_version_is("0.9.1"):
+                    self.mrope_positions_cpu[:, dst_start:dst_end] = \
+                        MRotaryEmbedding.get_next_input_positions_tensor(
+                            req.mrope_position_delta,
+                            context_len=num_computed_tokens +
+                            prompt_part_len,
+                            seq_len=num_computed_tokens +
+                            prompt_part_len +
+                            completion_part_len,
+                        )
+                else:
+                    MRotaryEmbedding.get_next_input_positions_tensor(
+                        out=self.mrope_positions_np,
+                        out_offset=dst_start,
+                        mrope_position_delta=req.mrope_position_delta,
+                        context_len=num_computed_tokens + prompt_part_len,
+                        num_new_tokens=completion_part_len,
+                    )
                 mrope_pos_ptr += completion_part_len
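
Semantically, the new call fills `completion_part_len` columns of the positions buffer starting at `dst_start`: for decode tokens, all three MRoPE streams advance together as regular text positions offset by `mrope_position_delta`. A rough sketch of that behavior as I read the new call site (an illustrative stand-in, not the upstream implementation):

```python
import numpy as np

def next_input_positions_inplace(out: np.ndarray, out_offset: int,
                                 mrope_position_delta: int, context_len: int,
                                 num_new_tokens: int) -> None:
    # Decode-phase MRoPE: temporal/height/width positions are identical and
    # equal to the text position shifted by the request's position delta.
    values = np.arange(context_len, context_len + num_new_tokens,
                       dtype=out.dtype) + mrope_position_delta
    out[:, out_offset:out_offset + num_new_tokens] = values

# Fill 3 new decode positions starting at column 5 of a (3, N) buffer.
buf = np.zeros((3, 16), dtype=np.int64)
next_input_positions_inplace(buf, out_offset=5, mrope_position_delta=-2,
                             context_len=10, num_new_tokens=3)
# buf[:, 5:8] is now [8, 9, 10] on every row.
```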