diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 5cf6bb3..1ddc7bb 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -274,6 +274,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): dtype=torch.int64, device="cpu", pin_memory=True) + self.mrope_positions_np = self.mrope_positions_cpu.numpy() if self.is_multimodal_model: self.inputs_embeds = torch.zeros( @@ -793,14 +794,23 @@ class NPUModelRunner(LoRAModelRunnerMixin): dst_start = mrope_pos_ptr dst_end = mrope_pos_ptr + completion_part_len - self.mrope_positions_cpu[:, dst_start:dst_end] = \ + if vllm_version_is("0.9.1"): + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + MRotaryEmbedding.get_next_input_positions_tensor( + req.mrope_position_delta, + context_len=num_computed_tokens + + prompt_part_len, + seq_len=num_computed_tokens + + prompt_part_len + + completion_part_len, + ) + else: MRotaryEmbedding.get_next_input_positions_tensor( - req.mrope_position_delta, - context_len=num_computed_tokens + - prompt_part_len, - seq_len=num_computed_tokens + - prompt_part_len + - completion_part_len, + out=self.mrope_positions_np, + out_offset=dst_start, + mrope_position_delta=req.mrope_position_delta, + context_len=num_computed_tokens + prompt_part_len, + num_new_tokens=completion_part_len, ) mrope_pos_ptr += completion_part_len