From cb2fbf7df2bacca85f26643f225e757554c73146 Mon Sep 17 00:00:00 2001
From: hwhaokun
Date: Sat, 27 Dec 2025 10:36:59 +0800
Subject: [PATCH] [bugfix] solve dp scenario Host-Device sync (#5298)

### What this PR does / why we need it?
In the speculative decoding scenario, the original code performs
Host-Device synchronization, which slows down the main model's
execution speed.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: hwhaokun
Co-authored-by: realliujiaxu
---
 vllm_ascend/worker/model_runner_v1.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index b01c8f4d..0083a407 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1863,10 +1863,11 @@ class NPUModelRunner(GPUModelRunner):
         # QUESTION: Why do we separately set query_start_loc for spec in the first place?
         # While in _prepare_inputs we don't?
         if self.speculative_config:
-            self.query_start_loc.gpu[:num_reqs + 1] = torch.tensor(
+            self.query_start_loc.cpu[:num_reqs + 1] = torch.tensor(
                 [0] + self.actual_seq_lengths_q[:num_reqs],
-                device=self.device,
+                device="cpu",
                 dtype=torch.int32)
+            self.query_start_loc.copy_to_gpu()
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
             query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +