From 43d974c6f74e4aff4083e76d14103a22b8b30c90 Mon Sep 17 00:00:00 2001 From: Yizhou <136800916+yiz-liu@users.noreply.github.com> Date: Wed, 17 Dec 2025 23:50:12 +0800 Subject: [PATCH] [Fix] Synchronize the host query_start_loc with device values to prevent shape mismatches (#5134) ### What this PR does / why we need it? Synchronize the host query_start_loc with device values to prevent shape mismatches when async scheduling is not enabled. ### Does this PR introduce _any_ user-facing change? None. ### How was this patch tested? None. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: Yizhou Liu --- vllm_ascend/spec_decode/mtp_proposer.py | 6 +++--- vllm_ascend/worker/model_runner_v1.py | 15 ++++++--------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index b20b73d7..462fa33f 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -779,9 +779,9 @@ class MtpProposer(Proposer): hidden_states = torch.ops.vllm.maybe_pad_and_reduce( hidden_states) - if self.use_async_scheduling and attn_metadata[ layer_name].decode is not None: - for layer_name in self.attn_layer_name: + for layer_name in self.attn_layer_name: + if self.use_async_scheduling and attn_metadata[ layer_name].decode is not None: actual_size = len(attn_metadata[layer_name].decode. 
actual_seq_lengths_q) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 4c3c42d9..830270ea 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1014,17 +1014,14 @@ class NPUModelRunner(GPUModelRunner): num_reqs_padded = num_input_tokens // self.uniform_decode_query_len pad_size = num_reqs_padded - num_reqs if pad_size > 0: - last_query_loc = self.query_start_loc.gpu[num_reqs] + last_query_loc = self.query_start_loc.np[num_reqs] - steps = torch.arange(1, - pad_size + 1, - device=self.device, - dtype=self.query_start_loc.gpu.dtype) - fill_values = last_query_loc + ( - steps * self.uniform_decode_query_len) + self.query_start_loc.np[ + num_reqs + 1:num_reqs_padded + 1] = self.arange_np[ + 1:pad_size + + 1] * self.uniform_decode_query_len + last_query_loc + self.query_start_loc.copy_to_gpu(num_reqs_padded + 1) - self.query_start_loc.gpu[num_reqs + 1:num_reqs_padded + - 1] = fill_values # So we are trying to simulate the behavior of GPUModelRunner's # prepare_inputs for uniform decode mode by padding query_start_loc num_reqs = num_reqs_padded