[Fix] Synchronize the host query_start_loc with device values to prevent shape mismatches (#5134)
### What this PR does / why we need it?
Synchronize the host query_start_loc with device values to prevent shape
mismatches when async scheduling is not enabled.
### Does this PR introduce _any_ user-facing change?
None.
### How was this patch tested?
None.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
This commit is contained in:
@@ -779,9 +779,9 @@ class MtpProposer(Proposer):
         hidden_states = torch.ops.vllm.maybe_pad_and_reduce(
             hidden_states)
 
-        if self.use_async_scheduling and attn_metadata[
-                layer_name].decode is not None:
-            for layer_name in self.attn_layer_name:
+        for layer_name in self.attn_layer_name:
+            if self.use_async_scheduling and attn_metadata[
+                    layer_name].decode is not None:
                 actual_size = len(attn_metadata[layer_name].decode.
                                   actual_seq_lengths_q)
 
|||||||
@@ -1014,17 +1014,14 @@ class NPUModelRunner(GPUModelRunner):
             num_reqs_padded = num_input_tokens // self.uniform_decode_query_len
             pad_size = num_reqs_padded - num_reqs
             if pad_size > 0:
-                last_query_loc = self.query_start_loc.gpu[num_reqs]
+                last_query_loc = self.query_start_loc.np[num_reqs]
 
-                steps = torch.arange(1,
-                                     pad_size + 1,
-                                     device=self.device,
-                                     dtype=self.query_start_loc.gpu.dtype)
-                fill_values = last_query_loc + (
-                    steps * self.uniform_decode_query_len)
-
-                self.query_start_loc.gpu[num_reqs + 1:num_reqs_padded +
-                                         1] = fill_values
+                self.query_start_loc.np[
+                    num_reqs + 1:num_reqs_padded + 1] = self.arange_np[
+                        1:pad_size +
+                        1] * self.uniform_decode_query_len + last_query_loc
+                self.query_start_loc.copy_to_gpu(num_reqs_padded + 1)
+
             # So we are trying to simulate the behavior of GPUModelRunner's
             # prepare_inputs for uniform decode mode by padding query_start_loc
             num_reqs = num_reqs_padded
Reference in New Issue
Block a user