[Bugfix] Fix DeepSeek FIA error in async_scheduling with mtp (#5046)
### What this PR does / why we need it?
When async_scheduling is enabled in a large-scale EP (expert parallelism) scenario, the MTP module
falls back to eager mode, which results in a mismatch between
`seq_lens_list` and `block_table`. So add the appropriate check before the draft model
forward pass.
Fixes #4986
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: hust17yixuan <303660421@qq.com>
This commit is contained in:
@@ -725,7 +725,6 @@ class MtpProposer(Proposer):
|
||||
has_lora = len(self.runner.input_batch.lora_id_to_lora_request) > 0
|
||||
aclgraph_runtime_mode, batch_descriptor = \
|
||||
self.runner.cudagraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
|
||||
original_aclgraph_runtime_mode = aclgraph_runtime_mode
|
||||
if self.use_async_scheduling:
|
||||
# there is synchronization between mtp steps when enabling aclgraph,
|
||||
# disable aclgraph when use async scheduling to avoid the
|
||||
@@ -779,8 +778,8 @@ class MtpProposer(Proposer):
|
||||
hidden_states = torch.ops.vllm.maybe_pad_and_reduce(
|
||||
hidden_states)
|
||||
|
||||
if original_aclgraph_runtime_mode == CUDAGraphMode.FULL and \
|
||||
self.use_async_scheduling and attn_metadata[layer_name].decode is not None:
|
||||
if self.use_async_scheduling and attn_metadata[
|
||||
layer_name].decode is not None:
|
||||
for layer_name in self.attn_layer_name:
|
||||
actual_size = len(attn_metadata[layer_name].decode.
|
||||
actual_seq_lengths_q)
|
||||
|
||||
Reference in New Issue
Block a user