[Bugfix] Fix DeepSeek FIA error in async_scheduling with mtp (#5046)

### What this PR does / why we need it?
When async_scheduling is enabled in large-scale EP scenarios, the MTP module
falls back to eager mode, which causes a mismatch between seq_lens_list and
block_table. This PR adapts the check before the draft model forward pass
accordingly.
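
The kind of guard described above can be illustrated with a minimal sketch. All names here are hypothetical stand-ins, not the actual vllm-ascend code: the idea is that when async scheduling forces the proposer into eager mode, padded metadata must be trimmed to the actual batch size so seq_lens_list and block_table stay consistent before the draft forward.

```python
# Hypothetical sketch of the metadata guard; names are illustrative,
# not the real vllm-ascend API.
def prepare_draft_inputs(seq_lens_list, block_table, use_async_scheduling):
    """Trim padded metadata so seq_lens_list matches block_table."""
    if use_async_scheduling:
        # Under async scheduling the proposer runs eagerly, so
        # seq_lens_list may still carry padding added for graph capture.
        actual_size = len(block_table)
        seq_lens_list = seq_lens_list[:actual_size]
    assert len(seq_lens_list) == len(block_table), "metadata mismatch"
    return seq_lens_list, block_table
```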

fix #4986 

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: hust17yixuan <303660421@qq.com>
Author: Wang Yixuan
Date: 2025-12-17 09:20:44 +08:00
Committed by: GitHub
Parent: 06f33540c4
Commit: 153eeaa621


```diff
@@ -725,7 +725,6 @@ class MtpProposer(Proposer):
         has_lora = len(self.runner.input_batch.lora_id_to_lora_request) > 0
         aclgraph_runtime_mode, batch_descriptor = \
             self.runner.cudagraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
-        original_aclgraph_runtime_mode = aclgraph_runtime_mode
         if self.use_async_scheduling:
             # there is synchronization between mtp steps when enabling aclgraph,
             # disable aclgraph when use async scheduling to avoid the
@@ -779,8 +778,8 @@ class MtpProposer(Proposer):
             hidden_states = torch.ops.vllm.maybe_pad_and_reduce(
                 hidden_states)
-            if original_aclgraph_runtime_mode == CUDAGraphMode.FULL and \
-                    self.use_async_scheduling and attn_metadata[layer_name].decode is not None:
+            if self.use_async_scheduling and attn_metadata[
+                    layer_name].decode is not None:
                 for layer_name in self.attn_layer_name:
                     actual_size = len(attn_metadata[layer_name].decode.
                                       actual_seq_lengths_q)
```
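
The corrected condition in the second hunk applies whenever async scheduling is enabled and decode metadata exists, no longer gated on whether the original aclgraph mode was FULL. A self-contained sketch of that logic, with stand-in dataclasses rather than vLLM's real metadata types:

```python
# Hypothetical sketch of the corrected per-layer trim; the dataclasses
# below are stand-ins, not vLLM's attention-metadata types.
from dataclasses import dataclass
from typing import Optional


@dataclass
class DecodeMeta:
    actual_seq_lengths_q: list
    seq_lens_list: list


@dataclass
class LayerMeta:
    decode: Optional[DecodeMeta]


def trim_decode_metadata(attn_metadata, layer_names, use_async_scheduling):
    """Trim each layer's decode metadata to the actual batch size."""
    for layer_name in layer_names:
        decode = attn_metadata[layer_name].decode
        # Apply the trim whenever async scheduling is on and decode
        # metadata exists -- not only when the graph mode was FULL.
        if use_async_scheduling and decode is not None:
            actual_size = len(decode.actual_seq_lengths_q)
            decode.seq_lens_list = decode.seq_lens_list[:actual_size]
    return attn_metadata
```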