From 63d3921208202314c11c06cfb8be1525d413f253 Mon Sep 17 00:00:00 2001
From: Zetong Li <48438720+slippersss@users.noreply.github.com>
Date: Thu, 22 Jan 2026 21:08:07 +0800
Subject: [PATCH] [Bugfix] Remove `use_aclgraph` in mtp_proposer and use `use_cuda_graph` (#6032)

### What this PR does / why we need it?

This PR removes `use_aclgraph` from `mtp_proposer` and switches to `use_cuda_graph`, matching `eagle_proposer`. The reasons for this change are described below.

There is a scenario where `use_aclgraph=True` while `use_cuda_graph=False`, e.g. when `async_scheduling=True` is enabled. When running DeepSeek V3.2, `common_attn_metadata.num_input_tokens` matters: it should equal the number of input tokens that actually enters the model. In the scenario above, the `use_aclgraph` branch accidentally pads `num_tokens` to `num_input_tokens`, which happens to coincide with `common_attn_metadata.num_input_tokens`; eager mode is then triggered later, so the padding is not actually needed. In other words, the code logic is incorrect even though the running output looks fine. Since `common_attn_metadata.num_input_tokens` is meant to describe the input entering the model, we now update `common_attn_metadata.num_input_tokens = num_input_tokens` after padding. With that in place, the normal `use_cuda_graph` check can safely replace the problematic `use_aclgraph` one.
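For illustration, here is a minimal, self-contained sketch of the padding decision after this change. The capture sizes, the `pad_for_cudagraph` helper, the simplified `CommonAttnMetadata` dataclass, and the `decide_num_input_tokens` wrapper are hypothetical stand-ins, not the actual vllm-ascend implementation:

```python
# Sketch only: simplified stand-ins for the real runner/config objects.
from dataclasses import dataclass

CUDAGRAPH_BATCH_SIZES = [8, 16, 32, 64]  # hypothetical captured batch sizes


def pad_for_cudagraph(num_tokens: int) -> int:
    """Round up to the smallest captured graph batch size (simplified)."""
    return next(size for size in CUDAGRAPH_BATCH_SIZES if size >= num_tokens)


@dataclass
class CommonAttnMetadata:
    num_input_tokens: int = 0
    graph_pad_size: int = -1


def decide_num_input_tokens(num_scheduled_tokens: int, num_tokens: int,
                            use_cuda_graph: bool,
                            meta: CommonAttnMetadata) -> int:
    if use_cuda_graph and num_scheduled_tokens <= CUDAGRAPH_BATCH_SIZES[-1]:
        # Graph mode: pad up to a captured batch size.
        num_input_tokens = pad_for_cudagraph(num_scheduled_tokens)
    else:
        # Eager mode (e.g. async_scheduling=True): no padding needed.
        num_input_tokens = num_tokens
    # Keep the metadata consistent with what actually enters the model.
    meta.num_input_tokens = num_input_tokens
    return num_input_tokens


# With the graph path disabled, the metadata now records the unpadded count;
# with it enabled, both the model input and the metadata see the padded count.
meta = CommonAttnMetadata()
print(decide_num_input_tokens(10, 10, use_cuda_graph=False, meta=meta))  # 10
print(decide_num_input_tokens(10, 10, use_cuda_graph=True, meta=meta))   # 16
print(meta.num_input_tokens)                                             # 16
```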
### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

By CI.

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/2c24bc6996cb165fce92f780b388a5e39b3f4060

Signed-off-by: Zetong Li
---
 vllm_ascend/spec_decode/eagle_proposer.py | 2 --
 vllm_ascend/spec_decode/mtp_proposer.py   | 7 ++-----
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 7844d183..a8d657bd 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -107,8 +107,6 @@ class EagleProposer(VllmEagleProposer):
         self.pcp_rank = self.runner.pcp_rank
         self.dcp_rank = self.runner.dcp_rank
 
-        self.use_aclgraph = self.runner._use_aclgraph()
-
         self.full_indices = range(
             self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
             self.pcp_size * self.dcp_size * self.runner.max_num_reqs)
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index a5ce1e57..dac8b5d1 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -242,14 +242,10 @@ class MtpProposer(EagleProposer):
 
         assert self.runner is not None
         # Note(qcs): We may need to refactor these check logics.
-        if self.runner.use_aclgraph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
+        if self.use_cuda_graph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
                 -1]:
             num_input_tokens = self.vllm_config.pad_for_cudagraph(
                 num_scheduled_tokens)
-        elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[
-                -1]:
-            # Acl graph mode, add padding to the batch size
-            num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
         else:
             # Eager mode, no padding needed
             num_input_tokens = num_tokens
@@ -293,6 +289,7 @@ class MtpProposer(EagleProposer):
         # update the graph_pad_size in common_attn_metadata, to tell the
         # builder padding some elements.
         common_attn_metadata.graph_pad_size = graph_pad_size
+        common_attn_metadata.num_input_tokens = num_input_tokens
         builder = self.runner.attn_groups[0][0].get_metadata_builder()
         attn_metadata_mtp = builder.build(0, common_attn_metadata,
                                           self.runner.get_model())