[Bugfix] Remove use_aclgraph in mtp_proposer and use use_cuda_graph (#6032)

### What this PR does / why we need it?
This PR aims to remove `use_aclgraph` and use `use_cuda_graph` just the
same as eagle_proposer in mtp_proposer. The reasons for these changes are
described below.

There is a scenario where `use_aclgraph=True` while
`use_cuda_graph=False`, e.g. enabling `async_scheduling=True`. When
using deepseek v3.2, `common_attn_metadata.num_input_tokens` is
important and it should be the same as `num_input_tokens` entering into
model. In the above scenario, `use_aclgraph` accidentally pads
`num_tokens` to `num_input_tokens`, coinciding with
`common_attn_metadata.num_input_tokens`. But later eager mode is
triggered and actually we don't need padding. That means that the code
logic is incorrect but the running output looks fine.

However, `common_attn_metadata.num_input_tokens` should mean
`num_input_tokens` entering into model. So we should update
`common_attn_metadata.num_input_tokens = num_input_tokens` after
padding. Therefore, we can safely use normal `use_cuda_graph` instead of
the problematic `use_aclgraph`.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
by ci

- vLLM version: v0.13.0
- vLLM main:
2c24bc6996

Signed-off-by: Zetong Li <slippersss@126.com>
This commit is contained in:
Zetong Li
2026-01-22 21:08:07 +08:00
committed by GitHub
parent 176bfc36bc
commit 63d3921208
2 changed files with 2 additions and 7 deletions

View File

@@ -107,8 +107,6 @@ class EagleProposer(VllmEagleProposer):
self.pcp_rank = self.runner.pcp_rank self.pcp_rank = self.runner.pcp_rank
self.dcp_rank = self.runner.dcp_rank self.dcp_rank = self.runner.dcp_rank
self.use_aclgraph = self.runner._use_aclgraph()
self.full_indices = range( self.full_indices = range(
self.runner.max_num_tokens * self.pcp_size * self.dcp_size + self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
self.pcp_size * self.dcp_size * self.runner.max_num_reqs) self.pcp_size * self.dcp_size * self.runner.max_num_reqs)

View File

@@ -242,14 +242,10 @@ class MtpProposer(EagleProposer):
assert self.runner is not None assert self.runner is not None
# Note(qcs): We may need to refactor these check logics. # Note(qcs): We may need to refactor these check logics.
if self.runner.use_aclgraph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[ if self.use_cuda_graph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
-1]: -1]:
num_input_tokens = self.vllm_config.pad_for_cudagraph( num_input_tokens = self.vllm_config.pad_for_cudagraph(
num_scheduled_tokens) num_scheduled_tokens)
elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[
-1]:
# Acl graph mode, add padding to the batch size
num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
else: else:
# Eager mode, no padding needed # Eager mode, no padding needed
num_input_tokens = num_tokens num_input_tokens = num_tokens
@@ -293,6 +289,7 @@ class MtpProposer(EagleProposer):
# update the graph_pad_size in common_attn_metadata, to tell the # update the graph_pad_size in common_attn_metadata, to tell the
# builder padding some elements. # builder padding some elements.
common_attn_metadata.graph_pad_size = graph_pad_size common_attn_metadata.graph_pad_size = graph_pad_size
common_attn_metadata.num_input_tokens = num_input_tokens
builder = self.runner.attn_groups[0][0].get_metadata_builder() builder = self.runner.attn_groups[0][0].get_metadata_builder()
attn_metadata_mtp = builder.build(0, common_attn_metadata, attn_metadata_mtp = builder.build(0, common_attn_metadata,
self.runner.get_model()) self.runner.get_model())