From ea6206bb183e555345e0fbc38c7cd0a1634fe21f Mon Sep 17 00:00:00 2001 From: Qiu Date: Mon, 22 Dec 2025 03:08:27 -0300 Subject: [PATCH] [bugfix][ACLGraph][MTP] deletes `cudagraph_batch_sizes` in `MtpProposer` (#5183) ### What this PR does / why we need it? This PR deletes `cudagraph_batch_sizes` in `MtpProposer` and reuses the one in `NPUModelRunner`. During our deployment of DeepSeek-V3.2 with MTP across machines 2P2D and conducting AISBench stress testing, an error occurred (see below). After investigation, we found that `compilation_config.cudagraph_capture_sizes` is modified by `adjust_cudagraph_sizes_for_spec_decode` in `NPUModelRunner`. This modification only updates `cudagraph_batch_sizes` in `NPUModelRunner` but is not synchronized to `MtpProposer`. After discussion (CC @yiz-liu), we believe it is unnecessary to maintain `cudagraph_batch_sizes` in `MtpProposer`; it should directly use the variable from `NPUModelRunner`. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: QiuChunshuo --- tests/ut/spec_decode/test_mtp_proposer.py | 1 - vllm_ascend/spec_decode/mtp_proposer.py | 11 ++++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/ut/spec_decode/test_mtp_proposer.py b/tests/ut/spec_decode/test_mtp_proposer.py index c52ef569..3047b3d9 100644 --- a/tests/ut/spec_decode/test_mtp_proposer.py +++ b/tests/ut/spec_decode/test_mtp_proposer.py @@ -97,7 +97,6 @@ class TestMtpProposer: proposer = MtpProposer(vllm_config, torch.device("cpu"), runner) assert proposer.use_aclgraph is True - assert proposer.cudagraph_batch_sizes == [1, 2, 4, 8] @patch("vllm.config.get_layers_from_vllm_config") @patch("vllm_ascend.spec_decode.mtp_proposer.get_model_loader") diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 4deef9a2..14a61a79 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++
b/vllm_ascend/spec_decode/mtp_proposer.py @@ -107,11 +107,6 @@ class MtpProposer(Proposer): self.use_aclgraph = self.runner._use_aclgraph() - self.cudagraph_batch_sizes = (list( - sorted( - self.vllm_config.compilation_config.cudagraph_capture_sizes)) - if self.use_aclgraph else []) - # persistent buffers for aclgraph graph self.input_ids = torch.zeros(self.max_num_tokens, dtype=torch.int32, @@ -697,11 +692,13 @@ class MtpProposer(Proposer): assert self.runner is not None - if self.runner.use_aclgraph and num_scheduled_tokens <= self.cudagraph_batch_sizes[ + # Note(qcs): We may need to refactor these check logics. + if self.runner.use_aclgraph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[ -1]: num_input_tokens = self.vllm_config.pad_for_cudagraph( num_scheduled_tokens) - elif self.use_aclgraph and num_tokens <= self.cudagraph_batch_sizes[-1]: + elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[ + -1]: # Acl graph mode, add padding to the batch size num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) else: