[bugfix][ACLGraph][MTP] deletes cudagraph_batch_sizes in MtpProposer (#5183)
### What this PR does / why we need it?
This PR deletes `cudagraph_batch_sizes` in `MtpProposer` and reuses the
one in `NPUModelRunner`.
While deploying DeepSeek-V3.2 with MTP in a multi-machine 2P2D setup and
running AISBench stress tests, we hit an error. Investigation showed that
`adjust_cudagraph_sizes_for_spec_decode` in `NPUModelRunner` modifies
`compilation_config.cudagraph_capture_sizes`. That modification updates
`cudagraph_batch_sizes` in `NPUModelRunner` only and is never synchronized
to the copy held by `MtpProposer`. After discussion (CC @yiz-liu), we
concluded that `MtpProposer` does not need to maintain its own
`cudagraph_batch_sizes`; it should read the variable from `NPUModelRunner`
directly.
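
To make the failure mode concrete, here is a minimal, self-contained sketch (hypothetical `Runner`/`Proposer` classes and a simplified adjustment rule, not the actual vLLM-Ascend code) of how a copy of the capture sizes cached at construction time goes stale once the runner adjusts them:

```python
# Minimal sketch of the stale-copy bug; class names and the adjustment
# rule are illustrative, not the real vLLM-Ascend implementations.

class Runner:
    def __init__(self, capture_sizes: list[int]):
        self.cudagraph_batch_sizes = sorted(capture_sizes)

    def adjust_cudagraph_sizes_for_spec_decode(self, num_spec_tokens: int) -> None:
        # Assumed adjustment for illustration: scale each captured size
        # by (1 + num_spec_tokens) to account for speculative tokens.
        self.cudagraph_batch_sizes = [
            s * (1 + num_spec_tokens) for s in self.cudagraph_batch_sizes
        ]

class ProposerBefore:
    def __init__(self, runner: Runner):
        # Bug: a snapshot taken here never sees later adjustments.
        self.cudagraph_batch_sizes = list(runner.cudagraph_batch_sizes)

class ProposerAfter:
    def __init__(self, runner: Runner):
        # Fix: hold the runner and always read through it.
        self.runner = runner

runner = Runner([1, 2, 4, 8])
before, after = ProposerBefore(runner), ProposerAfter(runner)
runner.adjust_cudagraph_sizes_for_spec_decode(num_spec_tokens=1)

print(before.cudagraph_batch_sizes[-1])        # 8  -- stale upper bound
print(after.runner.cudagraph_batch_sizes[-1])  # 16 -- reflects the adjustment
```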
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
```diff
@@ -97,7 +97,6 @@ class TestMtpProposer:
         proposer = MtpProposer(vllm_config, torch.device("cpu"), runner)
 
         assert proposer.use_aclgraph is True
-        assert proposer.cudagraph_batch_sizes == [1, 2, 4, 8]
 
     @patch("vllm.config.get_layers_from_vllm_config")
     @patch("vllm_ascend.spec_decode.mtp_proposer.get_model_loader")
```
```diff
@@ -107,11 +107,6 @@ class MtpProposer(Proposer):
 
         self.use_aclgraph = self.runner._use_aclgraph()
 
-        self.cudagraph_batch_sizes = (list(
-            sorted(
-                self.vllm_config.compilation_config.cudagraph_capture_sizes))
-                                      if self.use_aclgraph else [])
-
         # persistent buffers for aclgraph graph
         self.input_ids = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int32,
```
```diff
@@ -697,11 +692,13 @@ class MtpProposer(Proposer):
 
         assert self.runner is not None
 
-        if self.runner.use_aclgraph and num_scheduled_tokens <= self.cudagraph_batch_sizes[
+        # Note(qcs): We may need to refactor these check logics.
+        if self.runner.use_aclgraph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
                 -1]:
             num_input_tokens = self.vllm_config.pad_for_cudagraph(
                 num_scheduled_tokens)
-        elif self.use_aclgraph and num_tokens <= self.cudagraph_batch_sizes[-1]:
+        elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[
+                -1]:
             # Acl graph mode, add padding to the batch size
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
         else:
```
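
For context on the guard `num_tokens <= cudagraph_batch_sizes[-1]` above: padding for a captured graph means rounding the token count up to the nearest capture size, so the largest size is the hard upper bound. Below is a hedged sketch of that rounding (a standalone helper written for illustration, not the actual `pad_for_cudagraph` implementation):

```python
import bisect

def pad_to_capture_size(num_tokens: int, capture_sizes: list[int]) -> int:
    """Round num_tokens up to the smallest capture size >= num_tokens.

    Assumes capture_sizes is sorted ascending and that the caller has
    already checked num_tokens <= capture_sizes[-1], mirroring the
    guard in the hunk above.
    """
    idx = bisect.bisect_left(capture_sizes, num_tokens)
    return capture_sizes[idx]

assert pad_to_capture_size(3, [1, 2, 4, 8]) == 4
assert pad_to_capture_size(8, [1, 2, 4, 8]) == 8
```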