From 8fdb689a32ddc86de8f035ccb000065a6e3ed8d5 Mon Sep 17 00:00:00 2001
From: Yizhou <136800916+yiz-liu@users.noreply.github.com>
Date: Sun, 7 Dec 2025 17:32:45 +0800
Subject: [PATCH] [BugFix] Refactor ACL graph size adjustment for speculative
 decoding (#4640)

### What this PR does / why we need it?
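Remove the logic for adjusting ACL graph capture sizes for speculative
decoding from the generic utility module (`update_aclgraph_sizes` in
`vllm_ascend/utils.py`) and delegate it to a dedicated method of the
compilation configuration, `adjust_cudagraph_sizes_for_spec_decode`.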

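This change improves code organization and encapsulation by making the
compilation configuration responsible for managing its own state. The
model runner now triggers this adjustment directly when the decode graph
mode is FULL with a separate routine and `uniform_decode_query_len > 1`,
passing the uniform decode query length and the tensor-parallel size as
context. The model runner's explicit check that every decode capture
size is a multiple of `uniform_decode_query_len` is removed, and decode
capture sizes are now iterated in reversed order instead of being sorted.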

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None.

- vLLM version: v0.12.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 vllm_ascend/utils.py                  | 20 --------------------
 vllm_ascend/worker/model_runner_v1.py | 23 ++++++++++++-----------
 2 files changed, 12 insertions(+), 31 deletions(-)

diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 16c0f68d..00014c80 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -571,26 +571,6 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
             vllm_config.model_config.architectures[0], num_hidden_layers,
             len(original_sizes))
 
-    # default or defined cudagraph_capture_sizes may not consider num_speculative_tokens>1 scenario
-    # the maximum size cudagraph_capture_sizes[0] should be greater or equal than
-    # (num_speculative_tokens+1)*max_num_seqs, otherwise draft model will run in eager mode
-    if vllm_config.speculative_config is not None and \
-        vllm_config.speculative_config.num_speculative_tokens > 1:
-        num_speculative_tokens = vllm_config.speculative_config.num_speculative_tokens
-        max_num_seqs = vllm_config.scheduler_config.max_num_seqs
-        original_sizes, compilation_config.cudagraph_capture_sizes = \
-            compilation_config.cudagraph_capture_sizes, None
-        assert len(original_sizes) > 0
-        if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
-            enlarged_sizes = [(num_speculative_tokens + 1) * size
-                              for size in original_sizes]
-            update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
-            logger.info(
-                "Adjusted ACL graphs: %s → %s for speculative decoding",
-                original_sizes, enlarged_sizes)
-        else:
-            compilation_config.cudagraph_capture_sizes = original_sizes
-
 
 # TODO(wxy): Move to ops module
 def dispose_tensor(x: torch.Tensor):
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 907a6e07..04ff0287 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -4027,6 +4027,16 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
                              "; please try cudagraph_mode=PIECEWISE, "
                              "and make sure compilation level is piecewise")
 
+        if (aclgraph_mode.decode_mode() == CUDAGraphMode.FULL
+                and aclgraph_mode.separate_routine()
+                and self.uniform_decode_query_len > 1):
+            self.compilation_config.adjust_cudagraph_sizes_for_spec_decode(
+                self.uniform_decode_query_len,
+                self.parallel_config.tensor_parallel_size)
+            capture_sizes = self.compilation_config.cudagraph_capture_sizes
+            self.aclgraph_batch_sizes = (capture_sizes
+                                         if capture_sizes is not None else [])
+
         self.aclgraph_dispatcher.initialize_cudagraph_keys(
             self.compilation_config.cudagraph_mode,
             self.uniform_decode_query_len)
@@ -4122,17 +4132,8 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
                     x for x in self.aclgraph_batch_sizes if x <= max_num_tokens
                     and x >= self.uniform_decode_query_len
                 ]
-                compilation_cases_decode = sorted(decode_cudagraph_batch_sizes)
-                # TODO: refactor this when vLLM supports mtp>1
-                if not all(x % self.uniform_decode_query_len == 0
-                           for x in decode_cudagraph_batch_sizes):
-                    raise ValueError(
-                        "In the MTP fullgraph scenario, each graph size must be an integer multiple of "
-                        f"(num_speculative_tokens + 1): {self.uniform_decode_query_len}. "
-                        f"Please modify the cudagraph_capture_sizes variable to be integer multiple of {self.uniform_decode_query_len}, "
-                        f"while ensuring the maximum cudagraph_capture_sizes does not exceed max_num_seqs * (num_speculative_tokens + 1): {max_num_tokens}. "
-                        "For example, with MTP=2 and max_num_seqs=16, we recommend setting cudagraph_capture_sizes to [48]."
-                    )
+                compilation_cases_decode = list(
+                    reversed(decode_cudagraph_batch_sizes))
                 self._capture_aclgraphs(
                     compilation_cases=compilation_cases_decode,
                     aclgraph_runtime_mode=CUDAGraphMode.FULL,