[BugFix] Refactor ACL graph size adjustment for speculative decoding (#4640)

### What this PR does / why we need it?

Move the logic for adjusting ACL graph capture sizes for speculative decoding from the generic utility module into a dedicated method within the compilation configuration. This change improves code organization and encapsulation by making the compilation configuration responsible for managing its own state. The model runner now triggers this adjustment directly, providing the necessary context.

### Does this PR introduce _any_ user-facing change?

None.

### How was this patch tested?

None.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-07 17:32:45 +08:00
parent 688b1332da
commit 8fdb689a32
2 changed files with 12 additions and 31 deletions
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -571,26 +571,6 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
            vllm_config.model_config.architectures[0], num_hidden_layers,
            len(original_sizes))

-    # default or defined cudagraph_capture_sizes may not consider num_speculative_tokens>1 scenario
-    # the maximum size cudagraph_capture_sizes[0] should be greater or equal than
-    # (num_speculative_tokens+1)*max_num_seqs, otherwise draft model will run in eager mode
-    if vllm_config.speculative_config is not None and \
-        vllm_config.speculative_config.num_speculative_tokens > 1:
-        num_speculative_tokens = vllm_config.speculative_config.num_speculative_tokens
-        max_num_seqs = vllm_config.scheduler_config.max_num_seqs
-        original_sizes, compilation_config.cudagraph_capture_sizes = \
-            compilation_config.cudagraph_capture_sizes, None
-        assert len(original_sizes) > 0
-        if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
-            enlarged_sizes = [(num_speculative_tokens + 1) * size
-                              for size in original_sizes]
-            update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
-            logger.info(
-                "Adjusted ACL graphs: %s → %s for speculative decoding",
-                original_sizes, enlarged_sizes)
-        else:
-            compilation_config.cudagraph_capture_sizes = original_sizes
-

 # TODO(wxy): Move to ops module
 def dispose_tensor(x: torch.Tensor):