From 70713c3fd4693f0a861a3d0b84a96517dced614f Mon Sep 17 00:00:00 2001
From: Qiu
Date: Tue, 14 Apr 2026 22:00:10 +0800
Subject: [PATCH] [cherry-pick][BugFix] Improve max_cudagraph_capture_size validation (#8252)

### What this PR does / why we need it?

This PR improves the validation of `max_cudagraph_capture_size` by comparing it against the potential maximum number of tokens required for decoding, derived from the scheduler configuration. It adds a warning to alert users when the capture size may be insufficient for the workload, which could lead to suboptimal performance.

ref: #8227

### Does this PR introduce _any_ user-facing change?

Yes, a warning log is emitted when `max_cudagraph_capture_size` is smaller than the potential decode workload.

---------

Signed-off-by: QiuChunshuo
---
 vllm_ascend/utils.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index c79a8454..923e1a6f 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -1041,7 +1041,23 @@ def should_skip_allreduce_across_dp_group(vllm_config, is_draft_model: bool = Fa
     # Determine whether decode must use MC2. Use max cudagraph capture size
     # if available, otherwise use the maximal uniform decode token count.
     if compilation_config.cudagraph_capture_sizes:
-        potential_max_tokens = compilation_config.max_cudagraph_capture_size
+        potential_max_tokens = max(
+            compilation_config.max_cudagraph_capture_size,
+            min(
+                vllm_config.scheduler_config.max_num_batched_tokens,
+                vllm_config.scheduler_config.max_num_seqs * uniform_decode_query_len,
+            ),
+        )
+        if potential_max_tokens != compilation_config.max_cudagraph_capture_size:
+            logger.warning_once(
+                "The max_cudagraph_capture_size (%d) is smaller than the potential max tokens required for "
+                "decode (%d). This may lead to suboptimal performance. Consider adjusting "
+                "max_cudagraph_capture_size or scheduler_config (max_num_batched_tokens or max_num_seqs) "
+                "to ensure max_cudagraph_capture_size can accommodate the decode workload. For more details, "
+                "see issue #8240 (https://github.com/vllm-project/vllm-ascend/issues/8240).",
+                compilation_config.max_cudagraph_capture_size,
+                potential_max_tokens,
+            )
     else:
         potential_max_tokens = min(max_num_reqs * uniform_decode_query_len, 512)
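
For illustration, below is a minimal standalone sketch of the check this patch adds, with hypothetical configuration values. The field names (`max_cudagraph_capture_size`, `max_num_batched_tokens`, `max_num_seqs`, `uniform_decode_query_len`) mirror the ones touched by the diff; the `potential_decode_tokens` helper and all concrete numbers are invented for this example only and are not part of the patch.

```python
# Standalone sketch of the validation logic (illustrative values only).
def potential_decode_tokens(max_cudagraph_capture_size: int,
                            max_num_batched_tokens: int,
                            max_num_seqs: int,
                            uniform_decode_query_len: int) -> int:
    """Upper bound on decode tokens per step, floored at the capture size."""
    decode_bound = min(max_num_batched_tokens,
                       max_num_seqs * uniform_decode_query_len)
    return max(max_cudagraph_capture_size, decode_bound)


if __name__ == "__main__":
    # Hypothetical setup: capture size 512, but the scheduler can pack
    # 256 sequences * 4 decode tokens each = 1024 tokens into one step.
    capture_size = 512
    potential = potential_decode_tokens(
        max_cudagraph_capture_size=capture_size,
        max_num_batched_tokens=8192,
        max_num_seqs=256,
        uniform_decode_query_len=4,
    )
    if potential != capture_size:
        # Mirrors the warning path added in the patch: the capture size
        # cannot cover the worst-case decode batch.
        print(f"warning: max_cudagraph_capture_size ({capture_size}) < "
              f"potential decode tokens ({potential})")
```

With these sample numbers the decode bound (1024) exceeds the capture size (512), so the warning fires; raising `max_cudagraph_capture_size` or lowering `max_num_seqs` / `max_num_batched_tokens` would silence it.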