[BugFix] Fix some issues caused by the ascending order of cudagraph_capture_sizes (#4338)
### What this PR does / why we need it?
In [#26016](https://github.com/vllm-project/vllm/pull/26016), vLLM
changed `cudagraph_capture_sizes` to be in ascending order. This PR
fixes the related issues caused by that change.
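To illustrate the practical effect of that ordering flip, here is a minimal sketch (the size lists below are example values, not the exact vLLM defaults):

```python
# Illustrative sketch: how the ordering change affects consumers of
# cudagraph_capture_sizes. The lists below are example values only.

# vLLM v0.11.0: sizes were stored in descending order.
old_sizes = [512, 256, 128, 64, 32, 16, 8, 4, 2, 1]
assert old_sizes[0] == max(old_sizes)  # index 0 was the max size

# vLLM main (after #26016): sizes are stored in ascending order.
new_sizes = sorted(old_sizes)
assert new_sizes[0] == min(new_sizes)  # index 0 is now the *smallest* size

# Any code that read sizes[0] expecting the maximum capture size must
# be updated, e.g. to max(sizes) or to a dedicated config field.
max_size = max(new_sizes)
```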
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main: 2918c1b49c
---------
Signed-off-by: Angazenn <supperccell@163.com>
```diff
@@ -433,7 +433,13 @@ def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
         cudagraph_capture_sizes += list(
             range(256, max_cudagraph_capture_size + 1, 16))
 
-    if sorted(cudagraph_capture_sizes, reverse=True) == \
+    if vllm_version_is("0.11.0"):
+        target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes,
+                                                reverse=True)
+    else:
+        # In newer versions, vLLM uses ascending order for cudagraph_capture_sizes.
+        target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
+    if target_cudagraph_capture_sizes == \
             vllm_config.compilation_config.cudagraph_capture_sizes:
         return True
```
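A self-contained sketch of the version gate above, with `vllm_version_is` stubbed as a stand-in for the repository's version helper (the size lists are example values):

```python
# Stub of vllm_version_is for illustration only.
def vllm_version_is(version: str) -> bool:
    return version == "0.11.0"

cudagraph_capture_sizes = [1, 2, 4, 8, 16]  # locally computed defaults
configured_sizes = [16, 8, 4, 2, 1]         # descending, as vLLM v0.11.0 stores them

if vllm_version_is("0.11.0"):
    # v0.11.0 keeps capture sizes in descending order.
    target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes,
                                            reverse=True)
else:
    # Newer vLLM keeps them in ascending order.
    target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)

print(target_cudagraph_capture_sizes == configured_sizes)  # True under v0.11.0
```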
```diff
@@ -664,7 +664,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
         # the max number of tokens in graph is min(max_num_seqs * uniform_decode_query_len, 512).
         if self.compilation_config.cudagraph_capture_sizes:
-            max_num_tokens = self.compilation_config.cudagraph_capture_sizes[0]
+            if vllm_version_is("0.11.0"):
+                max_num_tokens = self.compilation_config.cudagraph_capture_sizes[
+                    0]
+            else:
+                max_num_tokens = self.compilation_config.max_cudagraph_capture_size
         else:
             # NOTE: To save memory, we cap the max number of tokens to 512.
             max_num_tokens = min(
```
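For downstream code that needs the largest capture size regardless of vLLM version, a hypothetical helper along the lines of the branch above (`max_capture_size` and the `SimpleNamespace` config are illustrative stand-ins, not repository APIs):

```python
from types import SimpleNamespace

def max_capture_size(compilation_config, on_v0_11_0: bool) -> int:
    """Return the largest cudagraph capture size across vLLM versions."""
    sizes = compilation_config.cudagraph_capture_sizes
    if on_v0_11_0:
        # v0.11.0 stores sizes in descending order: index 0 is the max.
        return sizes[0]
    # Newer vLLM stores sizes in ascending order; max(sizes) matches what
    # compilation_config.max_cudagraph_capture_size reports in the diff above.
    return max(sizes)

cfg = SimpleNamespace(cudagraph_capture_sizes=[1, 2, 4, 8, 16])
print(max_capture_size(cfg, on_v0_11_0=False))  # -> 16
```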