From 9b3a484b46740c23d4f71a781b10576c37ccc569 Mon Sep 17 00:00:00 2001
From: Angazenn <92204292+Angazenn@users.noreply.github.com>
Date: Sat, 22 Nov 2025 17:33:12 +0800
Subject: [PATCH] [BugFix] Fix some issues caused by the ascending order of
 cudagraph_capture_sizes (#4338)

### What this PR does / why we need it?
In [#26016](https://github.com/vllm-project/vllm/pull/26016), vLLM changed
`cudagraph_capture_sizes` to be in ascending order. This PR fixes the related
issues caused by that change.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main: https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379

---------

Signed-off-by: Angazenn
---
 tests/e2e/multicard/test_aclgraph_capture_replay.py | 2 +-
 vllm_ascend/utils.py                                | 8 +++++++-
 vllm_ascend/worker/model_runner_v1.py               | 6 +++++-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/e2e/multicard/test_aclgraph_capture_replay.py b/tests/e2e/multicard/test_aclgraph_capture_replay.py
index f4dd4965..d8c0e2ee 100644
--- a/tests/e2e/multicard/test_aclgraph_capture_replay.py
+++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -109,7 +109,7 @@ def _run_worker_process(
     llm = LLM(
         model=model_path,
         quantization="ascend" if "W8A8" in model_path else None,
-        # enable_expert_parallel=True if "DeepSeek" in model_path else False,
+        enable_expert_parallel=True if "DeepSeek" in model_path else False,
         trust_remote_code=True,
     )

diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 8dc60911..2235cc07 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -433,7 +433,13 @@ def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
         cudagraph_capture_sizes += list(
             range(256, max_cudagraph_capture_size + 1, 16))

-    if sorted(cudagraph_capture_sizes, reverse=True) == \
+    if vllm_version_is("0.11.0"):
+        target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes,
+                                                reverse=True)
+    else:
+        # In newer versions, vLLM keeps cudagraph_capture_sizes in ascending order.
+        target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
+    if target_cudagraph_capture_sizes == \
             vllm_config.compilation_config.cudagraph_capture_sizes:
         return True

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 544b3edc..c91abc54 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -664,7 +664,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
         # the max number of tokens in graph is min(max_num_seqs * uniform_decode_query_len, 512).
         if self.compilation_config.cudagraph_capture_sizes:
-            max_num_tokens = self.compilation_config.cudagraph_capture_sizes[0]
+            if vllm_version_is("0.11.0"):
+                max_num_tokens = self.compilation_config.cudagraph_capture_sizes[
+                    0]
+            else:
+                max_num_tokens = self.compilation_config.max_cudagraph_capture_size
         else:
             # NOTE: To save memory, we cap the max number of tokens to 512.
             max_num_tokens = min(
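
For context, here is a minimal, self-contained sketch of the ordering assumption this patch fixes. It is illustrative only: the helper `max_capture_size` and the example size list are hypothetical, not code from vLLM or vllm-ascend. Code that indexed `cudagraph_capture_sizes[0]` to get the largest capture size was correct under the old descending order, but once #26016 made the list ascending, index `[0]` silently returns the smallest size instead.

```python
# Sketch of the ordering bug (hypothetical helper, not vllm-ascend code):
# under the old descending convention the max capture size was element [0];
# under the new ascending convention it is the last element.

def max_capture_size(capture_sizes: list[int], ascending: bool) -> int:
    """Return the largest capture size under either ordering convention."""
    if not capture_sizes:
        raise ValueError("no capture sizes configured")
    return capture_sizes[-1] if ascending else capture_sizes[0]

sizes_desc = [512, 256, 128, 64, 32, 16, 8, 4, 2, 1]  # old vLLM ordering
sizes_asc = sorted(sizes_desc)                        # new vLLM ordering

assert max_capture_size(sizes_desc, ascending=False) == 512
assert max_capture_size(sizes_asc, ascending=True) == 512
# The bug: applying the old [0] assumption to the ascending list.
assert sizes_asc[0] == 1  # now the *smallest* size, not the largest
```

On newer vLLM, the patch sidesteps the ordering assumption entirely by reading `compilation_config.max_cudagraph_capture_size` instead of indexing into the list.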