From 9b3a484b46740c23d4f71a781b10576c37ccc569 Mon Sep 17 00:00:00 2001
From: Angazenn <92204292+Angazenn@users.noreply.github.com>
Date: Sat, 22 Nov 2025 17:33:12 +0800
Subject: [PATCH] [BugFix] Fix some issues caused by the ascending order of
 cudagraph_capture_sizes (#4338)

### What this PR does / why we need it?
In [#26016](https://github.com/vllm-project/vllm/pull/26016), vLLM changed
`cudagraph_capture_sizes` to be in ascending order. This PR fixes the related
issues caused by that change.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main: https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379

---------

Signed-off-by: Angazenn
---
 tests/e2e/multicard/test_aclgraph_capture_replay.py | 2 +-
 vllm_ascend/utils.py                                | 8 +++++++-
 vllm_ascend/worker/model_runner_v1.py               | 6 +++++-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/e2e/multicard/test_aclgraph_capture_replay.py b/tests/e2e/multicard/test_aclgraph_capture_replay.py
index f4dd4965..d8c0e2ee 100644
--- a/tests/e2e/multicard/test_aclgraph_capture_replay.py
+++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -109,7 +109,7 @@ def _run_worker_process(
     llm = LLM(
         model=model_path,
         quantization="ascend" if "W8A8" in model_path else None,
-        # enable_expert_parallel=True if "DeepSeek" in model_path else False,
+        enable_expert_parallel=True if "DeepSeek" in model_path else False,
         trust_remote_code=True,
     )

diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 8dc60911..2235cc07 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -433,7 +433,13 @@ def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
         cudagraph_capture_sizes += list(
             range(256, max_cudagraph_capture_size + 1, 16))

-    if sorted(cudagraph_capture_sizes, reverse=True) == \
+    if vllm_version_is("0.11.0"):
+        target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes,
+                                                reverse=True)
+    else:
+        # In newer versions, vLLM keeps cudagraph_capture_sizes in ascending order.
+        target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
+    if target_cudagraph_capture_sizes == \
             vllm_config.compilation_config.cudagraph_capture_sizes:
         return True

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 544b3edc..c91abc54 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -664,7 +664,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
         # the max number of tokens in graph is min(max_num_seqs * uniform_decode_query_len, 512).
         if self.compilation_config.cudagraph_capture_sizes:
-            max_num_tokens = self.compilation_config.cudagraph_capture_sizes[0]
+            if vllm_version_is("0.11.0"):
+                max_num_tokens = self.compilation_config.cudagraph_capture_sizes[
+                    0]
+            else:
+                max_num_tokens = self.compilation_config.max_cudagraph_capture_size
         else:
             # NOTE: To save memory, we cap the max number of tokens to 512.
             max_num_tokens = min(
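
For context, here is a minimal, self-contained sketch of the ordering assumption this patch fixes. It is illustrative only: the helper `max_capture_size` and the example size list are hypothetical, not code from vLLM or vllm-ascend. Code that indexed `cudagraph_capture_sizes[0]` to get the largest capture size was correct under the old descending order, but once #26016 made the list ascending, index `[0]` silently returns the smallest size instead.

```python
# Sketch of the ordering bug (hypothetical helper, not vllm-ascend code):
# under the old descending convention the max capture size was element [0];
# under the new ascending convention it is the last element.

def max_capture_size(capture_sizes: list[int], ascending: bool) -> int:
    """Return the largest capture size under either ordering convention."""
    if not capture_sizes:
        raise ValueError("no capture sizes configured")
    return capture_sizes[-1] if ascending else capture_sizes[0]

sizes_desc = [512, 256, 128, 64, 32, 16, 8, 4, 2, 1]  # old vLLM ordering
sizes_asc = sorted(sizes_desc)                        # new vLLM ordering

assert max_capture_size(sizes_desc, ascending=False) == 512
assert max_capture_size(sizes_asc, ascending=True) == 512
# The bug: applying the old [0] assumption to the ascending list.
assert sizes_asc[0] == 1  # now the *smallest* size, not the largest
```

On newer vLLM, the patch sidesteps the ordering assumption entirely by reading `compilation_config.max_cudagraph_capture_size` instead of indexing into the list.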