From cbf46fad3c0a8df50d02a1b39952a47e039d3ff8 Mon Sep 17 00:00:00 2001 From: XiaoxinWang <963372609@qq.com> Date: Sun, 22 Mar 2026 10:09:37 +0800 Subject: [PATCH] fixed graph mode bug. (#7460) ### What this PR does / why we need it? In FULL_DECODE_ONLY mode, num_reqs_padded was set to an incorrect value, causing accuracy degradation in Qwen3-Next. Therefore, we added a check for compilation_config.cudagraph_mode to the conditional logic, ensuring that padding is applied only in FULL mode. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.17.0 - vLLM main: https://github.com/vllm-project/vllm/commit/8a680463fab3bc9e6760417cd5c0a6aa58283065 Co-authored-by: wangxiaoxin-sherie --- vllm_ascend/worker/model_runner_v1.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index d6389402..4d97c1a1 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -567,7 +567,8 @@ class NPUModelRunner(GPUModelRunner): """ # TODO: need refactor later, related to vllm PR #34043 this pr delete func # relax_for_mixed_batch_cudagraphs, num_reqs no longer equals the actual number of requests. - if cudagraph_runtime_mode == CUDAGraphMode.FULL: + if cudagraph_runtime_mode == CUDAGraphMode.FULL and \ + self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL: num_reqs_padded = num_reqs else: num_reqs_padded = batch_desc_num_reqs if batch_desc_num_reqs is not None else num_reqs