From ff3914e31a4a44b89bde0ebb5ad6bd1fb2ab4df8 Mon Sep 17 00:00:00 2001
From: Yizhou <136800916+yiz-liu@users.noreply.github.com>
Date: Thu, 18 Dec 2025 21:09:23 +0800
Subject: [PATCH] [Fix] Refines decode mode padding condition for uniform
 queries (#5164)

### What this PR does / why we need it?
We cannot use `self.cudagraph_batch_sizes[-1]` as the upper bound because it is not the maximum number of tokens to pad to in `FULL_DECODE_ONLY` mode; it can be much larger. It is only trimmed down to `compilation_cases` right before capture, and this has caused us a lot of trouble.

This PR updates the logic so that padding occurs only when the number of input tokens falls within the valid uniform decode query range, improving consistency and avoiding unnecessary padding in specific decode modes.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None.

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: Yizhou Liu
---
 vllm_ascend/worker/model_runner_v1.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 4ab4f06d..9ab91214 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1008,8 +1008,9 @@ class NPUModelRunner(GPUModelRunner):
 
         # TODO: We should make this official ASAP. Also note that if we pad here,
         # the builders won’t need to add any extra padding.
+        max_decode_tokens = self.scheduler_config.max_num_seqs * self.uniform_decode_query_len
         if self.compilation_config.cudagraph_mode.decode_mode() == CUDAGraphMode.FULL and \
-                uniform_decode and num_input_tokens <= self.cudagraph_batch_sizes[-1]:
+                uniform_decode and self.uniform_decode_query_len <= num_input_tokens <= max_decode_tokens:
             num_reqs_padded = num_input_tokens // self.uniform_decode_query_len
             pad_size = num_reqs_padded - num_reqs
             if pad_size > 0:
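
For reference, a minimal sketch of the new bound check. The variable names follow the diff above, but the standalone helper function itself is hypothetical and not part of the actual runner:

```python
# Illustrative sketch of the padding condition introduced by this patch,
# assuming the names used in the diff above; not vLLM-Ascend API.

def should_pad_uniform_decode(num_input_tokens: int,
                              uniform_decode_query_len: int,
                              max_num_seqs: int,
                              full_decode_graph: bool,
                              uniform_decode: bool) -> bool:
    """Pad only when the token count lies in the valid uniform decode range."""
    # Upper bound: each request contributes exactly uniform_decode_query_len
    # tokens, so the largest uniform decode batch is max_num_seqs * that length
    # (rather than cudagraph_batch_sizes[-1], which can be much larger).
    max_decode_tokens = max_num_seqs * uniform_decode_query_len
    return (full_decode_graph and uniform_decode
            and uniform_decode_query_len <= num_input_tokens <= max_decode_tokens)

# Example: with max_num_seqs=4 and uniform_decode_query_len=2, batches of
# 2..8 tokens are padded; 9 tokens exceeds the largest uniform decode batch.
assert should_pad_uniform_decode(6, 2, 4, True, True) is True
assert should_pad_uniform_decode(9, 2, 4, True, True) is False
```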