From 0777e2f899f7fa8f4edb663629442246445c0d86 Mon Sep 17 00:00:00 2001 From: xuyexiong Date: Sat, 18 Oct 2025 16:42:17 +0800 Subject: [PATCH] Optimize torchair kv_consumer padding logic (#3526) ### What this PR does / why we need it? Optimize the torchair kv_consumer padding logic: apply padding only when speculative decoding is active. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: xuyexiong --- vllm_ascend/torchair/torchair_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py index 14a6f1e..2a5c513 100644 --- a/vllm_ascend/torchair/torchair_model_runner.py +++ b/vllm_ascend/torchair/torchair_model_runner.py @@ -85,7 +85,8 @@ class NPUTorchairModelRunner(NPUModelRunner): def _may_pad_kv_consumer_num_seq(self): # pd disaggregation scenario need redundant_batch_sizes to avoid each batch's seq_len exceed 16 tokens # self.max_num_reqs here is greater than the actual maximum request number - if self.is_kv_consumer: + if self.decode_token_per_req > 1 and self.is_kv_consumer: + # applied only when speculative decoding is active FIA_SEQ_LEN_LIMIT = 16 new_max_num_reqs = self.max_num_reqs + math.ceil( self.max_num_reqs / FIA_SEQ_LEN_LIMIT) + math.ceil(