From ec983202850ed396a116373bf8b7ecb9c12dd440 Mon Sep 17 00:00:00 2001
From: zouyida2052
Date: Mon, 3 Nov 2025 14:17:51 +0800
Subject: [PATCH] Fix the value of max_num_tokens (#3933)

### What this PR does / why we need it?
Fix the value of `max_num_tokens` used during torchair graph capture: it was
mistakenly assigned the tensor-parallel size (the same value as the `tp_size`
assignment on the next line) instead of
`self.max_num_reqs * self.uniform_decode_query_len`, the per-graph token bound
described in the NOTE above the assignment. A standalone sketch of the
corrected computation follows the diff below.

- vLLM version: v0.11.0
- vLLM main: https://github.com/vllm-project/vllm/commit/83f478bb19489b41e9d208b47b4bb5a95ac171ac

Signed-off-by: zouyida2052
---
 vllm_ascend/torchair/torchair_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py
index fda33df1..6dd424c8 100644
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -117,7 +117,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
         # NOTE: To be clear, we need to make sure that during graph capture, the number of
         # tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
         # the max number of tokens in graph is min(max_num_seqs * uniform_decode_query_len, 512).
-        max_num_tokens = self.parallel_config.tensor_parallel_size
+        max_num_tokens = self.max_num_reqs * self.uniform_decode_query_len
         tp_size = self.parallel_config.tensor_parallel_size
         # Use integer arithmetic for ceiling division.
         max_graph_batch_size = self.calculate_new_torchair_graph_batch_size(
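
For reviewers, here is a minimal standalone sketch of the computation this
patch corrects, using the names that appear in the diff (`max_num_reqs`,
`uniform_decode_query_len`, `tp_size`). The 512 cap is quoted from the NOTE
comment; modeling `calculate_new_torchair_graph_batch_size` as a plain integer
ceiling division over `tp_size` is an assumption, since that helper's body is
outside this hunk.

```python
# Standalone sketch, not the actual runner code. Assumptions: the 512 cap
# mirrors the min(max_num_seqs * uniform_decode_query_len, 512) bound quoted
# from _set_cudagraph_sizes, and calculate_new_torchair_graph_batch_size is
# approximated by a ceiling division over the tensor-parallel size.

def graph_capture_sizes(max_num_reqs: int,
                        uniform_decode_query_len: int,
                        tp_size: int) -> tuple[int, int]:
    """Return (max_num_tokens, max_graph_batch_size) for graph capture."""
    # Before this patch, max_num_tokens was mistakenly set to tp_size.
    max_num_tokens = max_num_reqs * uniform_decode_query_len
    # Per the NOTE: tokens captured per graph are bounded by
    # min(max_num_seqs * uniform_decode_query_len, 512).
    max_num_tokens = min(max_num_tokens, 512)
    # Integer ceiling division, matching the "integer arithmetic" comment,
    # avoids float rounding: (a + b - 1) // b == ceil(a / b) for positive ints.
    max_graph_batch_size = (max_num_tokens + tp_size - 1) // tp_size
    return max_num_tokens, max_graph_batch_size

# Example: 64 requests decoding 1 token each on TP=4 ->
# max_num_tokens=64, max_graph_batch_size=16.
print(graph_capture_sizes(64, 1, 4))
```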