From 1f25d60870f2ab091b4774914ee9ebe64898b679 Mon Sep 17 00:00:00 2001
From: Yizhou <136800916+yiz-liu@users.noreply.github.com>
Date: Sat, 25 Oct 2025 11:23:21 +0800
Subject: [PATCH] [Fix] Cap max tokens to prevent potential OOM (#3720)

### What this PR does / why we need it?
Caps the calculated maximum number of tokens at 512.

This prevents allocating an excessively large buffer when a cudagraph capture size is not specified, mitigating the risk of out-of-memory errors.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/17c540a993af88204ad1b78345c8a865cf58ce44

Signed-off-by: Yizhou Liu
---
 vllm_ascend/worker/model_runner_v1.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 8fe6df2a..1cc93533 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -586,7 +586,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         if self.compilation_config.cudagraph_capture_sizes:
             max_num_tokens = self.compilation_config.cudagraph_capture_sizes[0]
         else:
-            max_num_tokens = self.max_num_reqs * self.uniform_decode_query_len
+            # NOTE: To save memory, we cap the max number of tokens to 512.
+            max_num_tokens = min(
+                self.max_num_reqs * self.uniform_decode_query_len, 512)
         tp_size = self.parallel_config.tensor_parallel_size
         # Use integer arithmetic for ceiling division.
         num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size