From 1bc61031e580ecafc4e95d779cdf5ff747909c71 Mon Sep 17 00:00:00 2001
From: Yizhou <136800916+yiz-liu@users.noreply.github.com>
Date: Sat, 25 Oct 2025 15:46:56 +0800
Subject: [PATCH] [v0.11.0][Fix] Cap max tokens to prevent potential OOM
 (#3720) (#3744)

### What this PR does / why we need it?
Caps the calculated maximum number of tokens at 512.

This prevents allocating an excessively large buffer when a cudagraph
capture size is not specified, mitigating the risk of out-of-memory
errors.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
---
 vllm_ascend/worker/model_runner_v1.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 0bfe0f8..cd98c60 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -543,7 +543,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         if self.compilation_config.cudagraph_capture_sizes:
             max_num_tokens = self.compilation_config.cudagraph_capture_sizes[0]
         else:
-            max_num_tokens = self.max_num_reqs * self.uniform_decode_query_len
+            # NOTE: To save memory, we cap the max number of tokens to 512.
+            max_num_tokens = min(
+                self.max_num_reqs * self.uniform_decode_query_len, 512)
         tp_size = self.parallel_config.tensor_parallel_size
         # Use integer arithmetic for ceiling division.
         num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size