### What this PR does / why we need it?

Caps the calculated maximum number of tokens at 512. This prevents allocating an excessively large buffer when a cudagraph capture size is not specified, mitigating the risk of out-of-memory errors.

### Does this PR introduce _any_ user-facing change?

None.

### How was this patch tested?

None.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
This commit is contained in:
@@ -543,7 +543,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         if self.compilation_config.cudagraph_capture_sizes:
             max_num_tokens = self.compilation_config.cudagraph_capture_sizes[0]
         else:
-            max_num_tokens = self.max_num_reqs * self.uniform_decode_query_len
+            # NOTE: To save memory, we cap the max number of tokens to 512.
+            max_num_tokens = min(
+                self.max_num_reqs * self.uniform_decode_query_len, 512)
         tp_size = self.parallel_config.tensor_parallel_size
         # Use integer arithmetic for ceiling division.
         num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size
|||||||
Reference in New Issue
Block a user