Increase the thread limit for tp worker managers. (#567)

This commit is contained in:
Lianmin Zheng
2024-06-26 09:33:45 -07:00
committed by GitHub
parent a385ee27bd
commit 2e6e62e156
9 changed files with 148 additions and 84 deletions

View File

@@ -100,7 +100,7 @@ class ModelTpServer:
self.max_prefill_tokens = (
max(
self.model_config.context_len,
min(self.max_total_num_tokens // 6, 65536),
min(self.max_total_num_tokens // 6, 32768),
)
if server_args.max_prefill_tokens is None
else server_args.max_prefill_tokens