Improve the user control of new_token_ratio (#1811)

This commit is contained in:
Lianmin Zheng
2024-10-26 16:39:41 -07:00
committed by GitHub
parent 2b80978859
commit 86e0dde555
4 changed files with 32 additions and 17 deletions

View File

@@ -121,13 +121,13 @@ class CudaGraphRunner:
bs
for bs in self.capture_bs
if bs <= model_runner.req_to_token_pool.size
- and bs <= model_runner.server_args.max_cuda_graph_bs
+ and bs <= model_runner.server_args.cuda_graph_max_bs
]
self.compile_bs = (
[
bs
for bs in self.capture_bs
- if bs <= self.model_runner.server_args.max_torch_compile_bs
+ if bs <= self.model_runner.server_args.torch_compile_max_bs
]
if self.use_torch_compile
else []