Enable cuda graph by default (#612)

This commit is contained in:
Lianmin Zheng
2024-07-13 05:29:46 -07:00
committed by GitHub
parent 396a69240f
commit 665815969a
10 changed files with 331 additions and 84 deletions

View File

@@ -98,7 +98,7 @@ class ModelTpServer:
)
self.max_total_num_tokens = self.model_runner.max_total_num_tokens
self.max_prefill_tokens = (
- 4096
+ 8192
if server_args.max_prefill_tokens is None
else server_args.max_prefill_tokens
)
@@ -314,11 +314,9 @@ class ModelTpServer:
self.forward_queue.append(req)
def get_new_fill_batch(self) -> Optional[Batch]:
- if (
- self.running_batch is not None
- and len(self.running_batch.reqs) > self.max_running_requests
- ):
- return None
+ running_bs = len(self.running_batch.reqs) if self.running_batch is not None else 0
+ if running_bs > self.max_running_requests:
+ return
# Compute matched prefix length
for req in self.forward_queue:
@@ -394,6 +392,10 @@ class ModelTpServer:
new_batch_input_tokens += req.extend_input_len
else:
break
+ if running_bs + len(can_run_list) > self.max_running_requests:
+ break
if len(can_run_list) == 0:
return None