Enable cuda graph by default (#612)
@@ -98,7 +98,7 @@ class ModelTpServer:
         )
         self.max_total_num_tokens = self.model_runner.max_total_num_tokens
         self.max_prefill_tokens = (
-            4096
+            8192
             if server_args.max_prefill_tokens is None
             else server_args.max_prefill_tokens
         )
@@ -314,11 +314,9 @@ class ModelTpServer:
         self.forward_queue.append(req)

     def get_new_fill_batch(self) -> Optional[Batch]:
-        if (
-            self.running_batch is not None
-            and len(self.running_batch.reqs) > self.max_running_requests
-        ):
-            return None
+        running_bs = len(self.running_batch.reqs) if self.running_batch is not None else 0
+        if running_bs > self.max_running_requests:
+            return

         # Compute matched prefix length
         for req in self.forward_queue:
@@ -394,6 +392,10 @@ class ModelTpServer:
                 new_batch_input_tokens += req.extend_input_len
             else:
                 break
+
+            if running_bs + len(can_run_list) > self.max_running_requests:
+                break
+
         if len(can_run_list) == 0:
             return None
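
Below is a minimal, standalone sketch of the admission logic these hunks arrive at: count the requests already running, refuse new prefill work when the decode batch is full, and stop admitting once running plus newly admitted requests would exceed the cap. The Req class and pick_fill_batch helper are simplified stand-ins for illustration only, not SGLang's actual ModelTpServer code.

# Hedged illustration only: Req and pick_fill_batch are simplified stand-ins,
# not the real SGLang scheduler classes.
from typing import List, Optional


class Req:
    def __init__(self, extend_input_len: int):
        self.extend_input_len = extend_input_len


def pick_fill_batch(
    forward_queue: List[Req],
    running_bs: int,
    max_running_requests: int,
    max_prefill_tokens: int,
) -> Optional[List[Req]]:
    # Early return mirrors the rewritten check: if the running batch is
    # already over the cap, schedule no new prefill requests.
    if running_bs > max_running_requests:
        return None

    can_run_list: List[Req] = []
    new_batch_input_tokens = 0
    for req in forward_queue:
        if new_batch_input_tokens + req.extend_input_len > max_prefill_tokens:
            break
        can_run_list.append(req)
        new_batch_input_tokens += req.extend_input_len

        # Per-iteration cap added in the last hunk: running plus newly
        # admitted requests must stay within max_running_requests.
        if running_bs + len(can_run_list) > max_running_requests:
            break

    return can_run_list or None


if __name__ == "__main__":
    queue = [Req(1024), Req(2048), Req(4096), Req(512)]
    batch = pick_fill_batch(
        queue, running_bs=2, max_running_requests=4, max_prefill_tokens=8192
    )
    print(len(batch) if batch else 0)  # -> 3: the per-iteration cap stops the 4th request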