Enable cuda graph by default (#612)

This commit is contained in:
Lianmin Zheng
2024-07-13 05:29:46 -07:00
committed by GitHub
parent 396a69240f
commit 665815969a
10 changed files with 331 additions and 84 deletions

View File

@@ -29,7 +29,7 @@ class ServerArgs:
max_prefill_tokens: Optional[int] = None
max_running_requests: Optional[int] = None
schedule_heuristic: str = "lpm"
-schedule_conservativeness: float = 1.0
+schedule_conservativeness: float = 0.8
# Other runtime options
tp_size: int = 1
@@ -68,13 +68,13 @@ class ServerArgs:
self.tokenizer_path = self.model_path
if self.mem_fraction_static is None:
if self.tp_size >= 8:
-self.mem_fraction_static = 0.80
+self.mem_fraction_static = 0.78
elif self.tp_size >= 4:
-self.mem_fraction_static = 0.82
+self.mem_fraction_static = 0.80
elif self.tp_size >= 2:
self.mem_fraction_static = 0.85
else:
-self.mem_fraction_static = 0.90
+self.mem_fraction_static = 0.88
if isinstance(self.additional_ports, int):
self.additional_ports = [self.additional_ports]
elif self.additional_ports is None: