Enable CUDA graph by default (#612)
This commit is contained in:
@@ -29,7 +29,7 @@ class ServerArgs:
|
||||
max_prefill_tokens: Optional[int] = None
|
||||
max_running_requests: Optional[int] = None
|
||||
schedule_heuristic: str = "lpm"
|
||||
-schedule_conservativeness: float = 1.0
|
||||
+schedule_conservativeness: float = 0.8
|
||||
|
||||
# Other runtime options
|
||||
tp_size: int = 1
|
||||
@@ -68,13 +68,13 @@ class ServerArgs:
|
||||
self.tokenizer_path = self.model_path
|
||||
if self.mem_fraction_static is None:
|
||||
if self.tp_size >= 8:
|
||||
-self.mem_fraction_static = 0.80
|
||||
+self.mem_fraction_static = 0.78
|
||||
elif self.tp_size >= 4:
|
||||
-self.mem_fraction_static = 0.82
|
||||
+self.mem_fraction_static = 0.80
|
||||
elif self.tp_size >= 2:
|
||||
self.mem_fraction_static = 0.85
|
||||
else:
|
||||
-self.mem_fraction_static = 0.90
|
||||
+self.mem_fraction_static = 0.88
|
||||
if isinstance(self.additional_ports, int):
|
||||
self.additional_ports = [self.additional_ports]
|
||||
elif self.additional_ports is None:
|
||||
|
||||
Reference in New Issue
Block a user