Clean up readme and arguments of chunked prefill (#1022)

Author: Lianmin Zheng
Date: 2024-08-11 01:18:52 -07:00
Committed by: GitHub
Parent: 33d61356b8
Commit: a97df79124
3 changed files with 30 additions and 34 deletions

View File

@@ -118,11 +118,7 @@ class ModelTpServer:
             trust_remote_code=server_args.trust_remote_code,
         )
         self.max_total_num_tokens = self.model_runner.max_total_num_tokens
-        self.max_prefill_tokens = (
-            16384
-            if server_args.max_prefill_tokens is None
-            else server_args.max_prefill_tokens
-        )
+        self.max_prefill_tokens = server_args.max_prefill_tokens
         self.max_running_requests = min(
             (
                 self.max_total_num_tokens // 2
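
With the default moved onto the ServerArgs dataclass (next file), the server no longer needs an inline None fallback. A minimal sketch of the pattern, using stand-in names rather than the real sglang classes:

from dataclasses import dataclass

@dataclass
class Args:
    # The default now lives on the dataclass field itself.
    max_prefill_tokens: int = 16384

class Server:
    def __init__(self, args: Args):
        # No "16384 if args.max_prefill_tokens is None else ..." dance needed.
        self.max_prefill_tokens = args.max_prefill_tokens

assert Server(Args()).max_prefill_tokens == 16384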

View File

@@ -43,10 +43,11 @@ class ServerArgs:
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
-    max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     max_num_reqs: Optional[int] = None
     max_total_tokens: Optional[int] = None
+    chunked_prefill_size: int = -1
+    max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
@@ -69,9 +70,6 @@
     dp_size: int = 1
     load_balance_method: str = "round_robin"
-    # Chunked Prefill
-    chunked_prefill_size: Optional[int] = None
-
     # Optimization/debug options
     disable_flashinfer: bool = False
     disable_flashinfer_sampling: bool = False
@@ -97,6 +95,10 @@
         if self.served_model_name is None:
             self.served_model_name = self.model_path
+        if self.chunked_prefill_size <= 0:
+            # Disable chunked prefill
+            self.chunked_prefill_size = None
+
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
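
The new __post_init__ branch turns the -1 CLI sentinel into None, so downstream code checks "is None" instead of comparing against magic values. A self-contained sketch of that convention (stand-in class, not the sglang source):

from dataclasses import dataclass

@dataclass
class Cfg:
    # -1 (or any value <= 0) means "chunked prefill disabled".
    chunked_prefill_size: int = -1

    def __post_init__(self):
        if self.chunked_prefill_size <= 0:
            # Normalize the sentinel once so consumers can test "is None".
            self.chunked_prefill_size = None

assert Cfg().chunked_prefill_size is None
assert Cfg(chunked_prefill_size=8192).chunked_prefill_size == 8192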
@@ -108,6 +110,7 @@
                 self.mem_fraction_static = 0.87
             else:
                 self.mem_fraction_static = 0.88
+
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
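
The trailing context also shows additional_ports being normalized, though the elif body is cut off in this hunk. A sketch of the visible pattern, with an assumed empty-list default for the None branch:

def normalize_additional_ports(additional_ports):
    # A bare int becomes a one-element list.
    if isinstance(additional_ports, int):
        return [additional_ports]
    # Assumption: the real elif-None branch (truncated above) fills in a default.
    if additional_ports is None:
        return []
    return additional_ports

assert normalize_additional_ports(30001) == [30001]
assert normalize_additional_ports(None) == []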
@@ -232,12 +235,6 @@
             default=ServerArgs.mem_fraction_static,
             help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
         )
-        parser.add_argument(
-            "--max-prefill-tokens",
-            type=int,
-            default=ServerArgs.max_prefill_tokens,
-            help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
-        )
         parser.add_argument(
             "--max-running-requests",
             type=int,
@@ -256,6 +253,18 @@
             default=ServerArgs.max_total_tokens,
             help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
         )
+        parser.add_argument(
+            "--chunked-prefill-size",
+            type=int,
+            default=ServerArgs.chunked_prefill_size,
+            help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill",
+        )
+        parser.add_argument(
+            "--max-prefill-tokens",
+            type=int,
+            default=ServerArgs.max_prefill_tokens,
+            help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
+        )
         parser.add_argument(
             "--schedule-policy",
             type=str,
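
The two flags now sit side by side in the memory-and-scheduling group. A quick standalone check of how they parse; a hedged reconstruction whose defaults mirror the dataclass fields above, not an import of the real parser:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--chunked-prefill-size", type=int, default=-1,
    help="Max tokens per chunk for chunked prefill; -1 disables it.",
)
parser.add_argument(
    "--max-prefill-tokens", type=int, default=16384,
    help="Max tokens in a prefill batch.",
)

args = parser.parse_args(["--chunked-prefill-size", "4096"])
assert args.chunked_prefill_size == 4096
assert args.max_prefill_tokens == 16384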
@@ -353,14 +362,6 @@
         )
         parser.add_argument("--node-rank", type=int, help="The node rank.")
-        # Chunked prefill
-        parser.add_argument(
-            "--chunked-prefill-size",
-            type=int,
-            default=ServerArgs.chunked_prefill_size,
-            help="The size of the chunked prefill.",
-        )
-
         # Optimization/debug options
         parser.add_argument(
             "--disable-flashinfer",