Clean up readme and arguments of chunked prefill (#1022)
This commit is contained in:
@@ -118,11 +118,7 @@ class ModelTpServer:
|
||||
trust_remote_code=server_args.trust_remote_code,
|
||||
)
|
||||
self.max_total_num_tokens = self.model_runner.max_total_num_tokens
|
||||
self.max_prefill_tokens = (
|
||||
16384
|
||||
if server_args.max_prefill_tokens is None
|
||||
else server_args.max_prefill_tokens
|
||||
)
|
||||
self.max_prefill_tokens = server_args.max_prefill_tokens
|
||||
self.max_running_requests = min(
|
||||
(
|
||||
self.max_total_num_tokens // 2
|
||||
|
||||
@@ -43,10 +43,11 @@ class ServerArgs:
|
||||
|
||||
# Memory and scheduling
|
||||
mem_fraction_static: Optional[float] = None
|
||||
max_prefill_tokens: Optional[int] = None
|
||||
max_running_requests: Optional[int] = None
|
||||
max_num_reqs: Optional[int] = None
|
||||
max_total_tokens: Optional[int] = None
|
||||
chunked_prefill_size: int = -1
|
||||
max_prefill_tokens: int = 16384
|
||||
schedule_policy: str = "lpm"
|
||||
schedule_conservativeness: float = 1.0
|
||||
|
||||
@@ -69,9 +70,6 @@ class ServerArgs:
|
||||
dp_size: int = 1
|
||||
load_balance_method: str = "round_robin"
|
||||
|
||||
# Chunked Prefill
|
||||
chunked_prefill_size: Optional[int] = None
|
||||
|
||||
# Optimization/debug options
|
||||
disable_flashinfer: bool = False
|
||||
disable_flashinfer_sampling: bool = False
|
||||
@@ -97,6 +95,10 @@ class ServerArgs:
|
||||
if self.served_model_name is None:
|
||||
self.served_model_name = self.model_path
|
||||
|
||||
if self.chunked_prefill_size <= 0:
|
||||
# Disable chunked prefill
|
||||
self.chunked_prefill_size = None
|
||||
|
||||
if self.mem_fraction_static is None:
|
||||
if self.tp_size >= 16:
|
||||
self.mem_fraction_static = 0.79
|
||||
@@ -108,6 +110,7 @@ class ServerArgs:
|
||||
self.mem_fraction_static = 0.87
|
||||
else:
|
||||
self.mem_fraction_static = 0.88
|
||||
|
||||
if isinstance(self.additional_ports, int):
|
||||
self.additional_ports = [self.additional_ports]
|
||||
elif self.additional_ports is None:
|
||||
@@ -232,12 +235,6 @@ class ServerArgs:
|
||||
default=ServerArgs.mem_fraction_static,
|
||||
help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-prefill-tokens",
|
||||
type=int,
|
||||
default=ServerArgs.max_prefill_tokens,
|
||||
help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-running-requests",
|
||||
type=int,
|
||||
@@ -256,6 +253,18 @@ class ServerArgs:
|
||||
default=ServerArgs.max_total_tokens,
|
||||
help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chunked-prefill-size",
|
||||
type=int,
|
||||
default=ServerArgs.chunked_prefill_size,
|
||||
help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-prefill-tokens",
|
||||
type=int,
|
||||
default=ServerArgs.max_prefill_tokens,
|
||||
help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--schedule-policy",
|
||||
type=str,
|
||||
@@ -353,14 +362,6 @@ class ServerArgs:
|
||||
)
|
||||
parser.add_argument("--node-rank", type=int, help="The node rank.")
|
||||
|
||||
# Chunked prefill
|
||||
parser.add_argument(
|
||||
"--chunked-prefill-size",
|
||||
type=int,
|
||||
default=ServerArgs.chunked_prefill_size,
|
||||
help="The size of the chunked prefill.",
|
||||
)
|
||||
|
||||
# Optimization/debug options
|
||||
parser.add_argument(
|
||||
"--disable-flashinfer",
|
||||
|
||||
Reference in New Issue
Block a user