"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""The arguments of the server."""
import argparse
import dataclasses
import random
from typing import List, Optional, Union

@dataclasses.dataclass
class ServerArgs:
    """The arguments of the SGLang server.

    Defaults left at ``None`` are resolved in ``__post_init__`` (e.g. the
    tokenizer path falls back to the model path, and the static memory
    fraction is chosen from the tensor-parallel size).
    """

    # Model and tokenizer
    model_path: str
    tokenizer_path: Optional[str] = None
    tokenizer_mode: str = "auto"
    load_format: str = "auto"
    dtype: str = "auto"
    trust_remote_code: bool = True
    context_length: Optional[int] = None
    quantization: Optional[str] = None
    chat_template: Optional[str] = None

    # Port
    host: str = "127.0.0.1"
    port: int = 30000
    additional_ports: Optional[Union[List[int], int]] = None

    # Memory and scheduling
    mem_fraction_static: Optional[float] = None
    max_prefill_tokens: Optional[int] = None
    max_running_requests: Optional[int] = None
    max_num_reqs: Optional[int] = None
    schedule_policy: str = "lpm"
    schedule_conservativeness: float = 1.0

    # Other runtime options
    tp_size: int = 1
    stream_interval: int = 1
    random_seed: Optional[int] = None

    # Logging
    log_level: str = "info"
    log_level_http: Optional[str] = None
    log_requests: bool = False
    show_time_cost: bool = False

    # Other
    api_key: str = ""
    file_storage_pth: str = "SGlang_storage"

    # Data parallelism
    dp_size: int = 1
    load_balance_method: str = "round_robin"

    # Chunked Prefill
    chunked_prefill_size: Optional[int] = None

    # Optimization/debug options
    disable_flashinfer: bool = False
    disable_flashinfer_sampling: bool = False
    disable_radix_cache: bool = False
    disable_regex_jump_forward: bool = False
    disable_cuda_graph: bool = False
    disable_disk_cache: bool = False
    enable_torch_compile: bool = False
    enable_p2p_check: bool = False
    attention_reduce_in_fp32: bool = False
    efficient_weight_load: bool = False

    # Distributed args
    nccl_init_addr: Optional[str] = None
    nnodes: int = 1
    node_rank: Optional[int] = None

    def __post_init__(self):
        """Resolve defaults that depend on other fields."""
        if self.tokenizer_path is None:
            self.tokenizer_path = self.model_path

        if self.mem_fraction_static is None:
            # Reserve a smaller static fraction as tp_size grows
            # (presumably to leave headroom at larger world sizes — the
            # exact numbers are tuned constants).
            if self.tp_size >= 16:
                self.mem_fraction_static = 0.79
            elif self.tp_size >= 8:
                self.mem_fraction_static = 0.83
            elif self.tp_size >= 4:
                self.mem_fraction_static = 0.85
            elif self.tp_size >= 2:
                self.mem_fraction_static = 0.87
            else:
                self.mem_fraction_static = 0.88

        # Normalize additional_ports so downstream code always sees a list.
        if isinstance(self.additional_ports, int):
            self.additional_ports = [self.additional_ports]
        elif self.additional_ports is None:
            self.additional_ports = []

        if self.random_seed is None:
            self.random_seed = random.randint(0, 1 << 30)

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        """Register every ServerArgs field as a CLI flag on *parser*.

        Defaults are read from the dataclass so the two stay in sync.
        """
        parser.add_argument(
            "--model-path",
            type=str,
            help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
            required=True,
        )
        parser.add_argument(
            "--tokenizer-path",
            type=str,
            default=ServerArgs.tokenizer_path,
            help="The path of the tokenizer.",
        )
        parser.add_argument(
            "--host", type=str, default=ServerArgs.host, help="The host of the server."
        )
        parser.add_argument(
            "--port", type=int, default=ServerArgs.port, help="The port of the server."
        )
        parser.add_argument(
            "--additional-ports",
            type=int,
            nargs="*",
            default=[],
            help="The additional ports specified for the server.",
        )
        parser.add_argument(
            "--tokenizer-mode",
            type=str,
            default=ServerArgs.tokenizer_mode,
            choices=["auto", "slow"],
            help="Tokenizer mode. 'auto' will use the fast "
            "tokenizer if available, and 'slow' will "
            "always use the slow tokenizer.",
        )
        parser.add_argument(
            "--load-format",
            type=str,
            default=ServerArgs.load_format,
            choices=["auto", "pt", "safetensors", "npcache", "dummy"],
            help="The format of the model weights to load. "
            '"auto" will try to load the weights in the safetensors format '
            "and fall back to the pytorch bin format if safetensors format "
            "is not available. "
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            "a numpy cache to speed up the loading. "
            '"dummy" will initialize the weights with random values, '
            "which is mainly for profiling.",
        )
        parser.add_argument(
            "--dtype",
            type=str,
            default=ServerArgs.dtype,
            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
            help="Data type for model weights and activations.\n\n"
            '* "auto" will use FP16 precision for FP32 and FP16 models, and '
            "BF16 precision for BF16 models.\n"
            '* "half" for FP16. Recommended for AWQ quantization.\n'
            '* "float16" is the same as "half".\n'
            '* "bfloat16" for a balance between precision and range.\n'
            '* "float" is shorthand for FP32 precision.\n'
            '* "float32" for FP32 precision.',
        )
        parser.add_argument(
            "--trust-remote-code",
            action="store_true",
            help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
        )
        parser.add_argument(
            "--context-length",
            type=int,
            default=ServerArgs.context_length,
            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
        )
        parser.add_argument(
            "--quantization",
            type=str,
            default=ServerArgs.quantization,
            choices=[
                "awq",
                "fp8",
                "gptq",
                "marlin",
                "gptq_marlin",
                "awq_marlin",
                "squeezellm",
                "bitsandbytes",
            ],
            help="The quantization method.",
        )
        parser.add_argument(
            "--chat-template",
            type=str,
            default=ServerArgs.chat_template,
            help="The builtin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
        )
        parser.add_argument(
            "--mem-fraction-static",
            type=float,
            default=ServerArgs.mem_fraction_static,
            help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
        )
        parser.add_argument(
            "--max-prefill-tokens",
            type=int,
            default=ServerArgs.max_prefill_tokens,
            help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
        )
        parser.add_argument(
            "--max-running-requests",
            type=int,
            default=ServerArgs.max_running_requests,
            help="The maximum number of running requests.",
        )
        parser.add_argument(
            "--max-num-reqs",
            type=int,
            default=ServerArgs.max_num_reqs,
            help="The maximum number of requests to serve in the memory pool. If the model has a large context length, you may need to decrease this value to avoid out-of-memory errors.",
        )
        parser.add_argument(
            "--schedule-policy",
            type=str,
            default=ServerArgs.schedule_policy,
            choices=["lpm", "random", "fcfs", "dfs-weight"],
            help="The scheduling policy of the requests.",
        )
        parser.add_argument(
            "--schedule-conservativeness",
            type=float,
            default=ServerArgs.schedule_conservativeness,
            help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
        )
        parser.add_argument(
            "--tp-size",
            type=int,
            default=ServerArgs.tp_size,
            help="The tensor parallelism size.",
        )
        parser.add_argument(
            "--stream-interval",
            type=int,
            default=ServerArgs.stream_interval,
            help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
        )
        parser.add_argument(
            "--random-seed",
            type=int,
            default=ServerArgs.random_seed,
            help="The random seed.",
        )
        parser.add_argument(
            "--log-level",
            type=str,
            default=ServerArgs.log_level,
            help="The logging level of all loggers.",
        )
        parser.add_argument(
            "--log-level-http",
            type=str,
            default=ServerArgs.log_level_http,
            help="The logging level of HTTP server. If not set, reuse --log-level by default.",
        )
        parser.add_argument(
            "--log-requests",
            action="store_true",
            help="Log the inputs and outputs of all requests.",
        )
        parser.add_argument(
            "--show-time-cost",
            action="store_true",
            help="Show time cost of custom marks.",
        )
        parser.add_argument(
            "--api-key",
            type=str,
            default=ServerArgs.api_key,
            help="Set API key of the server.",
        )
        parser.add_argument(
            "--file-storage-pth",
            type=str,
            default=ServerArgs.file_storage_pth,
            help="The path of the file storage in backend.",
        )

        # Data parallelism
        parser.add_argument(
            "--dp-size",
            type=int,
            default=ServerArgs.dp_size,
            help="The data parallelism size.",
        )
        parser.add_argument(
            "--load-balance-method",
            type=str,
            default=ServerArgs.load_balance_method,
            help="The load balancing strategy for data parallelism.",
            choices=[
                "round_robin",
                "shortest_queue",
            ],
        )

        # Multi-node distributed serving args
        parser.add_argument(
            "--nccl-init-addr",
            type=str,
            help="The nccl init address of multi-node server.",
        )
        parser.add_argument(
            "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
        )
        parser.add_argument("--node-rank", type=int, help="The node rank.")

        # Chunked prefill
        parser.add_argument(
            "--chunked-prefill-size",
            type=int,
            default=ServerArgs.chunked_prefill_size,
            help="The size of the chunked prefill.",
        )

        # Optimization/debug options
        parser.add_argument(
            "--disable-flashinfer",
            action="store_true",
            help="Disable flashinfer attention kernels.",
        )
        parser.add_argument(
            "--disable-flashinfer-sampling",
            action="store_true",
            help="Disable flashinfer sampling kernels.",
        )
        parser.add_argument(
            "--disable-radix-cache",
            action="store_true",
            help="Disable RadixAttention for prefix caching.",
        )
        parser.add_argument(
            "--disable-regex-jump-forward",
            action="store_true",
            help="Disable regex jump-forward.",
        )
        parser.add_argument(
            "--disable-cuda-graph",
            action="store_true",
            help="Disable cuda graph.",
        )
        parser.add_argument(
            "--disable-disk-cache",
            action="store_true",
            help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
        )
        parser.add_argument(
            "--enable-torch-compile",
            action="store_true",
            help="Optimize the model with torch.compile, experimental feature.",
        )
        parser.add_argument(
            "--enable-p2p-check",
            action="store_true",
            help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
        )
        parser.add_argument(
            "--attention-reduce-in-fp32",
            action="store_true",
            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16. "
            "This only affects Triton attention kernels",
        )
        parser.add_argument(
            "--efficient-weight-load",
            action="store_true",
            help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        """Build a ServerArgs from a parsed argparse namespace.

        Every dataclass field is read from the namespace by name, so the
        namespace must come from a parser populated by ``add_cli_args``.
        """
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        return cls(**{attr: getattr(args, attr) for attr in attrs})

    def url(self):
        """Return the base HTTP URL of the server."""
        return f"http://{self.host}:{self.port}"

    def print_mode_args(self):
        """Return a one-line summary of the optimization/debug toggles."""
        return (
            f"disable_flashinfer={self.disable_flashinfer}, "
            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
            f"disable_radix_cache={self.disable_radix_cache}, "
            f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
            f"disable_disk_cache={self.disable_disk_cache}, "
        )

    def check_server_args(self):
        """Validate cross-field constraints; raises AssertionError on conflict."""
        assert (
            self.tp_size % self.nnodes == 0
        ), "tp_size must be divisible by number of nodes"
        assert not (
            self.dp_size > 1 and self.node_rank is not None
        ), "multi-node data parallel is not supported"
        assert not (
            self.chunked_prefill_size is not None and self.disable_radix_cache
        ), "chunked prefill is not supported with radix cache disabled currently"
@dataclasses.dataclass
class PortArgs:
    """The ports used by the server's internal components."""

    # Port of the tokenizer process.
    tokenizer_port: int
    # Port of the controller process.
    controller_port: int
    # Port of the detokenizer process.
    detokenizer_port: int
    # Ports reserved for NCCL initialization (one per parallel group).
    nccl_ports: List[int]