[minor] Improve code style and compatibility (#1961)
This commit is contained in:
@@ -63,7 +63,7 @@ class ServerArgs:
|
||||
stream_interval: int = 1
|
||||
random_seed: Optional[int] = None
|
||||
constrained_json_whitespace_pattern: Optional[str] = None
|
||||
decode_log_interval: int = 40
|
||||
watchdog_timeout: float = 300
|
||||
|
||||
# Logging
|
||||
log_level: str = "info"
|
||||
@@ -71,18 +71,18 @@ class ServerArgs:
|
||||
log_requests: bool = False
|
||||
show_time_cost: bool = False
|
||||
enable_metrics: bool = False
|
||||
decode_log_interval: int = 40
|
||||
|
||||
# Other
|
||||
# API related
|
||||
api_key: Optional[str] = None
|
||||
file_storage_pth: str = "SGLang_storage"
|
||||
enable_cache_report: bool = False
|
||||
watchdog_timeout: float = 600
|
||||
|
||||
# Data parallelism
|
||||
dp_size: int = 1
|
||||
load_balance_method: str = "round_robin"
|
||||
|
||||
# Distributed args
|
||||
# Multi-node distributed serving
|
||||
dist_init_addr: Optional[str] = None
|
||||
nnodes: int = 1
|
||||
node_rank: int = 0
|
||||
@@ -128,6 +128,7 @@ class ServerArgs:
|
||||
enable_p2p_check: bool = False
|
||||
triton_attention_reduce_in_fp32: bool = False
|
||||
num_continuous_decode_steps: int = 1
|
||||
delete_ckpt_after_loading: bool = False
|
||||
|
||||
def __post_init__(self):
|
||||
# Set missing default values
|
||||
@@ -205,6 +206,7 @@ class ServerArgs:
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: argparse.ArgumentParser):
|
||||
# Model and port args
|
||||
parser.add_argument(
|
||||
"--model-path",
|
||||
type=str,
|
||||
@@ -324,6 +326,8 @@ class ServerArgs:
|
||||
action="store_true",
|
||||
help="Whether to use a CausalLM as an embedding model.",
|
||||
)
|
||||
|
||||
# Memory and scheduling
|
||||
parser.add_argument(
|
||||
"--mem-fraction-static",
|
||||
type=float,
|
||||
@@ -368,6 +372,8 @@ class ServerArgs:
|
||||
default=ServerArgs.schedule_conservativeness,
|
||||
help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
|
||||
)
|
||||
|
||||
# Other runtime options
|
||||
parser.add_argument(
|
||||
"--tensor-parallel-size",
|
||||
"--tp-size",
|
||||
@@ -393,6 +399,14 @@ class ServerArgs:
|
||||
default=ServerArgs.constrained_json_whitespace_pattern,
|
||||
help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--watchdog-timeout",
|
||||
type=float,
|
||||
default=ServerArgs.watchdog_timeout,
|
||||
help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
|
||||
)
|
||||
|
||||
# Logging
|
||||
parser.add_argument(
|
||||
"--log-level",
|
||||
type=str,
|
||||
@@ -420,7 +434,14 @@ class ServerArgs:
|
||||
action="store_true",
|
||||
help="Enable log prometheus metrics.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--decode-log-interval",
|
||||
type=int,
|
||||
default=ServerArgs.decode_log_interval,
|
||||
help="The log interval of decode batch",
|
||||
)
|
||||
|
||||
# API related
|
||||
parser.add_argument(
|
||||
"--api-key",
|
||||
type=str,
|
||||
@@ -438,18 +459,6 @@ class ServerArgs:
|
||||
action="store_true",
|
||||
help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--watchdog-timeout",
|
||||
type=float,
|
||||
default=ServerArgs.watchdog_timeout,
|
||||
help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--decode-log-interval",
|
||||
type=int,
|
||||
default=ServerArgs.decode_log_interval,
|
||||
help="The log interval of decode batch",
|
||||
)
|
||||
|
||||
# Data parallelism
|
||||
parser.add_argument(
|
||||
@@ -470,7 +479,7 @@ class ServerArgs:
|
||||
],
|
||||
)
|
||||
|
||||
# Multi-node distributed serving args
|
||||
# Multi-node distributed serving
|
||||
parser.add_argument(
|
||||
"--dist-init-addr",
|
||||
"--nccl-init-addr", # For backward compatbility. This will be removed in the future.
|
||||
@@ -677,6 +686,12 @@ class ServerArgs:
|
||||
"This can potentially increase throughput but may also increase time-to-first-token latency. "
|
||||
"The default value is 1, meaning only run one decoding step at a time.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--delete-ckpt-after-loading",
|
||||
default=ServerArgs.delete_ckpt_after_loading,
|
||||
action="store_true",
|
||||
help="Delete the model checkpoint after loading the model.",
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_cli_args(cls, args: argparse.Namespace):
|
||||
|
||||
Reference in New Issue
Block a user