[minor] Improve code style and compatibility (#1961)

This commit is contained in:
Lianmin Zheng
2024-11-08 02:19:41 -08:00
committed by GitHub
parent 7ef0084b0d
commit a509552087
6 changed files with 109 additions and 35 deletions

View File

@@ -63,7 +63,7 @@ class ServerArgs:
stream_interval: int = 1
random_seed: Optional[int] = None
constrained_json_whitespace_pattern: Optional[str] = None
decode_log_interval: int = 40
watchdog_timeout: float = 300
# Logging
log_level: str = "info"
@@ -71,18 +71,18 @@ class ServerArgs:
log_requests: bool = False
show_time_cost: bool = False
enable_metrics: bool = False
decode_log_interval: int = 40
# Other
# API related
api_key: Optional[str] = None
file_storage_pth: str = "SGLang_storage"
enable_cache_report: bool = False
watchdog_timeout: float = 600
# Data parallelism
dp_size: int = 1
load_balance_method: str = "round_robin"
# Distributed args
# Multi-node distributed serving
dist_init_addr: Optional[str] = None
nnodes: int = 1
node_rank: int = 0
@@ -128,6 +128,7 @@ class ServerArgs:
enable_p2p_check: bool = False
triton_attention_reduce_in_fp32: bool = False
num_continuous_decode_steps: int = 1
delete_ckpt_after_loading: bool = False
def __post_init__(self):
# Set missing default values
@@ -205,6 +206,7 @@ class ServerArgs:
@staticmethod
def add_cli_args(parser: argparse.ArgumentParser):
# Model and port args
parser.add_argument(
"--model-path",
type=str,
@@ -324,6 +326,8 @@ class ServerArgs:
action="store_true",
help="Whether to use a CausalLM as an embedding model.",
)
# Memory and scheduling
parser.add_argument(
"--mem-fraction-static",
type=float,
@@ -368,6 +372,8 @@ class ServerArgs:
default=ServerArgs.schedule_conservativeness,
help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
)
# Other runtime options
parser.add_argument(
"--tensor-parallel-size",
"--tp-size",
@@ -393,6 +399,14 @@ class ServerArgs:
default=ServerArgs.constrained_json_whitespace_pattern,
help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
)
parser.add_argument(
"--watchdog-timeout",
type=float,
default=ServerArgs.watchdog_timeout,
help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
)
# Logging
parser.add_argument(
"--log-level",
type=str,
@@ -420,7 +434,14 @@ class ServerArgs:
action="store_true",
help="Enable log prometheus metrics.",
)
parser.add_argument(
"--decode-log-interval",
type=int,
default=ServerArgs.decode_log_interval,
help="The log interval of decode batch",
)
# API related
parser.add_argument(
"--api-key",
type=str,
@@ -438,18 +459,6 @@ class ServerArgs:
action="store_true",
help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
)
parser.add_argument(
"--watchdog-timeout",
type=float,
default=ServerArgs.watchdog_timeout,
help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
)
parser.add_argument(
"--decode-log-interval",
type=int,
default=ServerArgs.decode_log_interval,
help="The log interval of decode batch",
)
# Data parallelism
parser.add_argument(
@@ -470,7 +479,7 @@ class ServerArgs:
],
)
# Multi-node distributed serving args
# Multi-node distributed serving
parser.add_argument(
"--dist-init-addr",
"--nccl-init-addr", # For backward compatibility. This will be removed in the future.
@@ -677,6 +686,12 @@ class ServerArgs:
"This can potentially increase throughput but may also increase time-to-first-token latency. "
"The default value is 1, meaning only run one decoding step at a time.",
)
parser.add_argument(
"--delete-ckpt-after-loading",
default=ServerArgs.delete_ckpt_after_loading,
action="store_true",
help="Delete the model checkpoint after loading the model.",
)
@classmethod
def from_cli_args(cls, args: argparse.Namespace):