[minor] Improve code style and compatibility (#1961)

2024-11-08 02:19:41 -08:00
parent 7ef0084b0d
commit a509552087
6 changed files with 109 additions and 35 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -63,7 +63,7 @@ class ServerArgs:
    stream_interval: int = 1
    random_seed: Optional[int] = None
    constrained_json_whitespace_pattern: Optional[str] = None
-    decode_log_interval: int = 40
+    watchdog_timeout: float = 300

    # Logging
    log_level: str = "info"
@@ -71,18 +71,18 @@ class ServerArgs:
    log_requests: bool = False
    show_time_cost: bool = False
    enable_metrics: bool = False
+    decode_log_interval: int = 40

-    # Other
+    # API related
    api_key: Optional[str] = None
    file_storage_pth: str = "SGLang_storage"
    enable_cache_report: bool = False
-    watchdog_timeout: float = 600

    # Data parallelism
    dp_size: int = 1
    load_balance_method: str = "round_robin"

-    # Distributed args
+    # Multi-node distributed serving
    dist_init_addr: Optional[str] = None
    nnodes: int = 1
    node_rank: int = 0
@@ -128,6 +128,7 @@ class ServerArgs:
    enable_p2p_check: bool = False
    triton_attention_reduce_in_fp32: bool = False
    num_continuous_decode_steps: int = 1
+    delete_ckpt_after_loading: bool = False

    def __post_init__(self):
        # Set missing default values
@@ -205,6 +206,7 @@ class ServerArgs:

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
+        # Model and port args
        parser.add_argument(
            "--model-path",
            type=str,
@@ -324,6 +326,8 @@ class ServerArgs:
            action="store_true",
            help="Whether to use a CausalLM as an embedding model.",
        )
+
+        # Memory and scheduling
        parser.add_argument(
            "--mem-fraction-static",
            type=float,
@@ -368,6 +372,8 @@ class ServerArgs:
            default=ServerArgs.schedule_conservativeness,
            help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
        )
+
+        # Other runtime options
        parser.add_argument(
            "--tensor-parallel-size",
            "--tp-size",
@@ -393,6 +399,14 @@ class ServerArgs:
            default=ServerArgs.constrained_json_whitespace_pattern,
            help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
        )
+        parser.add_argument(
+            "--watchdog-timeout",
+            type=float,
+            default=ServerArgs.watchdog_timeout,
+            help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
+        )
+
+        # Logging
        parser.add_argument(
            "--log-level",
            type=str,
@@ -420,7 +434,14 @@ class ServerArgs:
            action="store_true",
            help="Enable log prometheus metrics.",
        )
+        parser.add_argument(
+            "--decode-log-interval",
+            type=int,
+            default=ServerArgs.decode_log_interval,
+            help="The log interval of decode batch",
+        )

+        # API related
        parser.add_argument(
            "--api-key",
            type=str,
@@ -438,18 +459,6 @@ class ServerArgs:
            action="store_true",
            help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
        )
-        parser.add_argument(
-            "--watchdog-timeout",
-            type=float,
-            default=ServerArgs.watchdog_timeout,
-            help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
-        )
-        parser.add_argument(
-            "--decode-log-interval",
-            type=int,
-            default=ServerArgs.decode_log_interval,
-            help="The log interval of decode batch",
-        )

        # Data parallelism
        parser.add_argument(
@@ -470,7 +479,7 @@ class ServerArgs:
            ],
        )

-        # Multi-node distributed serving args
+        # Multi-node distributed serving
        parser.add_argument(
            "--dist-init-addr",
            "--nccl-init-addr",  # For backward compatibility. This will be removed in the future.
@@ -677,6 +686,12 @@ class ServerArgs:
            "This can potentially increase throughput but may also increase time-to-first-token latency. "
            "The default value is 1, meaning only run one decoding step at a time.",
        )
+        parser.add_argument(
+            "--delete-ckpt-after-loading",
+            default=ServerArgs.delete_ckpt_after_loading,
+            action="store_true",
+            help="Delete the model checkpoint after loading the model.",
+        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):