Minor improvements to TokenizerManager / health check (#6327)

This commit is contained in:
Lianmin Zheng
2025-05-15 15:29:25 -07:00
committed by GitHub
parent cd8d4b9dfc
commit e07a6977e7
9 changed files with 136 additions and 33 deletions

View File

@@ -46,7 +46,6 @@ class ServerArgs:
tokenizer_path: Optional[str] = None
tokenizer_mode: str = "auto"
skip_tokenizer_init: bool = False
enable_tokenizer_batch_encode: bool = False
load_format: str = "auto"
trust_remote_code: bool = False
dtype: str = "auto"
@@ -59,6 +58,7 @@ class ServerArgs:
chat_template: Optional[str] = None
completion_template: Optional[str] = None
is_embedding: bool = False
enable_multimodal: Optional[bool] = None
revision: Optional[str] = None
# Port for the HTTP server
@@ -97,6 +97,10 @@ class ServerArgs:
log_requests_level: int = 0
show_time_cost: bool = False
enable_metrics: bool = False
bucket_time_to_first_token: Optional[List[float]] = None
bucket_e2e_request_latency: Optional[List[float]] = None
bucket_inter_token_latency: Optional[List[float]] = None
collect_tokens_histogram: bool = False
decode_log_interval: int = 40
enable_request_time_stats_logging: bool = False
@@ -120,6 +124,7 @@ class ServerArgs:
# Model override args in JSON
json_model_override_args: str = "{}"
preferred_sampling_params: Optional[str] = None
# LoRA
lora_paths: Optional[List[str]] = None
@@ -154,9 +159,9 @@ class ServerArgs:
disable_cuda_graph: bool = False
disable_cuda_graph_padding: bool = False
enable_nccl_nvls: bool = False
enable_tokenizer_batch_encode: bool = False
disable_outlines_disk_cache: bool = False
disable_custom_all_reduce: bool = False
enable_multimodal: Optional[bool] = None
disable_overlap_schedule: bool = False
enable_mixed_chunk: bool = False
enable_dp_attention: bool = False
@@ -474,11 +479,6 @@ class ServerArgs:
action="store_true",
help="If set, skip init tokenizer and pass input_ids in generate request.",
)
parser.add_argument(
"--enable-tokenizer-batch-encode",
action="store_true",
help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
)
parser.add_argument(
"--load-format",
type=str,
@@ -603,6 +603,12 @@ class ServerArgs:
action="store_true",
help="Whether to use a CausalLM as an embedding model.",
)
parser.add_argument(
"--enable-multimodal",
default=ServerArgs.enable_multimodal,
action="store_true",
help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
)
parser.add_argument(
"--revision",
type=str,
@@ -780,6 +786,33 @@ class ServerArgs:
action="store_true",
help="Enable log prometheus metrics.",
)
parser.add_argument(
"--bucket-time-to-first-token",
type=float,
nargs="+",
default=ServerArgs.bucket_time_to_first_token,
help="The buckets of time to first token, specified as a list of floats.",
)
parser.add_argument(
"--bucket-inter-token-latency",
type=float,
nargs="+",
default=ServerArgs.bucket_inter_token_latency,
help="The buckets of inter-token latency, specified as a list of floats.",
)
parser.add_argument(
"--bucket-e2e-request-latency",
type=float,
nargs="+",
default=ServerArgs.bucket_e2e_request_latency,
help="The buckets of end-to-end request latency, specified as a list of floats.",
)
parser.add_argument(
"--collect-tokens-histogram",
action="store_true",
default=ServerArgs.collect_tokens_histogram,
help="Collect prompt/generation tokens histogram.",
)
parser.add_argument(
"--decode-log-interval",
type=int,
@@ -868,6 +901,11 @@ class ServerArgs:
help="A dictionary in JSON string format used to override default model configurations.",
default=ServerArgs.json_model_override_args,
)
parser.add_argument(
"--preferred-sampling-params",
type=str,
help="json-formatted sampling settings that will be returned in /get_model_info",
)
# LoRA
parser.add_argument(
@@ -1043,6 +1081,11 @@ class ServerArgs:
action="store_true",
help="Enable NCCL NVLS for prefill heavy requests when available.",
)
parser.add_argument(
"--enable-tokenizer-batch-encode",
action="store_true",
help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
)
parser.add_argument(
"--disable-outlines-disk-cache",
action="store_true",
@@ -1053,12 +1096,6 @@ class ServerArgs:
action="store_true",
help="Disable the custom all-reduce kernel and fall back to NCCL.",
)
parser.add_argument(
"--enable-multimodal",
default=ServerArgs.enable_multimodal,
action="store_true",
help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
)
parser.add_argument(
"--disable-overlap-schedule",
action="store_true",