Minor improvements to TokenizerManager / health check (#6327)

This commit is contained in:
Lianmin Zheng
2025-05-15 15:29:25 -07:00
committed by GitHub
parent cd8d4b9dfc
commit e07a6977e7
9 changed files with 136 additions and 33 deletions

View File

@@ -46,7 +46,6 @@ class ServerArgs:
tokenizer_path: Optional[str] = None
tokenizer_mode: str = "auto"
skip_tokenizer_init: bool = False
enable_tokenizer_batch_encode: bool = False
load_format: str = "auto"
trust_remote_code: bool = False
dtype: str = "auto"
@@ -59,6 +58,7 @@ class ServerArgs:
chat_template: Optional[str] = None
completion_template: Optional[str] = None
is_embedding: bool = False
enable_multimodal: Optional[bool] = None
revision: Optional[str] = None
# Port for the HTTP server
@@ -97,6 +97,10 @@ class ServerArgs:
log_requests_level: int = 0
show_time_cost: bool = False
enable_metrics: bool = False
bucket_time_to_first_token: Optional[List[float]] = None
bucket_e2e_request_latency: Optional[List[float]] = None
bucket_inter_token_latency: Optional[List[float]] = None
collect_tokens_histogram: bool = False
decode_log_interval: int = 40
enable_request_time_stats_logging: bool = False
@@ -120,6 +124,7 @@ class ServerArgs:
# Model override args in JSON
json_model_override_args: str = "{}"
preferred_sampling_params: Optional[str] = None
# LoRA
lora_paths: Optional[List[str]] = None
@@ -154,9 +159,9 @@ class ServerArgs:
disable_cuda_graph: bool = False
disable_cuda_graph_padding: bool = False
enable_nccl_nvls: bool = False
enable_tokenizer_batch_encode: bool = False
disable_outlines_disk_cache: bool = False
disable_custom_all_reduce: bool = False
enable_multimodal: Optional[bool] = None
disable_overlap_schedule: bool = False
enable_mixed_chunk: bool = False
enable_dp_attention: bool = False
@@ -474,11 +479,6 @@ class ServerArgs:
action="store_true",
help="If set, skip init tokenizer and pass input_ids in generate request.",
)
parser.add_argument(
"--enable-tokenizer-batch-encode",
action="store_true",
help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
)
parser.add_argument(
"--load-format",
type=str,
@@ -603,6 +603,12 @@ class ServerArgs:
action="store_true",
help="Whether to use a CausalLM as an embedding model.",
)
parser.add_argument(
"--enable-multimodal",
default=ServerArgs.enable_multimodal,
action="store_true",
help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
)
parser.add_argument(
"--revision",
type=str,
@@ -780,6 +786,33 @@ class ServerArgs:
action="store_true",
help="Enable log prometheus metrics.",
)
parser.add_argument(
"--bucket-time-to-first-token",
type=float,
nargs="+",
default=ServerArgs.bucket_time_to_first_token,
help="The buckets of time to first token, specified as a list of floats.",
)
parser.add_argument(
"--bucket-inter-token-latency",
type=float,
nargs="+",
default=ServerArgs.bucket_inter_token_latency,
help="The buckets of inter-token latency, specified as a list of floats.",
)
parser.add_argument(
"--bucket-e2e-request-latency",
type=float,
nargs="+",
default=ServerArgs.bucket_e2e_request_latency,
help="The buckets of end-to-end request latency, specified as a list of floats.",
)
parser.add_argument(
"--collect-tokens-histogram",
action="store_true",
default=ServerArgs.collect_tokens_histogram,
help="Collect prompt/generation tokens histogram.",
)
parser.add_argument(
"--decode-log-interval",
type=int,
@@ -868,6 +901,11 @@ class ServerArgs:
help="A dictionary in JSON string format used to override default model configurations.",
default=ServerArgs.json_model_override_args,
)
parser.add_argument(
"--preferred-sampling-params",
type=str,
help="json-formatted sampling settings that will be returned in /get_model_info",
)
# LoRA
parser.add_argument(
@@ -1043,6 +1081,11 @@ class ServerArgs:
action="store_true",
help="Enable NCCL NVLS for prefill heavy requests when available.",
)
parser.add_argument(
"--enable-tokenizer-batch-encode",
action="store_true",
help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
)
parser.add_argument(
"--disable-outlines-disk-cache",
action="store_true",
@@ -1053,12 +1096,6 @@ class ServerArgs:
action="store_true",
help="Disable the custom all-reduce kernel and fall back to NCCL.",
)
parser.add_argument(
"--enable-multimodal",
default=ServerArgs.enable_multimodal,
action="store_true",
help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
)
parser.add_argument(
"--disable-overlap-schedule",
action="store_true",