Minor improvements of TokenizerManager / health check (#6327)
This commit is contained in:
@@ -46,7 +46,6 @@ class ServerArgs:
|
||||
tokenizer_path: Optional[str] = None
|
||||
tokenizer_mode: str = "auto"
|
||||
skip_tokenizer_init: bool = False
|
||||
enable_tokenizer_batch_encode: bool = False
|
||||
load_format: str = "auto"
|
||||
trust_remote_code: bool = False
|
||||
dtype: str = "auto"
|
||||
@@ -59,6 +58,7 @@ class ServerArgs:
|
||||
chat_template: Optional[str] = None
|
||||
completion_template: Optional[str] = None
|
||||
is_embedding: bool = False
|
||||
enable_multimodal: Optional[bool] = None
|
||||
revision: Optional[str] = None
|
||||
|
||||
# Port for the HTTP server
|
||||
@@ -97,6 +97,10 @@ class ServerArgs:
|
||||
log_requests_level: int = 0
|
||||
show_time_cost: bool = False
|
||||
enable_metrics: bool = False
|
||||
bucket_time_to_first_token: Optional[List[float]] = None
|
||||
bucket_e2e_request_latency: Optional[List[float]] = None
|
||||
bucket_inter_token_latency: Optional[List[float]] = None
|
||||
collect_tokens_histogram: bool = False
|
||||
decode_log_interval: int = 40
|
||||
enable_request_time_stats_logging: bool = False
|
||||
|
||||
@@ -120,6 +124,7 @@ class ServerArgs:
|
||||
|
||||
# Model override args in JSON
|
||||
json_model_override_args: str = "{}"
|
||||
preferred_sampling_params: Optional[str] = None
|
||||
|
||||
# LoRA
|
||||
lora_paths: Optional[List[str]] = None
|
||||
@@ -154,9 +159,9 @@ class ServerArgs:
|
||||
disable_cuda_graph: bool = False
|
||||
disable_cuda_graph_padding: bool = False
|
||||
enable_nccl_nvls: bool = False
|
||||
enable_tokenizer_batch_encode: bool = False
|
||||
disable_outlines_disk_cache: bool = False
|
||||
disable_custom_all_reduce: bool = False
|
||||
enable_multimodal: Optional[bool] = None
|
||||
disable_overlap_schedule: bool = False
|
||||
enable_mixed_chunk: bool = False
|
||||
enable_dp_attention: bool = False
|
||||
@@ -474,11 +479,6 @@ class ServerArgs:
|
||||
action="store_true",
|
||||
help="If set, skip init tokenizer and pass input_ids in generate request.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-tokenizer-batch-encode",
|
||||
action="store_true",
|
||||
help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--load-format",
|
||||
type=str,
|
||||
@@ -603,6 +603,12 @@ class ServerArgs:
|
||||
action="store_true",
|
||||
help="Whether to use a CausalLM as an embedding model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-multimodal",
|
||||
default=ServerArgs.enable_multimodal,
|
||||
action="store_true",
|
||||
help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--revision",
|
||||
type=str,
|
||||
@@ -780,6 +786,33 @@ class ServerArgs:
|
||||
action="store_true",
|
||||
help="Enable log prometheus metrics.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bucket-time-to-first-token",
|
||||
type=float,
|
||||
nargs="+",
|
||||
default=ServerArgs.bucket_time_to_first_token,
|
||||
help="The buckets of time to first token, specified as a list of floats.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bucket-inter-token-latency",
|
||||
type=float,
|
||||
nargs="+",
|
||||
default=ServerArgs.bucket_inter_token_latency,
|
||||
help="The buckets of inter-token latency, specified as a list of floats.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bucket-e2e-request-latency",
|
||||
type=float,
|
||||
nargs="+",
|
||||
default=ServerArgs.bucket_e2e_request_latency,
|
||||
help="The buckets of end-to-end request latency, specified as a list of floats.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--collect-tokens-histogram",
|
||||
action="store_true",
|
||||
default=ServerArgs.collect_tokens_histogram,
|
||||
help="Collect prompt/generation tokens histogram.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--decode-log-interval",
|
||||
type=int,
|
||||
@@ -868,6 +901,11 @@ class ServerArgs:
|
||||
help="A dictionary in JSON string format used to override default model configurations.",
|
||||
default=ServerArgs.json_model_override_args,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--preferred-sampling-params",
|
||||
type=str,
|
||||
help="json-formatted sampling settings that will be returned in /get_model_info",
|
||||
)
|
||||
|
||||
# LoRA
|
||||
parser.add_argument(
|
||||
@@ -1043,6 +1081,11 @@ class ServerArgs:
|
||||
action="store_true",
|
||||
help="Enable NCCL NVLS for prefill heavy requests when available.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-tokenizer-batch-encode",
|
||||
action="store_true",
|
||||
help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-outlines-disk-cache",
|
||||
action="store_true",
|
||||
@@ -1053,12 +1096,6 @@ class ServerArgs:
|
||||
action="store_true",
|
||||
help="Disable the custom all-reduce kernel and fall back to NCCL.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-multimodal",
|
||||
default=ServerArgs.enable_multimodal,
|
||||
action="store_true",
|
||||
help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-overlap-schedule",
|
||||
action="store_true",
|
||||
|
||||
Reference in New Issue
Block a user