Improve streaming, log_level, memory report, weight loading, and benchmark script (#7632)

Co-authored-by: Kan Wu <wukanustc@gmail.com>
This commit is contained in:
Lianmin Zheng
2025-06-29 23:16:19 -07:00
committed by GitHub
parent c5131f7a2f
commit 22352d47a9
24 changed files with 626 additions and 160 deletions

View File

@@ -99,6 +99,7 @@ class ServerArgs:
 log_level_http: Optional[str] = None
 log_requests: bool = False
 log_requests_level: int = 0
+crash_dump_folder: Optional[str] = None
 show_time_cost: bool = False
 enable_metrics: bool = False
 bucket_time_to_first_token: Optional[List[float]] = None
@@ -927,8 +928,14 @@ class ServerArgs:
 "--log-requests-level",
 type=int,
 default=0,
-help="0: Log metadata. 1. Log metadata and partial input/output. 2. Log every input/output.",
-choices=[0, 1, 2],
+help="0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output.",
+choices=[0, 1, 2, 3],
 )
+parser.add_argument(
+"--crash-dump-folder",
+type=str,
+default=ServerArgs.crash_dump_folder,
+help="Folder path to dump requests from the last 5 min before a crash (if any). If not specified, crash dumping is disabled.",
+)
 parser.add_argument(
 "--show-time-cost",