Support GC Freezing to improve latency & throughput (#9241)

Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
This commit is contained in:
Chanh Nguyen
2025-08-22 22:43:09 -07:00
committed by GitHub
parent 7e880286b5
commit 127d4b0d5e
8 changed files with 119 additions and 1 deletions

View File

@@ -123,6 +123,7 @@ class ServerArgs:
decode_log_interval: int = 40
enable_request_time_stats_logging: bool = False
kv_events_config: Optional[str] = None
gc_warning_threshold_secs: float = 0.0
# API related
api_key: Optional[str] = None
@@ -1172,6 +1173,12 @@ class ServerArgs:
default=ServerArgs.collect_tokens_histogram,
help="Collect prompt/generation tokens histogram.",
)
parser.add_argument(
"--gc-warning-threshold-secs",
type=float,
default=ServerArgs.gc_warning_threshold_secs,
help="The threshold for long GC warning. If a GC takes longer than this, a warning will be logged. Set to 0 to disable.",
)
parser.add_argument(
"--decode-log-interval",
type=int,