[Feature] Simple Improvement to the Health Check Mechanism for Production-Grade Stability (#8115)

Signed-off-by: ybyang <ybyang7@iflytek.com>
This commit is contained in:
ybyang
2025-07-20 09:10:00 +08:00
committed by GitHub
parent abda2542d5
commit 4540a4666a
6 changed files with 82 additions and 11 deletions

View File

@@ -116,6 +116,7 @@ from sglang.srt.metrics.collector import TokenizerMetricsCollector
from sglang.srt.sampling.sampling_params import SamplingParams
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import (
ServerStatus,
dataclass_to_string_truncated,
get_bool_env_var,
get_zmq_socket,
@@ -173,6 +174,9 @@ class TokenizerManager:
server_args: ServerArgs,
port_args: PortArgs,
):
# Server Status
self.server_status = ServerStatus.Starting
# Parse args
self.server_args = server_args
self.enable_metrics = server_args.enable_metrics
@@ -251,7 +255,6 @@ class TokenizerManager:
# Store states
self.no_create_loop = False
self.rid_to_state: Dict[str, ReqState] = {}
self.health_check_failed = False
self.gracefully_exit = False
self.last_receive_tstamp = 0
self.dump_requests_folder = "" # By default do not dump
@@ -1332,7 +1335,7 @@ class TokenizerManager:
while True:
remain_num_req = len(self.rid_to_state)
if self.health_check_failed:
if not self.server_status.is_healthy():
# if health check failed, we should exit immediately
logger.error(
"Signal SIGTERM received while health check failed. Exiting... remaining number of requests: %d",