[Feature] Simple Improvement to the Health Check Mechanism for Production-Grade Stability (#8115)

Signed-off-by: ybyang <ybyang7@iflytek.com>
This commit is contained in:
ybyang
2025-07-20 09:10:00 +08:00
committed by GitHub
parent abda2542d5
commit 4540a4666a
6 changed files with 82 additions and 11 deletions

View File

@@ -65,6 +65,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
from sglang.srt.utils import (
MultiprocessingSerializer,
ServerStatus,
assert_pkg_version,
configure_logger,
get_zmq_socket,
@@ -73,6 +74,7 @@ from sglang.srt.utils import (
launch_dummy_health_check_server,
maybe_set_triton_cache_manager,
prepare_model_and_tokenizer,
report_health,
set_prometheus_multiproc_dir,
set_ulimit,
)
@@ -661,6 +663,7 @@ def _set_envs_and_config(server_args: ServerArgs):
# SIGCHLD handler (visible portion): polls for an exited child and, when the
# returned status is non-zero, reports ServerStatus.Crashed for this server's
# host/port and logs a warning with the status and child pid.
# `signum` and `frame` are the standard signal-handler arguments; neither is
# used in the lines shown here.
def sigchld_handler(signum, frame):
# pid=0 with WNOHANG: non-blocking wait on children in this process group.
# With WNOHANG, os.waitpid returns (0, 0) when no child status is available,
# which falls through the `exitcode != 0` check below without reporting.
# NOTE(review): os.waitpid raises ChildProcessError when there are no child
# processes to wait for; nothing in the visible lines guards against that.
# NOTE(review): the second value is a wait status, not a decoded exit code
# (os.waitstatus_to_exitcode, Python 3.9+, converts it) -- confirm the value
# logged as `exitcode` is what is intended.
pid, exitcode = os.waitpid(0, os.WNOHANG)
if exitcode != 0:
# Report the crash before logging.
# NOTE(review): if report_health can raise or block, the warning below
# would be skipped or delayed -- confirm the ordering is intentional.
report_health(ServerStatus.Crashed, server_args.host, server_args.port)
logger.warning(
f"Child process unexpectedly failed with {exitcode=}. {pid=}"
)
@@ -674,6 +677,7 @@ def _set_envs_and_config(server_args: ServerArgs):
logger.error(
"Received sigquit from a child process. It usually means the child failed."
)
report_health(ServerStatus.Crashed, server_args.host, server_args.port)
kill_process_tree(os.getpid())
signal.signal(signal.SIGQUIT, sigquit_handler)