[Feature] Simple Improve Health Check Mechanism for Production-Grade Stability (#8115)

Signed-off-by: ybyang <ybyang7@iflytek.com>
This commit is contained in:
ybyang
2025-07-20 09:10:00 +08:00
committed by GitHub
parent abda2542d5
commit 4540a4666a
6 changed files with 82 additions and 11 deletions

View File

@@ -143,6 +143,7 @@ from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
from sglang.srt.utils import (
DeepEPMode,
DynamicGradMode,
ServerStatus,
broadcast_pyobj,
configure_gc_logger,
configure_logger,
@@ -154,6 +155,7 @@ from sglang.srt.utils import (
kill_itself_when_parent_died,
point_to_point_pyobj,
pyspy_dump_schedulers,
report_health,
require_mlp_sync,
require_mlp_tp_gather,
set_gpu_proc_affinity,
@@ -2964,4 +2966,5 @@ def run_scheduler_process(
except Exception:
traceback = get_exception_traceback()
logger.error(f"Scheduler hit an exception: {traceback}")
report_health(ServerStatus.Crashed, server_args.host, ServerArgs.port)
parent_process.send_signal(signal.SIGQUIT)