[Feature] Simple Improve Health Check Mechanism for Production-Grade Stability (#8115)

Signed-off-by: ybyang <ybyang7@iflytek.com>
This commit is contained in:
ybyang
2025-07-20 09:10:00 +08:00
committed by GitHub
parent abda2542d5
commit 4540a4666a
6 changed files with 82 additions and 11 deletions

View File

@@ -65,6 +65,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
from sglang.srt.utils import (
MultiprocessingSerializer,
ServerStatus,
assert_pkg_version,
configure_logger,
get_zmq_socket,
@@ -73,6 +74,7 @@ from sglang.srt.utils import (
launch_dummy_health_check_server,
maybe_set_triton_cache_manager,
prepare_model_and_tokenizer,
report_health,
set_prometheus_multiproc_dir,
set_ulimit,
)
@@ -661,6 +663,7 @@ def _set_envs_and_config(server_args: ServerArgs):
def sigchld_handler(signum, frame):
pid, exitcode = os.waitpid(0, os.WNOHANG)
if exitcode != 0:
report_health(ServerStatus.Crashed, server_args.host, server_args.port)
logger.warning(
f"Child process unexpectedly failed with {exitcode=}. {pid=}"
)
@@ -674,6 +677,7 @@ def _set_envs_and_config(server_args: ServerArgs):
logger.error(
"Received sigquit from a child process. It usually means the child failed."
)
report_health(ServerStatus.Crashed, server_args.host, server_args.port)
kill_process_tree(os.getpid())
signal.signal(signal.SIGQUIT, sigquit_handler)

View File

@@ -77,6 +77,7 @@ from sglang.srt.managers.io_struct import (
ParseFunctionCallReq,
ProfileReqInput,
ReleaseMemoryOccupationReqInput,
ReportHealthInput,
ResumeMemoryOccupationReqInput,
SeparateReasoningReqInput,
SetInternalStateReq,
@@ -93,6 +94,7 @@ from sglang.srt.metrics.func_timer import enable_func_timer
from sglang.srt.reasoning_parser import ReasoningParser
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import (
ServerStatus,
add_api_key_middleware,
add_prometheus_middleware,
delete_directory,
@@ -220,8 +222,31 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
@app.get("/health")
async def health() -> Response:
"""Check the health of the http server."""
return Response(status_code=200)
"""Check the status of the http server."""
code = HTTPStatus.SERVICE_UNAVAILABLE.value
if _global_state.tokenizer_manager.server_status == ServerStatus.Up:
code = HTTPStatus.OK.value
return Response(
status_code=code,
content=json.dumps(
{"status": _global_state.tokenizer_manager.server_status.value}
),
)
@app.post("/health")
async def health_update(obj: ReportHealthInput, request: Request) -> Response:
"""Update the Status of the http server."""
try:
server_status = ServerStatus(obj.status)
_global_state.tokenizer_manager.server_status = server_status
if server_status != ServerStatus.Up:
return Response(
status_code=HTTPStatus.SERVICE_UNAVAILABLE.value, content=obj.msg
)
except Exception as e:
logger.error(e)
return Response(status_code=HTTPStatus.SERVICE_UNAVAILABLE.value)
@app.get("/health_generate")
@@ -256,7 +281,7 @@ async def health_generate(request: Request) -> Response:
if _global_state.tokenizer_manager.last_receive_tstamp > tic:
task.cancel()
_global_state.tokenizer_manager.rid_to_state.pop(rid, None)
_global_state.tokenizer_manager.health_check_failed = False
_global_state.tokenizer_manager.server_status = ServerStatus.Up
return Response(status_code=200)
task.cancel()
@@ -270,7 +295,7 @@ async def health_generate(request: Request) -> Response:
f"last_heartbeat time: {last_receive_time}"
)
_global_state.tokenizer_manager.rid_to_state.pop(rid, None)
_global_state.tokenizer_manager.health_check_failed = True
_global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
return Response(status_code=503)
@@ -1022,9 +1047,13 @@ def _execute_server_warmup(
headers=headers,
timeout=600,
)
assert res.status_code == 200, f"{res}"
if res.status_code == 200:
_global_state.tokenizer_manager.server_status = ServerStatus.Up
else:
_global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
logger.info(f"{res}")
else:
logger.info(f"Start of prefill warmup ...")
logger.info(f"Start of prefill/decode warmup ...")
json_data = {
"sampling_params": {
"temperature": 0.0,
@@ -1046,15 +1075,25 @@ def _execute_server_warmup(
headers=headers,
timeout=1800, # because of deep gemm precache is very long if not precache.
)
logger.info(
f"End of prefill warmup with status {res.status_code}, resp: {res.json()}"
)
if res.status_code == 200:
logger.info(
f"End of prefill disaggregation mode warmup with status {res.status_code}, resp: {res.json()}"
)
_global_state.tokenizer_manager.server_status = ServerStatus.Up
else:
logger.info(
"Prefill disaggregation mode warm Up Failed, status code: {}".format(
res.status_code
)
)
_global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
except Exception:
last_traceback = get_exception_traceback()
if pipe_finish_writer is not None:
pipe_finish_writer.send(last_traceback)
logger.error(f"Initialization failed. warmup error: {last_traceback}")
_global_state.tokenizer_manager.server_status = ServerStatus.Crashed
kill_process_tree(os.getpid())
return False