Split the scheduler into multiple mixin classes to reduce the file size (#8483)

This commit is contained in:
Lianmin Zheng
2025-07-29 12:46:50 -07:00
committed by GitHub
parent 5973675bc3
commit a4c3b121d8
12 changed files with 869 additions and 785 deletions

View File

@@ -652,25 +652,19 @@ def _set_envs_and_config(server_args: ServerArgs):
"Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
)
def sigchld_handler(signum, frame):
pid, exitcode = os.waitpid(0, os.WNOHANG)
if exitcode != 0:
logger.warning(
f"Child process unexpectedly failed with {exitcode=}. {pid=}"
if True: # Keep this check for internal code compatibility
# Register the signal handler.
# The child processes will send SIGQUIT to this process when any error happens
# This process then cleans up the whole process tree.
# Note: This sigquit handler is used in the launch phase, and may be replaced by
# the running_phase_sigquit_handler in the tokenizer manager after the grpc server is launched.
def launch_phase_sigquit_handler(signum, frame):
logger.error(
"Received sigquit from a child process. It usually means the child failed."
)
kill_process_tree(os.getpid())
signal.signal(signal.SIGCHLD, sigchld_handler)
# Register the signal handler.
# The child processes will send SIGQUIT to this process when any error happens
# This process then cleans up the whole process tree.
def sigquit_handler(signum, frame):
logger.error(
"Received sigquit from a child process. It usually means the child failed."
)
kill_process_tree(os.getpid())
signal.signal(signal.SIGQUIT, sigquit_handler)
signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler)
# Set mp start method
mp.set_start_method("spawn", force=True)

View File

@@ -238,6 +238,9 @@ async def health() -> Response:
@app.get("/health_generate")
async def health_generate(request: Request) -> Response:
"""Check the health of the inference server by generating one token."""
if _global_state.tokenizer_manager.gracefully_exit:
logger.info("Health check request received during shutdown. Returning 503.")
return Response(status_code=503)
sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
rid = f"HEALTH_CHECK_{time.time()}"
@@ -260,9 +263,14 @@ async def health_generate(request: Request) -> Response:
async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
break
tic = time.perf_counter()
# This request is a special request.
# If the server already has something running, this request will be ignored, so it creates zero overhead.
# If the server is not running anything, this request will be run, so we know whether the server is healthy.
task = asyncio.create_task(gen())
while time.perf_counter() < tic + HEALTH_CHECK_TIMEOUT:
# As long as we receive any response from the detokenizer/scheduler, we consider the server healthy.
tic = time.time()
while time.time() < tic + HEALTH_CHECK_TIMEOUT:
await asyncio.sleep(1)
if _global_state.tokenizer_manager.last_receive_tstamp > tic:
task.cancel()