Split the scheduler into multiple mixin classes to reduce the file size (#8483)
This commit is contained in:
@@ -652,25 +652,19 @@ def _set_envs_and_config(server_args: ServerArgs):
|
||||
"Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
|
||||
)
|
||||
|
||||
def sigchld_handler(signum, frame):
|
||||
pid, exitcode = os.waitpid(0, os.WNOHANG)
|
||||
if exitcode != 0:
|
||||
logger.warning(
|
||||
f"Child process unexpectedly failed with {exitcode=}. {pid=}"
|
||||
if True: # Keep this check for internal code compatibility
|
||||
# Register the signal handler.
|
||||
# The child processes will send SIGQUIT to this process when any error happens
|
||||
# This process then cleans up the whole process tree
|
||||
# Note: This sigquit handler is used in the launch phase, and may be replaced by
|
||||
# the running_phase_sigquit_handler in the tokenizer manager after the grpc server is launched.
|
||||
def launch_phase_sigquit_handler(signum, frame):
|
||||
logger.error(
|
||||
"Received sigquit from a child process. It usually means the child failed."
|
||||
)
|
||||
kill_process_tree(os.getpid())
|
||||
|
||||
signal.signal(signal.SIGCHLD, sigchld_handler)
|
||||
|
||||
# Register the signal handler.
|
||||
# The child processes will send SIGQUIT to this process when any error happens
|
||||
# This process then cleans up the whole process tree
|
||||
def sigquit_handler(signum, frame):
|
||||
logger.error(
|
||||
"Received sigquit from a child process. It usually means the child failed."
|
||||
)
|
||||
kill_process_tree(os.getpid())
|
||||
|
||||
signal.signal(signal.SIGQUIT, sigquit_handler)
|
||||
signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler)
|
||||
|
||||
# Set mp start method
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
@@ -238,6 +238,9 @@ async def health() -> Response:
|
||||
@app.get("/health_generate")
|
||||
async def health_generate(request: Request) -> Response:
|
||||
"""Check the health of the inference server by generating one token."""
|
||||
if _global_state.tokenizer_manager.gracefully_exit:
|
||||
logger.info("Health check request received during shutdown. Returning 503.")
|
||||
return Response(status_code=503)
|
||||
|
||||
sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
|
||||
rid = f"HEALTH_CHECK_{time.time()}"
|
||||
@@ -260,9 +263,14 @@ async def health_generate(request: Request) -> Response:
|
||||
async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
|
||||
break
|
||||
|
||||
tic = time.perf_counter()
|
||||
# This request is a special request.
|
||||
# If the server already has something running, this request will be ignored, so it creates zero overhead.
|
||||
# If the server is not running, this request will be run, so we know whether the server is healthy.
|
||||
task = asyncio.create_task(gen())
|
||||
while time.perf_counter() < tic + HEALTH_CHECK_TIMEOUT:
|
||||
|
||||
# As long as we receive any response from the detokenizer/scheduler, we consider the server healthy.
|
||||
tic = time.time()
|
||||
while time.time() < tic + HEALTH_CHECK_TIMEOUT:
|
||||
await asyncio.sleep(1)
|
||||
if _global_state.tokenizer_manager.last_receive_tstamp > tic:
|
||||
task.cancel()
|
||||
|
||||
Reference in New Issue
Block a user