Split the scheduler into multiple mixin classes to reduce the file size (#8483)

2025-07-29 12:46:50 -07:00
parent 5973675bc3
commit a4c3b121d8
12 changed files with 869 additions and 785 deletions
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -652,25 +652,19 @@ def _set_envs_and_config(server_args: ServerArgs):
            "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
        )

-    def sigchld_handler(signum, frame):
-        pid, exitcode = os.waitpid(0, os.WNOHANG)
-        if exitcode != 0:
-            logger.warning(
-                f"Child process unexpectedly failed with {exitcode=}. {pid=}"
+    if True:  # Keep this check for internal code compatibility
+        # Register the signal handler.
+        # The child processes will send SIGQUIT to this process when any error happens
+        # This process then cleans up the whole process tree
+        # Note: This sigquit handler is used in the launch phase, and may be replaced by
+        # the running_phase_sigquit_handler in the tokenizer manager after the grpc server is launched.
+        def launch_phase_sigquit_handler(signum, frame):
+            logger.error(
+                "Received sigquit from a child process. It usually means the child failed."
            )
+            kill_process_tree(os.getpid())

-    signal.signal(signal.SIGCHLD, sigchld_handler)
-
-    # Register the signal handler.
-    # The child processes will send SIGQUIT to this process when any error happens
-    # This process then clean up the whole process tree
-    def sigquit_handler(signum, frame):
-        logger.error(
-            "Received sigquit from a child process. It usually means the child failed."
-        )
-        kill_process_tree(os.getpid())
-
-    signal.signal(signal.SIGQUIT, sigquit_handler)
+        signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler)

    # Set mp start method
    mp.set_start_method("spawn", force=True)
--- a/python/sglang/srt/entrypoints/http_server.py
+++ b/python/sglang/srt/entrypoints/http_server.py
@@ -238,6 +238,9 @@ async def health() -> Response:
@app.get("/health_generate")
 async def health_generate(request: Request) -> Response:
    """Check the health of the inference server by generating one token."""
+    if _global_state.tokenizer_manager.gracefully_exit:
+        logger.info("Health check request received during shutdown. Returning 503.")
+        return Response(status_code=503)

    sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
    rid = f"HEALTH_CHECK_{time.time()}"
@@ -260,9 +263,14 @@ async def health_generate(request: Request) -> Response:
        async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
            break

-    tic = time.perf_counter()
+    # This request is a special request.
+    # If the server already has something running, this request will be ignored, so it creates zero overhead.
+    # If the server is not running anything, this request will be run, so we know whether the server is healthy.
    task = asyncio.create_task(gen())
-    while time.perf_counter() < tic + HEALTH_CHECK_TIMEOUT:
+
+    # As long as we receive any response from the detokenizer/scheduler, we consider the server healthy.
+    tic = time.time()
+    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
        await asyncio.sleep(1)
        if _global_state.tokenizer_manager.last_receive_tstamp > tic:
            task.cancel()