fix: fix the missing metrics on non-rank0 nodes (#7720)

2025-07-27 15:55:25 +08:00
parent 2a1936de96
commit 36d6f0ba5b
2 changed files with 11 additions and 2 deletions
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -765,7 +765,9 @@ def _launch_subprocesses(
            # When using `Engine` as a Python API, we don't want to block here.
            return None, None, None

-        launch_dummy_health_check_server(server_args.host, server_args.port)
+        launch_dummy_health_check_server(
+            server_args.host, server_args.port, server_args.enable_metrics
+        )

        for proc in scheduler_procs:
            proc.join()
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -85,6 +85,8 @@ from torch.profiler import ProfilerActivity, profile, record_function
 from torch.utils._contextlib import _DecoratorContextManager
 from triton.runtime.cache import FileCacheManager

+from sglang.srt.metrics.func_timer import enable_func_timer
+
 logger = logging.getLogger(__name__)

 show_time_cost = False
@@ -2049,7 +2051,7 @@ def rank0_log(msg: str):
        logger.info(msg)


-def launch_dummy_health_check_server(host, port):
+def launch_dummy_health_check_server(host, port, enable_metrics):
    import asyncio

    import uvicorn
@@ -2067,6 +2069,11 @@ def launch_dummy_health_check_server(host, port):
        """Check the health of the http server."""
        return Response(status_code=200)

+    # Add prometheus middleware
+    if enable_metrics:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+
    config = uvicorn.Config(
        app,
        host=host,