fix: expose the missing metrics on non-rank0 nodes (#7720)

This commit is contained in:
Yingchun Lai
2025-07-27 15:55:25 +08:00
committed by GitHub
parent 2a1936de96
commit 36d6f0ba5b
2 changed files with 11 additions and 2 deletions

View File

@@ -765,7 +765,9 @@ def _launch_subprocesses(
# When using `Engine` as a Python API, we don't want to block here.
return None, None, None
launch_dummy_health_check_server(server_args.host, server_args.port)
launch_dummy_health_check_server(
server_args.host, server_args.port, server_args.enable_metrics
)
for proc in scheduler_procs:
proc.join()

View File

@@ -85,6 +85,8 @@ from torch.profiler import ProfilerActivity, profile, record_function
from torch.utils._contextlib import _DecoratorContextManager
from triton.runtime.cache import FileCacheManager
from sglang.srt.metrics.func_timer import enable_func_timer
logger = logging.getLogger(__name__)
show_time_cost = False
@@ -2049,7 +2051,7 @@ def rank0_log(msg: str):
logger.info(msg)
def launch_dummy_health_check_server(host, port):
def launch_dummy_health_check_server(host, port, enable_metrics):
import asyncio
import uvicorn
@@ -2067,6 +2069,11 @@ def launch_dummy_health_check_server(host, port):
"""Check the health of the http server."""
return Response(status_code=200)
# When metrics are enabled, add the Prometheus middleware and turn on the function timer
if enable_metrics:
add_prometheus_middleware(app)
enable_func_timer()
config = uvicorn.Config(
app,
host=host,