From 95a4ed129ae24df6bca2d0e01c522253b2d385cb Mon Sep 17 00:00:00 2001 From: Yudi Xue <10211+binarycrayon@users.noreply.github.com> Date: Fri, 8 Nov 2024 23:21:11 -0800 Subject: [PATCH] Fix metrics (#1963) --- python/pyproject.toml | 2 +- .../sglang/srt/metrics/metrics_collector.py | 54 +++++++++++- python/sglang/srt/metrics/metrics_types.py | 3 - python/sglang/srt/server.py | 4 +- python/sglang/test/test_utils.py | 3 + test/srt/test_enable_metrics.py | 84 +++++++++++++++++++ 6 files changed, 142 insertions(+), 8 deletions(-) create mode 100644 test/srt/test_enable_metrics.py diff --git a/python/pyproject.toml b/python/pyproject.toml index 5ac2e8372..99d6e4171 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -17,7 +17,7 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"] [project.optional-dependencies] runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular", - "orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart", + "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart", "torchao", "uvicorn", "uvloop", "zmq", "outlines>=0.0.44", "modelscope"] srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"] diff --git a/python/sglang/srt/metrics/metrics_collector.py b/python/sglang/srt/metrics/metrics_collector.py index 91a849414..5299cecc8 100644 --- a/python/sglang/srt/metrics/metrics_collector.py +++ b/python/sglang/srt/metrics/metrics_collector.py @@ -213,19 +213,67 @@ class Metrics: name="sglang:e2e_request_latency_seconds", documentation="Histogram of End-to-end request latency in seconds", labelnames=labelnames, - buckets=build_1_2_5_buckets(max_model_len), + buckets=[ + 0.3, + 0.5, + 0.8, + 1.0, + 1.5, + 2.0, + 2.5, + 5.0, + 10.0, + 15.0, + 20.0, + 30.0, + 40.0, + 50.0, + 60.0, + ], ) self.histogram_time_waiting_requests = Histogram( name="sglang:waiting_request_latency_seconds", documentation="Histogram of request waiting time in seconds", labelnames=labelnames, - buckets=build_1_2_5_buckets(max_model_len), + buckets=[ + 0.3, + 0.5, + 0.8, + 1.0, + 1.5, + 2.0, + 2.5, + 5.0, + 10.0, + 15.0, + 20.0, + 30.0, + 40.0, + 50.0, + 60.0, + ], ) self.histogram_time_decode_requests = Histogram( name="sglang:decode_request_latency_seconds", documentation="Histogram of request decoding time in seconds", labelnames=labelnames, - buckets=build_1_2_5_buckets(max_model_len), + buckets=[ + 0.3, + 0.5, + 0.8, + 1.0, + 1.5, + 2.0, + 2.5, + 5.0, + 10.0, + 15.0, + 20.0, + 30.0, + 40.0, + 50.0, + 60.0, + ], ) diff --git a/python/sglang/srt/metrics/metrics_types.py b/python/sglang/srt/metrics/metrics_types.py index 15e4a0dc7..2bd0f54f5 100644 --- a/python/sglang/srt/metrics/metrics_types.py +++ b/python/sglang/srt/metrics/metrics_types.py @@ -34,15 +34,12 @@ class Stats: num_running_req: int = 0 num_waiting_req: int = 0 gen_throughput: float = 0.0 - num_token: int = 0 - token_usage: float = 0.0 waiting_queue: int = 0 time_e2e_requests: List[float] = field(default_factory=list) time_waiting_requests: List[float] = field(default_factory=list) time_decode_requests: List[float] = field(default_factory=list) # system stats token_usage: float = 0.0 - is_mixed_chunk: bool = False new_seq: int = 0 new_token: int = 0 cached_token: int = 0 diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 3d0816ce0..75ffed325 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -446,6 +446,9 @@ def launch_server( 2. Inter-process communication is done through ICP (each process uses a different port) via the ZMQ library. """ + if server_args.enable_metrics: + _set_prometheus_env() + launch_engine(server_args=server_args) # Add api key authorization @@ -454,7 +457,6 @@ def launch_server( # add prometheus middleware if server_args.enable_metrics: - _set_prometheus_env() add_prometheus_middleware(app) # Send a warmup request diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 2c68a22b4..a7893494b 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -404,6 +404,7 @@ def popen_launch_server( other_args: tuple = (), env: Optional[dict] = None, return_stdout_stderr: Optional[tuple] = None, + enable_metrics: bool = False, ): _, host, port = base_url.split(":") host = host[2:] @@ -422,6 +423,8 @@ def popen_launch_server( ] if api_key: command += ["--api-key", api_key] + if enable_metrics: + command += ["--enable-metrics"] if return_stdout_stderr: process = subprocess.Popen( diff --git a/test/srt/test_enable_metrics.py b/test/srt/test_enable_metrics.py new file mode 100644 index 000000000..794e2a325 --- /dev/null +++ b/test/srt/test_enable_metrics.py @@ -0,0 +1,84 @@ +import unittest +from types import SimpleNamespace + +import requests + +from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + +TEST_MODEL = ( + DEFAULT_MODEL_NAME_FOR_TEST # I used "google/gemma-2-2b-it" for testing locally +) + + +class TestEnableMetrics(unittest.TestCase): + def test_metrics_enabled(self): + """Test that metrics endpoint returns data when enabled""" + # Launch server with metrics enabled + process = popen_launch_server( + model=TEST_MODEL, + base_url=DEFAULT_URL_FOR_TEST, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + enable_metrics=True, + ) + + try: + # Make a request to generate some metrics + response = requests.get(f"{DEFAULT_URL_FOR_TEST}/health_generate") + self.assertEqual(response.status_code, 200) + + # Get metrics + metrics_response = requests.get(f"{DEFAULT_URL_FOR_TEST}/metrics") + self.assertEqual(metrics_response.status_code, 200) + metrics_content = metrics_response.text + + # Verify essential metrics are present + essential_metrics = [ + "sglang:prompt_tokens_total", + "sglang:generation_tokens_total", + "sglang:max_total_num_tokens", + "sglang:context_len", + "sglang:time_to_first_token_seconds", + "sglang:time_per_output_token_seconds", + "sglang:e2e_request_latency_seconds", + ] + + for metric in essential_metrics: + self.assertIn(metric, metrics_content, f"Missing metric: {metric}") + + # Verify model name label is present and correct + expected_model_name = TEST_MODEL + self.assertIn(f'model_name="{expected_model_name}"', metrics_content) + # Verify metrics have values (not empty) + self.assertIn("_sum{", metrics_content) + self.assertIn("_count{", metrics_content) + self.assertIn("_bucket{", metrics_content) + + finally: + kill_child_process(process.pid, include_self=True) + + def test_metrics_disabled(self): + """Test that metrics endpoint returns 404 when disabled""" + # Launch server with metrics disabled + process = popen_launch_server( + model=TEST_MODEL, + base_url=DEFAULT_URL_FOR_TEST, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + enable_metrics=False, + ) + + try: + response = requests.get(f"{DEFAULT_URL_FOR_TEST}/health_generate") + self.assertEqual(response.status_code, 200) + # Verify metrics endpoint is not available + metrics_response = requests.get(f"{DEFAULT_URL_FOR_TEST}/metrics") + self.assertEqual(metrics_response.status_code, 404) + + finally: + kill_child_process(process.pid, include_self=True)