Fix metrics (#1963)
This commit is contained in:
@@ -17,7 +17,7 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
|
||||
|
||||
[project.optional-dependencies]
|
||||
runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
|
||||
"orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
|
||||
"orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
|
||||
"torchao", "uvicorn", "uvloop", "zmq",
|
||||
"outlines>=0.0.44", "modelscope"]
|
||||
srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
|
||||
|
||||
@@ -213,19 +213,67 @@ class Metrics:
|
||||
name="sglang:e2e_request_latency_seconds",
|
||||
documentation="Histogram of End-to-end request latency in seconds",
|
||||
labelnames=labelnames,
|
||||
buckets=build_1_2_5_buckets(max_model_len),
|
||||
buckets=[
|
||||
0.3,
|
||||
0.5,
|
||||
0.8,
|
||||
1.0,
|
||||
1.5,
|
||||
2.0,
|
||||
2.5,
|
||||
5.0,
|
||||
10.0,
|
||||
15.0,
|
||||
20.0,
|
||||
30.0,
|
||||
40.0,
|
||||
50.0,
|
||||
60.0,
|
||||
],
|
||||
)
|
||||
self.histogram_time_waiting_requests = Histogram(
|
||||
name="sglang:waiting_request_latency_seconds",
|
||||
documentation="Histogram of request waiting time in seconds",
|
||||
labelnames=labelnames,
|
||||
buckets=build_1_2_5_buckets(max_model_len),
|
||||
buckets=[
|
||||
0.3,
|
||||
0.5,
|
||||
0.8,
|
||||
1.0,
|
||||
1.5,
|
||||
2.0,
|
||||
2.5,
|
||||
5.0,
|
||||
10.0,
|
||||
15.0,
|
||||
20.0,
|
||||
30.0,
|
||||
40.0,
|
||||
50.0,
|
||||
60.0,
|
||||
],
|
||||
)
|
||||
self.histogram_time_decode_requests = Histogram(
|
||||
name="sglang:decode_request_latency_seconds",
|
||||
documentation="Histogram of request decoding time in seconds",
|
||||
labelnames=labelnames,
|
||||
buckets=build_1_2_5_buckets(max_model_len),
|
||||
buckets=[
|
||||
0.3,
|
||||
0.5,
|
||||
0.8,
|
||||
1.0,
|
||||
1.5,
|
||||
2.0,
|
||||
2.5,
|
||||
5.0,
|
||||
10.0,
|
||||
15.0,
|
||||
20.0,
|
||||
30.0,
|
||||
40.0,
|
||||
50.0,
|
||||
60.0,
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -34,15 +34,12 @@ class Stats:
|
||||
num_running_req: int = 0
|
||||
num_waiting_req: int = 0
|
||||
gen_throughput: float = 0.0
|
||||
num_token: int = 0
|
||||
token_usage: float = 0.0
|
||||
waiting_queue: int = 0
|
||||
time_e2e_requests: List[float] = field(default_factory=list)
|
||||
time_waiting_requests: List[float] = field(default_factory=list)
|
||||
time_decode_requests: List[float] = field(default_factory=list)
|
||||
# system stats
|
||||
token_usage: float = 0.0
|
||||
is_mixed_chunk: bool = False
|
||||
new_seq: int = 0
|
||||
new_token: int = 0
|
||||
cached_token: int = 0
|
||||
|
||||
@@ -446,6 +446,9 @@ def launch_server(
|
||||
2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
|
||||
"""
|
||||
|
||||
if server_args.enable_metrics:
|
||||
_set_prometheus_env()
|
||||
|
||||
launch_engine(server_args=server_args)
|
||||
|
||||
# Add api key authorization
|
||||
@@ -454,7 +457,6 @@ def launch_server(
|
||||
|
||||
# add prometheus middleware
|
||||
if server_args.enable_metrics:
|
||||
_set_prometheus_env()
|
||||
add_prometheus_middleware(app)
|
||||
|
||||
# Send a warmup request
|
||||
|
||||
@@ -404,6 +404,7 @@ def popen_launch_server(
|
||||
other_args: tuple = (),
|
||||
env: Optional[dict] = None,
|
||||
return_stdout_stderr: Optional[tuple] = None,
|
||||
enable_metrics: bool = False,
|
||||
):
|
||||
_, host, port = base_url.split(":")
|
||||
host = host[2:]
|
||||
@@ -422,6 +423,8 @@ def popen_launch_server(
|
||||
]
|
||||
if api_key:
|
||||
command += ["--api-key", api_key]
|
||||
if enable_metrics:
|
||||
command += ["--enable-metrics"]
|
||||
|
||||
if return_stdout_stderr:
|
||||
process = subprocess.Popen(
|
||||
|
||||
84
test/srt/test_enable_metrics.py
Normal file
84
test/srt/test_enable_metrics.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
import requests
|
||||
|
||||
from sglang.srt.utils import kill_child_process
|
||||
from sglang.test.run_eval import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
# Model served by the metrics tests. The suite default is used here;
# "google/gemma-2-2b-it" was the model used when testing locally.
TEST_MODEL = DEFAULT_MODEL_NAME_FOR_TEST
|
||||
|
||||
|
||||
class TestEnableMetrics(unittest.TestCase):
    """End-to-end checks of the server's ``/metrics`` endpoint.

    Each test launches its own server process with ``popen_launch_server``
    and always tears it down in a ``finally`` block, so a failing assertion
    does not leak the child process.
    """

    # Upper bound (seconds) for every HTTP call made by these tests.
    # ``requests`` applies no timeout by default, so without this a wedged
    # server would hang the test run instead of failing it.
    REQUEST_TIMEOUT = 120

    def _get(self, path):
        """Issue a GET against the test server with a bounded timeout.

        Args:
            path: URL path starting with ``/`` (for example ``"/metrics"``).

        Returns:
            The ``requests.Response`` for the call.
        """
        return requests.get(
            f"{DEFAULT_URL_FOR_TEST}{path}", timeout=self.REQUEST_TIMEOUT
        )

    def test_metrics_enabled(self):
        """Test that metrics endpoint returns data when enabled"""
        # Launch server with metrics enabled
        process = popen_launch_server(
            model=TEST_MODEL,
            base_url=DEFAULT_URL_FOR_TEST,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            enable_metrics=True,
        )

        try:
            # Make a request to generate some metrics
            response = self._get("/health_generate")
            self.assertEqual(response.status_code, 200)

            # Get metrics
            metrics_response = self._get("/metrics")
            self.assertEqual(metrics_response.status_code, 200)
            metrics_content = metrics_response.text

            # Verify essential metrics are present
            essential_metrics = [
                "sglang:prompt_tokens_total",
                "sglang:generation_tokens_total",
                "sglang:max_total_num_tokens",
                "sglang:context_len",
                "sglang:time_to_first_token_seconds",
                "sglang:time_per_output_token_seconds",
                "sglang:e2e_request_latency_seconds",
            ]

            for metric in essential_metrics:
                self.assertIn(metric, metrics_content, f"Missing metric: {metric}")

            # Verify model name label is present and correct
            expected_model_name = TEST_MODEL
            self.assertIn(f'model_name="{expected_model_name}"', metrics_content)

            # Verify metrics have values (not empty): histogram series expose
            # _sum, _count and _bucket samples once something was observed.
            self.assertIn("_sum{", metrics_content)
            self.assertIn("_count{", metrics_content)
            self.assertIn("_bucket{", metrics_content)

        finally:
            kill_child_process(process.pid, include_self=True)

    def test_metrics_disabled(self):
        """Test that metrics endpoint returns 404 when disabled"""
        # Launch server with metrics disabled
        process = popen_launch_server(
            model=TEST_MODEL,
            base_url=DEFAULT_URL_FOR_TEST,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            enable_metrics=False,
        )

        try:
            # The server itself must be healthy, so that the 404 below is
            # attributable to metrics being off rather than a dead server.
            response = self._get("/health_generate")
            self.assertEqual(response.status_code, 200)

            # Verify metrics endpoint is not available
            metrics_response = self._get("/metrics")
            self.assertEqual(metrics_response.status_code, 404)

        finally:
            kill_child_process(process.pid, include_self=True)
|
||||
Reference in New Issue
Block a user