Fix metrics (#1963)

This commit is contained in:
Yudi Xue
2024-11-08 23:21:11 -08:00
committed by GitHub
parent d1150e9a00
commit 95a4ed129a
6 changed files with 142 additions and 8 deletions

View File

@@ -17,7 +17,7 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
[project.optional-dependencies]
runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
"orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
"orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
"torchao", "uvicorn", "uvloop", "zmq",
"outlines>=0.0.44", "modelscope"]
srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]

View File

@@ -213,19 +213,67 @@ class Metrics:
name="sglang:e2e_request_latency_seconds",
documentation="Histogram of End-to-end request latency in seconds",
labelnames=labelnames,
buckets=build_1_2_5_buckets(max_model_len),
buckets=[
0.3,
0.5,
0.8,
1.0,
1.5,
2.0,
2.5,
5.0,
10.0,
15.0,
20.0,
30.0,
40.0,
50.0,
60.0,
],
)
self.histogram_time_waiting_requests = Histogram(
name="sglang:waiting_request_latency_seconds",
documentation="Histogram of request waiting time in seconds",
labelnames=labelnames,
buckets=build_1_2_5_buckets(max_model_len),
buckets=[
0.3,
0.5,
0.8,
1.0,
1.5,
2.0,
2.5,
5.0,
10.0,
15.0,
20.0,
30.0,
40.0,
50.0,
60.0,
],
)
self.histogram_time_decode_requests = Histogram(
name="sglang:decode_request_latency_seconds",
documentation="Histogram of request decoding time in seconds",
labelnames=labelnames,
buckets=build_1_2_5_buckets(max_model_len),
buckets=[
0.3,
0.5,
0.8,
1.0,
1.5,
2.0,
2.5,
5.0,
10.0,
15.0,
20.0,
30.0,
40.0,
50.0,
60.0,
],
)

View File

@@ -34,15 +34,12 @@ class Stats:
num_running_req: int = 0
num_waiting_req: int = 0
gen_throughput: float = 0.0
num_token: int = 0
token_usage: float = 0.0
waiting_queue: int = 0
time_e2e_requests: List[float] = field(default_factory=list)
time_waiting_requests: List[float] = field(default_factory=list)
time_decode_requests: List[float] = field(default_factory=list)
# system stats
token_usage: float = 0.0
is_mixed_chunk: bool = False
new_seq: int = 0
new_token: int = 0
cached_token: int = 0

View File

@@ -446,6 +446,9 @@ def launch_server(
2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
"""
if server_args.enable_metrics:
_set_prometheus_env()
launch_engine(server_args=server_args)
# Add api key authorization
@@ -454,7 +457,6 @@ def launch_server(
# add prometheus middleware
if server_args.enable_metrics:
_set_prometheus_env()
add_prometheus_middleware(app)
# Send a warmup request

View File

@@ -404,6 +404,7 @@ def popen_launch_server(
other_args: tuple = (),
env: Optional[dict] = None,
return_stdout_stderr: Optional[tuple] = None,
enable_metrics: bool = False,
):
_, host, port = base_url.split(":")
host = host[2:]
@@ -422,6 +423,8 @@ def popen_launch_server(
]
if api_key:
command += ["--api-key", api_key]
if enable_metrics:
command += ["--enable-metrics"]
if return_stdout_stderr:
process = subprocess.Popen(