Simplify prometheus metrics (#1981)

Co-authored-by: Mohit Reddy <mohitreddy1996@users.noreply.github.com>
This commit is contained in:
Lianmin Zheng
2024-11-10 04:39:32 -08:00
committed by GitHub
parent ed53ac84b4
commit 1929c06762
11 changed files with 483 additions and 632 deletions

View File

@@ -0,0 +1,211 @@
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""Utilities for Prometheus Metrics Collection."""
from dataclasses import dataclass
from typing import Dict, Union
@dataclass
class SchedulerStats:
    """Point-in-time scheduler metrics; each field maps 1:1 to a gauge
    in SchedulerMetricsCollector."""

    # The number of running requests.
    num_running_reqs: int = 0
    # The number of used tokens.
    num_used_tokens: int = 0
    # The token usage (a ratio; presumably 0.0-1.0 — TODO confirm range).
    token_usage: float = 0.0
    # The generate throughput (token/s).
    gen_throughput: float = 0.0
    # The number of requests in the waiting queue.
    num_queue_reqs: int = 0
    # The cache hit rate (prefix cache, per the gauge documentation).
    cache_hit_rate: float = 0.0
class SchedulerMetricsCollector:
    """Publishes SchedulerStats snapshots to Prometheus gauges.

    Additive quantities use multiprocess_mode="sum" so values aggregate
    across scheduler processes; ratios use "mostrecent" so the latest
    sample wins.
    """

    def __init__(self, labels: Dict[str, str]) -> None:
        """Create one gauge per SchedulerStats field.

        Args:
            labels: static label key/value pairs applied to every sample.
        """
        # We need to import prometheus_client after setting the env variable
        # `PROMETHEUS_MULTIPROC_DIR`
        from prometheus_client import Gauge

        self.labels = labels

        # All six gauges differ only in name/documentation/mode, so build
        # them through one local helper instead of repeating the constructor.
        def make_gauge(name: str, documentation: str, multiprocess_mode: str):
            return Gauge(
                name=name,
                documentation=documentation,
                labelnames=labels.keys(),
                multiprocess_mode=multiprocess_mode,
            )

        self.num_running_reqs = make_gauge(
            "sglang:num_running_reqs", "The number of running requests", "sum"
        )
        self.num_used_tokens = make_gauge(
            "sglang:num_used_tokens", "The number of used tokens", "sum"
        )
        self.token_usage = make_gauge(
            "sglang:token_usage", "The token usage", "mostrecent"
        )
        self.gen_throughput = make_gauge(
            "sglang:gen_throughput", "The generate throughput (token/s)", "sum"
        )
        self.num_queue_reqs = make_gauge(
            "sglang:num_queue_reqs",
            "The number of requests in the waiting queue",
            "sum",
        )
        self.cache_hit_rate = make_gauge(
            "sglang:cache_hit_rate", "The cache hit rate", "mostrecent"
        )

    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        # Convenience function for logging to gauge.
        gauge.labels(**self.labels).set(data)

    def log_stats(self, stats: SchedulerStats) -> None:
        """Push every field of `stats` to its corresponding gauge."""
        self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
        self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
        self._log_gauge(self.token_usage, stats.token_usage)
        self._log_gauge(self.gen_throughput, stats.gen_throughput)
        self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
        self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
class TokenizerMetricsCollector:
    """Tokenizer-side Prometheus metrics: total-token counters plus latency
    histograms (time-to-first-token, time-per-output-token, end-to-end)."""

    def __init__(self, labels: Dict[str, str]) -> None:
        """Build counters and histograms sharing one static label set.

        Args:
            labels: static label key/value pairs applied to every sample.
        """
        # We need to import prometheus_client after setting the env variable
        # `PROMETHEUS_MULTIPROC_DIR`
        from prometheus_client import Counter, Histogram

        self.labels = labels
        label_keys = labels.keys()

        # Bucket boundaries, named so each histogram below stays compact.
        ttft_buckets = [
            0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
            0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0, 25.0, 30.0,
        ]
        tpot_buckets = [
            0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075,
            0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5,
        ]
        e2e_buckets = [
            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
            20.0, 30.0, 40.0, 50.0, 60.0,
        ]

        self.prompt_tokens_total = Counter(
            name="sglang:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=label_keys,
        )
        self.generation_tokens_total = Counter(
            name="sglang:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=label_keys,
        )
        self.histogram_time_to_first_token = Histogram(
            name="sglang:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=label_keys,
            buckets=ttft_buckets,
        )
        self.histogram_time_per_output_token = Histogram(
            name="sglang:time_per_output_token_seconds",
            documentation="Histogram of time per output token in seconds.",
            labelnames=label_keys,
            buckets=tpot_buckets,
        )
        self.histogram_e2e_request_latency = Histogram(
            name="sglang:e2e_request_latency_seconds",
            documentation="Histogram of End-to-end request latency in seconds",
            labelnames=label_keys,
            buckets=e2e_buckets,
        )

    def _log_histogram(self, histogram, data: Union[int, float]) -> None:
        # Apply this collector's labels, then record one observation.
        histogram.labels(**self.labels).observe(data)

    def _log_counter(self, counter, data: Union[int, float]) -> None:
        # Apply this collector's labels, then increment.
        counter.labels(**self.labels).inc(data)

    def inc_prompt_tokens(self, value: int):
        self._log_counter(self.prompt_tokens_total, value)

    def inc_generation_tokens(self, value: int):
        self._log_counter(self.generation_tokens_total, value)

    def observe_time_to_first_token(self, value: Union[float, int]):
        self._log_histogram(self.histogram_time_to_first_token, value)

    def observe_time_per_output_token(self, value: Union[float, int]):
        self._log_histogram(self.histogram_time_per_output_token, value)

    def observe_e2e_request_latency(self, value: Union[float, int]):
        self._log_histogram(self.histogram_e2e_request_latency, value)

View File

@@ -0,0 +1,108 @@
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""
Records the latency of some functions
"""
import asyncio
import time
from functools import wraps
from typing import Any, Callable, List, Optional
# Global switch read by the timing wrappers; flipped on by enable_func_timer().
enable_metrics = False


def enable_func_timer():
    """Create the global FUNC_LATENCY histogram and turn latency recording on."""
    # We need to import prometheus_client after setting the env variable
    # `PROMETHEUS_MULTIPROC_DIR`
    from prometheus_client import Histogram

    global enable_metrics, FUNC_LATENCY
    enable_metrics = True

    FUNC_LATENCY = Histogram(
        "sglang:func_latency_seconds",
        "Function latency in seconds",
        # captures latency in range [50ms - ~50s]
        buckets=exponential_buckets(start=0.05, width=1.5, length=18),
        labelnames=["name"],
    )


# Populated by enable_func_timer(); stays None while metrics are disabled.
FUNC_LATENCY = None
def exponential_buckets(start: float, width: float, length: int) -> List[float]:
    """Return `length` geometrically growing bucket bounds: start * width**i."""
    return [start * (width**i) for i in range(length)]
def time_func_latency(
    func: Callable = None, name: Optional[str] = None
) -> Callable[..., Any]:
    """
    A decorator to observe the latency of a function's execution. Supports both
    sync and async functions. Usable bare (`@time_func_latency`) or with an
    explicit metric name (`@time_func_latency(name="...")`).

    NOTE: We use our own implementation of a timer decorator since
    prometheus_client does not support async context manager yet.

    Overhead: The overhead introduced here in case of an async function could
    likely be because of `await` introduced which will return in another
    coroutine object creation and under heavy load could see longer wall time
    (scheduling delays due to introduction of another awaitable).
    """

    def measure(func: Callable[..., Any]) -> Callable[..., Any]:
        # Resolve the metric label once, at decoration time.
        nonlocal name
        name = name or func.__name__

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            # Fast path: metrics disabled, no timing overhead.
            if not enable_metrics:
                return await func(*args, **kwargs)

            metric = FUNC_LATENCY
            start = time.monotonic()
            ret = func(*args, **kwargs)
            if isinstance(ret, asyncio.Future) or asyncio.iscoroutine(ret):
                try:
                    ret = await ret
                finally:
                    # Record even if the awaited call raises.
                    metric.labels(name=name).observe(time.monotonic() - start)
            # NOTE(review): when `ret` is not awaitable, no observation is
            # made — looks intentional for functions that only sometimes
            # return coroutines, but confirm.
            return ret

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            if not enable_metrics:
                return func(*args, **kwargs)

            metric = FUNC_LATENCY
            start = time.monotonic()
            try:
                ret = func(*args, **kwargs)
            finally:
                # Record even if the call raises.
                metric.labels(name=name).observe(time.monotonic() - start)
            return ret

        # Pick the wrapper matching the decorated function's flavor.
        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        return sync_wrapper

    # Bare decorator vs. decorator factory dispatch.
    if func:
        return measure(func)
    else:
        return measure

View File

@@ -1,388 +0,0 @@
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""Utilities for Prometheus Metrics Collection."""
import logging
from abc import ABC, abstractmethod
from typing import Counter as CollectionsCounter
from typing import Dict, List, Union
import numpy as np
from prometheus_client import Counter, Gauge, Histogram
from sglang.srt.metrics.metrics_types import Stats
class Metrics:
    """
    SGLang Metrics

    Declares every Prometheus gauge/counter/histogram used by
    PrometheusMetricsCollector. `labelnames` is applied to all metrics;
    `max_model_len` sizes the request token-count histogram buckets.
    """

    def __init__(self, labelnames: List[str], max_model_len: int):
        # Configuration Stats
        self.max_total_num_tokens = Gauge(
            name="sglang:max_total_num_tokens",
            documentation="Maximum total number of tokens",
            labelnames=labelnames,
            multiprocess_mode="min",
        )  # static across processes

        self.max_prefill_tokens = Gauge(
            name="sglang:max_prefill_tokens",
            documentation="Maximum prefill tokens",
            labelnames=labelnames,
            multiprocess_mode="min",
        )  # static across processes

        self.max_running_requests = Gauge(
            name="sglang:max_running_requests",
            documentation="Maximum running requests",
            labelnames=labelnames,
            multiprocess_mode="min",
        )  # static across processes

        self.context_len = Gauge(
            name="sglang:context_len",
            documentation="Context length",
            labelnames=labelnames,
            multiprocess_mode="min",
        )  # static across processes

        # Decode Stats
        self.num_running_sys = Gauge(
            name="sglang:num_requests_running",
            documentation="Number of requests currently running on GPU",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.num_waiting_sys = Gauge(
            name="sglang:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.gen_throughput = Gauge(
            name="sglang:gen_throughput",
            documentation="Gen token throughput (token/s)",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.token_usage = Gauge(
            name="sglang:token_usage",
            documentation="Total token usage",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )

        # System Stats
        # KV Cache Usage in %
        # self.gpu_cache_usage_sys = Gauge(
        #     "gpu_cache_usage_perc",
        #     "GPU KV-cache usage. 1 means 100 percent usage.",
        #     labelnames=labelnames,
        #     multiprocess_mode="sum")
        self.new_seq = Gauge(
            name="sglang:new_seq",
            documentation="Number of new sequences",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.new_token = Gauge(
            name="sglang:new_token",
            documentation="Number of new token",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        # Prefix caching block hit rate
        self.cached_token = Gauge(
            name="sglang:cached_token",
            documentation="Number of cached token",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.cache_hit_rate = Gauge(
            name="sglang:cache_hit_rate",
            documentation="Cache hit rate",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.queue_req = Gauge(
            name="sglang:queue_req",
            documentation="Number of queued requests",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )

        # Iteration stats
        self.counter_prompt_tokens = Counter(
            name="sglang:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames,
        )
        self.counter_generation_tokens = Counter(
            name="sglang:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames,
        )
        self.histogram_time_to_first_token = Histogram(
            name="sglang:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.001,
                0.005,
                0.01,
                0.02,
                0.04,
                0.06,
                0.08,
                0.1,
                0.25,
                0.5,
                0.75,
                1.0,
                2.5,
                5.0,
                7.5,
                10.0,
                15.0,
                20.0,
                25.0,
                30.0,
            ],
        )
        self.histogram_time_per_output_token = Histogram(
            name="sglang:time_per_output_token_seconds",
            documentation="Histogram of time per output token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.005,
                0.01,
                0.015,
                0.02,
                0.025,
                0.03,
                0.04,
                0.05,
                0.075,
                0.1,
                0.15,
                0.2,
                0.3,
                0.4,
                0.5,
                0.75,
                1.0,
                2.5,
            ],
        )

        # Request Stats
        # Metadata
        # Bucket bounds scale with the model's maximum context length.
        self.num_prompt_tokens_requests = Histogram(
            name="sglang:request_prompt_tokens",
            documentation="Number of prefill tokens processed",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.num_generation_tokens_requests = Histogram(
            name="sglang:request_generation_tokens",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        # Extra "finished_reason" label beyond the shared label set.
        self.finished_reason_requests = Counter(
            name="sglang:request_success_total",
            documentation="Count of successfully processed requests.",
            labelnames=labelnames + ["finished_reason"],
        )
        self.histogram_time_e2e_requests = Histogram(
            name="sglang:e2e_request_latency_seconds",
            documentation="Histogram of End-to-end request latency in seconds",
            labelnames=labelnames,
            buckets=[
                0.3,
                0.5,
                0.8,
                1.0,
                1.5,
                2.0,
                2.5,
                5.0,
                10.0,
                15.0,
                20.0,
                30.0,
                40.0,
                50.0,
                60.0,
            ],
        )
        self.histogram_time_waiting_requests = Histogram(
            name="sglang:waiting_request_latency_seconds",
            documentation="Histogram of request waiting time in seconds",
            labelnames=labelnames,
            buckets=[
                0.3,
                0.5,
                0.8,
                1.0,
                1.5,
                2.0,
                2.5,
                5.0,
                10.0,
                15.0,
                20.0,
                30.0,
                40.0,
                50.0,
                60.0,
            ],
        )
        self.histogram_time_decode_requests = Histogram(
            name="sglang:decode_request_latency_seconds",
            documentation="Histogram of request decoding time in seconds",
            labelnames=labelnames,
            buckets=[
                0.3,
                0.5,
                0.8,
                1.0,
                1.5,
                2.0,
                2.5,
                5.0,
                10.0,
                15.0,
                20.0,
                30.0,
                40.0,
                50.0,
                60.0,
            ],
        )
class MetricsCollector(ABC):
    """
    SGLang Metrics Collector

    Abstract interface: implementations publish a Stats snapshot to a
    metrics backend via log_stats().
    """

    @abstractmethod
    def log_stats(self, stats: Stats) -> None:
        # Publish one Stats snapshot; implemented by concrete collectors.
        pass
class PrometheusMetricsCollector(MetricsCollector):
    """Prometheus-backed MetricsCollector: owns a Metrics bundle and pushes
    Stats snapshots into it under a fixed label set."""

    def __init__(self, labels: Dict[str, str], max_model_len: int) -> None:
        """Args:
        labels: static label key/value pairs applied to every sample.
        max_model_len: sizes the request token-count histogram buckets.
        """
        self.labels = labels
        self.metrics = Metrics(
            labelnames=list(labels.keys()), max_model_len=max_model_len
        )

    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        # Apply this collector's labels, then set the gauge value.
        gauge.labels(**self.labels).set(data)

    def _log_counter(self, counter, data: Union[int, float]) -> None:
        # Apply this collector's labels, then increment the counter.
        counter.labels(**self.labels).inc(data)

    def _log_counter_labels(
        self, counter, data: CollectionsCounter, label_key: str
    ) -> None:
        # Fan a Counter-of-labels out into per-label increments.
        for label, count in data.items():
            counter.labels(**{**self.labels, label_key: label}).inc(count)

    def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
        # Record every element of `data` as one histogram observation.
        for datum in data:
            histogram.labels(**self.labels).observe(datum)

    def log_stats(self, stats: Stats) -> None:
        """Push one Stats snapshot into every metric, table-driven."""
        m = self.metrics
        # (kind, metric, value) triples, in the same order as the
        # original call sequence.
        ops = [
            ("gauge", m.max_total_num_tokens, stats.max_total_num_tokens),
            ("gauge", m.max_prefill_tokens, stats.max_prefill_tokens),
            ("gauge", m.max_running_requests, stats.max_running_requests),
            ("gauge", m.context_len, stats.context_len),
            ("hist", m.num_prompt_tokens_requests, stats.num_prompt_tokens_requests),
            (
                "hist",
                m.num_generation_tokens_requests,
                stats.num_generation_tokens_requests,
            ),
            ("counter", m.counter_prompt_tokens, stats.num_prompt_tokens_iter),
            ("counter", m.counter_generation_tokens, stats.num_generation_tokens_iter),
            ("hist", m.histogram_time_to_first_token, stats.time_to_first_tokens_iter),
            (
                "hist",
                m.histogram_time_per_output_token,
                stats.time_per_output_tokens_iter,
            ),
            ("gauge", m.num_running_sys, stats.num_running_req),
            ("gauge", m.num_waiting_sys, stats.num_waiting_req),
            ("gauge", m.gen_throughput, stats.gen_throughput),
            ("gauge", m.token_usage, stats.token_usage),
            ("hist", m.histogram_time_e2e_requests, stats.time_e2e_requests),
            ("hist", m.histogram_time_waiting_requests, stats.time_waiting_requests),
            ("hist", m.histogram_time_decode_requests, stats.time_decode_requests),
            ("gauge", m.new_seq, stats.new_seq),
            ("gauge", m.new_token, stats.new_token),
            ("gauge", m.cached_token, stats.cached_token),
            ("gauge", m.cache_hit_rate, stats.cache_hit_rate),
            ("gauge", m.queue_req, stats.queue_req),
        ]
        for kind, metric, value in ops:
            if kind == "gauge":
                self._log_gauge(metric, value)
            elif kind == "counter":
                self._log_counter(metric, value)
            else:
                self._log_histogram(metric, value)
def build_1_2_5_buckets(max_value: int) -> List[int]:
    """
    Build histogram buckets of 1, 2, 5 times increasing powers of ten,
    stopping as soon as a candidate would exceed `max_value`.

    Example:
    >>> build_1_2_5_buckets(100)
    [1, 2, 5, 10, 20, 50, 100]
    """
    buckets: List[int] = []
    scale = 1
    while True:
        for mantissa in (1, 2, 5):
            candidate = mantissa * scale
            if candidate > max_value:
                return buckets
            buckets.append(candidate)
        scale *= 10

View File

@@ -1,54 +0,0 @@
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""Metrics Types"""
from dataclasses import dataclass, field
from typing import List
@dataclass
class Stats:
    """One aggregated snapshot consumed by MetricsCollector.log_stats().
    List-valued fields hold one entry per request/event since the last log."""

    # config
    max_total_num_tokens: int = 0
    max_prefill_tokens: int = 0
    max_running_requests: int = 0
    context_len: int = 0
    # request stats
    num_prompt_tokens_requests: List[int] = field(default_factory=list)
    num_generation_tokens_requests: List[int] = field(default_factory=list)
    finished_reason_requests: List[str] = field(default_factory=list)
    # decode stats
    num_running_req: int = 0
    num_waiting_req: int = 0
    gen_throughput: float = 0.0
    waiting_queue: int = 0
    time_e2e_requests: List[float] = field(default_factory=list)
    time_waiting_requests: List[float] = field(default_factory=list)
    time_decode_requests: List[float] = field(default_factory=list)
    # system stats
    token_usage: float = 0.0
    new_seq: int = 0
    new_token: int = 0
    cached_token: int = 0
    cache_hit_rate: float = 0.0
    running_req: int = 0
    queue_req: int = 0
    # Iteration stats (should have _iter suffix)
    num_prompt_tokens_iter: int = 0
    num_generation_tokens_iter: int = 0
    time_to_first_tokens_iter: List[float] = field(default_factory=list)
    time_per_output_tokens_iter: List[float] = field(default_factory=list)