Simplify prometheus metrics (#1981)

Co-authored-by: Mohit Reddy <mohitreddy1996@users.noreply.github.com>
This commit is contained in:
Lianmin Zheng
2024-11-10 04:39:32 -08:00
committed by GitHub
parent ed53ac84b4
commit 1929c06762
11 changed files with 483 additions and 632 deletions

View File

@@ -0,0 +1,211 @@
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""Utilities for Prometheus Metrics Collection."""
from dataclasses import dataclass
from typing import Dict, Union
@dataclass
class SchedulerStats:
    """A snapshot of scheduler state, consumed by SchedulerMetricsCollector.log_stats."""

    # Requests currently being decoded.
    num_running_reqs: int = 0
    # KV-cache tokens currently occupied.
    num_used_tokens: int = 0
    # Fraction of the token pool in use.
    token_usage: float = 0.0
    # Generation throughput in tokens per second.
    gen_throughput: float = 0.0
    # Requests still waiting to be scheduled.
    num_queue_reqs: int = 0
    # Prefix-cache hit rate.
    cache_hit_rate: float = 0.0
class SchedulerMetricsCollector:
    """Exports SchedulerStats fields as Prometheus gauges, one gauge per field."""

    def __init__(self, labels: Dict[str, str]) -> None:
        # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
        from prometheus_client import Gauge

        self.labels = labels

        def make_gauge(name: str, documentation: str, mode: str):
            # Every gauge shares this collector's label set; `mode` selects how
            # values from multiple processes are aggregated.
            return Gauge(
                name=name,
                documentation=documentation,
                labelnames=labels.keys(),
                multiprocess_mode=mode,
            )

        self.num_running_reqs = make_gauge(
            "sglang:num_running_reqs", "The number of running requests", "sum"
        )
        self.num_used_tokens = make_gauge(
            "sglang:num_used_tokens", "The number of used tokens", "sum"
        )
        self.token_usage = make_gauge(
            "sglang:token_usage", "The token usage", "mostrecent"
        )
        self.gen_throughput = make_gauge(
            "sglang:gen_throughput", "The generate throughput (token/s)", "sum"
        )
        self.num_queue_reqs = make_gauge(
            "sglang:num_queue_reqs", "The number of requests in the waiting queue", "sum"
        )
        self.cache_hit_rate = make_gauge(
            "sglang:cache_hit_rate", "The cache hit rate", "mostrecent"
        )

    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        """Set `gauge` to `data` under this collector's label values."""
        gauge.labels(**self.labels).set(data)

    def log_stats(self, stats: SchedulerStats) -> None:
        """Publish every field of `stats` to its corresponding gauge."""
        for gauge, value in (
            (self.num_running_reqs, stats.num_running_reqs),
            (self.num_used_tokens, stats.num_used_tokens),
            (self.token_usage, stats.token_usage),
            (self.gen_throughput, stats.gen_throughput),
            (self.num_queue_reqs, stats.num_queue_reqs),
            (self.cache_hit_rate, stats.cache_hit_rate),
        ):
            self._log_gauge(gauge, value)
class TokenizerMetricsCollector:
    """Exports per-request token counts and latency histograms to Prometheus."""

    def __init__(self, labels: Dict[str, str]) -> None:
        # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
        from prometheus_client import Counter, Histogram

        self.labels = labels
        label_names = labels.keys()

        # Cumulative token counters.
        self.prompt_tokens_total = Counter(
            name="sglang:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=label_names,
        )
        self.generation_tokens_total = Counter(
            name="sglang:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=label_names,
        )

        def make_histogram(name: str, documentation: str, buckets):
            # All histograms share the collector's label set.
            return Histogram(
                name=name,
                documentation=documentation,
                labelnames=label_names,
                buckets=buckets,
            )

        # Fine-grained low end to resolve fast first tokens, coarse tail for slow ones.
        self.histogram_time_to_first_token = make_histogram(
            "sglang:time_to_first_token_seconds",
            "Histogram of time to first token in seconds.",
            [
                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
                0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0, 25.0, 30.0,
            ],
        )
        self.histogram_time_per_output_token = make_histogram(
            "sglang:time_per_output_token_seconds",
            "Histogram of time per output token in seconds.",
            [
                0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075,
                0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5,
            ],
        )
        self.histogram_e2e_request_latency = make_histogram(
            "sglang:e2e_request_latency_seconds",
            "Histogram of End-to-end request latency in seconds",
            [
                0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
                20.0, 30.0, 40.0, 50.0, 60.0,
            ],
        )

    def _log_histogram(self, histogram, data: Union[int, float]) -> None:
        """Record one observation under this collector's label values."""
        histogram.labels(**self.labels).observe(data)

    def _log_counter(self, counter, data: Union[int, float]) -> None:
        """Increment `counter` by `data` under this collector's label values."""
        counter.labels(**self.labels).inc(data)

    def inc_prompt_tokens(self, value: int):
        self._log_counter(self.prompt_tokens_total, value)

    def inc_generation_tokens(self, value: int):
        self._log_counter(self.generation_tokens_total, value)

    def observe_time_to_first_token(self, value: Union[float, int]):
        self._log_histogram(self.histogram_time_to_first_token, value)

    def observe_time_per_output_token(self, value: Union[float, int]):
        self._log_histogram(self.histogram_time_per_output_token, value)

    def observe_e2e_request_latency(self, value: Union[float, int]):
        self._log_histogram(self.histogram_e2e_request_latency, value)