support prometheus metrics (#1853)
Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com> Co-authored-by: Byron Hsu <byronhsu1230@gmail.com>
This commit is contained in:
297
python/sglang/srt/metrics/metrics_collector.py
Normal file
297
python/sglang/srt/metrics/metrics_collector.py
Normal file
@@ -0,0 +1,297 @@
|
||||
"""
|
||||
Copyright 2023-2024 SGLang Team
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
"""Utilities for Prometheus Metrics Collection."""
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Counter as CollectionsCounter
|
||||
from typing import Dict, List, Union
|
||||
|
||||
import numpy as np
|
||||
from prometheus_client import Counter, Gauge, Histogram
|
||||
|
||||
from sglang.srt.metrics.metrics_types import Stats
|
||||
|
||||
|
||||
class Metrics:
    """
    Prometheus metric objects for SGLang server statistics.

    One Gauge/Counter/Histogram is created per tracked quantity. All metrics
    share the same ``labelnames`` so multiple server instances can be told
    apart in a shared registry.

    Args:
        labelnames: Label names applied to every metric.
        max_model_len: Maximum model context length; used to size the
            token-count histogram buckets.
    """

    def __init__(self, labelnames: List[str], max_model_len: int):
        # Configuration stats. These are static per process, so the "min"
        # multiprocess mode simply reports the shared configured value.
        self.max_total_num_tokens = Gauge(
            name="sglang:max_total_num_tokens",
            documentation="Maximum total number of tokens",
            labelnames=labelnames,
            multiprocess_mode="min",
        )
        self.max_prefill_tokens = Gauge(
            name="sglang:max_prefill_tokens",
            documentation="Maximum prefill tokens",
            labelnames=labelnames,
            multiprocess_mode="min",
        )
        self.max_running_requests = Gauge(
            name="sglang:max_running_requests",
            documentation="Maximum running requests",
            labelnames=labelnames,
            multiprocess_mode="min",
        )
        self.context_len = Gauge(
            name="sglang:context_len",
            documentation="Context length",
            labelnames=labelnames,
            multiprocess_mode="min",
        )

        # Decode stats. "sum" aggregates the per-process values.
        self.num_running_sys = Gauge(
            name="sglang:num_requests_running",
            documentation="Number of requests currently running on GPU",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.num_waiting_sys = Gauge(
            name="sglang:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.gen_throughput = Gauge(
            name="sglang:gen_throughput",
            documentation="Gen token throughput (token/s)",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.token_usage = Gauge(
            name="sglang:token_usage",
            documentation="Total token usage",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )

        # System stats, including prefix-cache hit tracking.
        self.new_seq = Gauge(
            name="sglang:new_seq",
            documentation="Number of new sequences",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.new_token = Gauge(
            name="sglang:new_token",
            documentation="Number of new token",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.cached_token = Gauge(
            name="sglang:cached_token",
            documentation="Number of cached token",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.cache_hit_rate = Gauge(
            name="sglang:cache_hit_rate",
            documentation="Cache hit rate",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.queue_req = Gauge(
            name="sglang:queue_req",
            documentation="Number of queued requests",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )

        # Iteration stats.
        self.counter_prompt_tokens = Counter(
            name="sglang:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames,
        )
        self.counter_generation_tokens = Counter(
            name="sglang:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames,
        )
        self.histogram_time_to_first_token = Histogram(
            name="sglang:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
                0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0, 25.0, 30.0,
            ],
        )
        self.histogram_time_per_output_token = Histogram(
            name="sglang:time_per_output_token_seconds",
            documentation="Histogram of time per output token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075,
                0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5,
            ],
        )

        # Request stats — metadata histograms bucketed by token count.
        self.num_prompt_tokens_requests = Histogram(
            name="sglang:request_prompt_tokens",
            documentation="Number of prefill tokens processed",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.num_generation_tokens_requests = Histogram(
            name="sglang:request_generation_tokens",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.finished_reason_requests = Counter(
            name="sglang:request_success_total",
            documentation="Count of successfully processed requests.",
            labelnames=labelnames + ["finished_reason"],
        )

        # BUG FIX: these three histograms observe latencies in *seconds*,
        # but were previously bucketed with build_1_2_5_buckets(max_model_len)
        # (token-count buckets up to the context length), which made the
        # distributions useless. Use time-based buckets instead.
        latency_buckets = [
            0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 40.0, 60.0,
            120.0, 240.0, 480.0, 960.0,
        ]
        self.histogram_time_e2e_requests = Histogram(
            name="sglang:e2e_request_latency_seconds",
            documentation="Histogram of End-to-end request latency in seconds",
            labelnames=labelnames,
            buckets=latency_buckets,
        )
        self.histogram_time_waiting_requests = Histogram(
            name="sglang:waiting_request_latency_seconds",
            documentation="Histogram of request waiting time in seconds",
            labelnames=labelnames,
            buckets=latency_buckets,
        )
        self.histogram_time_decode_requests = Histogram(
            name="sglang:decode_request_latency_seconds",
            documentation="Histogram of request decoding time in seconds",
            labelnames=labelnames,
            buckets=latency_buckets,
        )
|
||||
|
||||
|
||||
class MetricsCollector(ABC):
    """Abstract interface for SGLang metrics collectors."""

    @abstractmethod
    def log_stats(self, stats: Stats) -> None:
        """Record one snapshot of server statistics. Subclasses must implement."""
|
||||
|
||||
|
||||
class PrometheusMetricsCollector(MetricsCollector):
    """
    Metrics collector that publishes server statistics through
    prometheus_client, labeling every metric with a fixed label set.
    """

    def __init__(self, labels: Dict[str, str], max_model_len: int) -> None:
        self.labels = labels
        self.metrics = Metrics(
            labelnames=list(labels.keys()), max_model_len=max_model_len
        )

    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        """Set ``gauge`` to ``data`` under this collector's labels."""
        gauge.labels(**self.labels).set(data)

    def _log_counter(self, counter, data: Union[int, float]) -> None:
        """Increment ``counter`` by ``data`` under this collector's labels."""
        counter.labels(**self.labels).inc(data)

    def _log_counter_labels(
        self, counter, data: CollectionsCounter, label_key: str
    ) -> None:
        """Increment ``counter`` per entry in ``data``, adding ``label_key`` as an extra label."""
        for label_value, count in data.items():
            counter.labels(**{**self.labels, label_key: label_value}).inc(count)

    def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
        """Observe every value of ``data`` on ``histogram``."""
        # Resolve the labeled child once; observe each value on it.
        observe = histogram.labels(**self.labels).observe
        for value in data:
            observe(value)

    def log_stats(self, stats: Stats) -> None:
        """Publish one Stats snapshot to all Prometheus metrics."""
        m = self.metrics

        # Configuration stats.
        self._log_gauge(m.max_total_num_tokens, stats.max_total_num_tokens)
        self._log_gauge(m.max_prefill_tokens, stats.max_prefill_tokens)
        self._log_gauge(m.max_running_requests, stats.max_running_requests)
        self._log_gauge(m.context_len, stats.context_len)

        # Request stats.
        self._log_histogram(
            m.num_prompt_tokens_requests, stats.num_prompt_tokens_requests
        )
        self._log_histogram(
            m.num_generation_tokens_requests, stats.num_generation_tokens_requests
        )

        # Iteration stats.
        self._log_counter(m.counter_prompt_tokens, stats.num_prompt_tokens_iter)
        self._log_counter(m.counter_generation_tokens, stats.num_generation_tokens_iter)
        self._log_histogram(
            m.histogram_time_to_first_token, stats.time_to_first_tokens_iter
        )
        self._log_histogram(
            m.histogram_time_per_output_token, stats.time_per_output_tokens_iter
        )

        # Decode / system stats.
        self._log_gauge(m.num_running_sys, stats.num_running_req)
        self._log_gauge(m.num_waiting_sys, stats.num_waiting_req)
        self._log_gauge(m.gen_throughput, stats.gen_throughput)
        self._log_gauge(m.token_usage, stats.token_usage)
        self._log_histogram(m.histogram_time_e2e_requests, stats.time_e2e_requests)
        self._log_histogram(
            m.histogram_time_waiting_requests, stats.time_waiting_requests
        )
        self._log_histogram(
            m.histogram_time_decode_requests, stats.time_decode_requests
        )
        self._log_gauge(m.new_seq, stats.new_seq)
        self._log_gauge(m.new_token, stats.new_token)
        self._log_gauge(m.cached_token, stats.cached_token)
        self._log_gauge(m.cache_hit_rate, stats.cache_hit_rate)
        self._log_gauge(m.queue_req, stats.queue_req)
|
||||
|
||||
|
||||
def build_1_2_5_buckets(max_value: int) -> List[int]:
    """
    Build histogram buckets of 1, 2, 5 times increasing powers of 10,
    stopping once a candidate would exceed ``max_value``.

    Example:
        >>> build_1_2_5_buckets(100)
        [1, 2, 5, 10, 20, 50, 100]
    """
    buckets: List[int] = []
    exponent = 0
    while True:
        for mantissa in (1, 2, 5):
            candidate = mantissa * 10**exponent
            # First candidate above the cap ends the sequence.
            if candidate > max_value:
                return buckets
            buckets.append(candidate)
        exponent += 1
|
||||
57
python/sglang/srt/metrics/metrics_types.py
Normal file
57
python/sglang/srt/metrics/metrics_types.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""
|
||||
Copyright 2023-2024 SGLang Team
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
"""Metrics Types"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
class Stats:
    """Snapshot of server statistics handed to metrics collectors.

    Fields are grouped into config (static), per-request, decode,
    system, and per-iteration (``_iter`` suffix) statistics. All list
    fields use ``default_factory`` so instances never share state.
    """

    # config — static for the lifetime of the server
    max_total_num_tokens: int = 0
    max_prefill_tokens: int = 0
    max_running_requests: int = 0
    context_len: int = 0
    # request stats — one entry per finished request
    num_prompt_tokens_requests: List[int] = field(default_factory=list)
    num_generation_tokens_requests: List[int] = field(default_factory=list)
    finished_reason_requests: List[str] = field(default_factory=list)
    # decode stats
    num_running_req: int = 0
    num_waiting_req: int = 0
    gen_throughput: float = 0.0
    num_token: int = 0
    token_usage: float = 0.0
    waiting_queue: int = 0
    time_e2e_requests: List[float] = field(default_factory=list)
    time_waiting_requests: List[float] = field(default_factory=list)
    time_decode_requests: List[float] = field(default_factory=list)
    # system stats
    # NOTE: ``token_usage`` was originally declared a second time here; the
    # duplicate is removed. Dataclasses keep the first annotation's position,
    # so field order and the generated __init__ are unchanged.
    is_mixed_chunk: bool = False
    new_seq: int = 0
    new_token: int = 0
    cached_token: int = 0
    cache_hit_rate: float = 0.0
    running_req: int = 0
    queue_req: int = 0

    # Iteration stats (should have _iter suffix)
    num_prompt_tokens_iter: int = 0
    num_generation_tokens_iter: int = 0
    time_to_first_tokens_iter: List[float] = field(default_factory=list)
    time_per_output_tokens_iter: List[float] = field(default_factory=list)
|
||||
Reference in New Issue
Block a user