Simplify prometheus metrics (#1981)

Co-authored-by: Mohit Reddy <mohitreddy1996@users.noreply.github.com>
This commit is contained in:
Lianmin Zheng
2024-11-10 04:39:32 -08:00
committed by GitHub
parent ed53ac84b4
commit 1929c06762
11 changed files with 483 additions and 632 deletions

View File

@@ -0,0 +1,211 @@
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""Utilities for Prometheus Metrics Collection."""
from dataclasses import dataclass
from typing import Dict, Union
@dataclass
class SchedulerStats:
    """Point-in-time scheduler metrics; each field maps 1:1 to a gauge
    in SchedulerMetricsCollector."""

    # The number of running requests.
    num_running_reqs: int = 0
    # The number of used tokens.
    num_used_tokens: int = 0
    # The token usage (a ratio; presumably 0.0-1.0 — TODO confirm range).
    token_usage: float = 0.0
    # The generate throughput (token/s).
    gen_throughput: float = 0.0
    # The number of requests in the waiting queue.
    num_queue_reqs: int = 0
    # The cache hit rate (prefix cache, per the gauge documentation).
    cache_hit_rate: float = 0.0
class SchedulerMetricsCollector:
    """Publishes SchedulerStats snapshots to Prometheus gauges.

    Additive quantities use multiprocess_mode="sum" so values aggregate
    across scheduler processes; ratios use "mostrecent" so the latest
    sample wins.
    """

    def __init__(self, labels: Dict[str, str]) -> None:
        """Create one gauge per SchedulerStats field.

        Args:
            labels: static label key/value pairs applied to every sample.
        """
        # We need to import prometheus_client after setting the env variable
        # `PROMETHEUS_MULTIPROC_DIR`
        from prometheus_client import Gauge

        self.labels = labels

        # All six gauges differ only in name/documentation/mode, so build
        # them through one local helper instead of repeating the constructor.
        def make_gauge(name: str, documentation: str, multiprocess_mode: str):
            return Gauge(
                name=name,
                documentation=documentation,
                labelnames=labels.keys(),
                multiprocess_mode=multiprocess_mode,
            )

        self.num_running_reqs = make_gauge(
            "sglang:num_running_reqs", "The number of running requests", "sum"
        )
        self.num_used_tokens = make_gauge(
            "sglang:num_used_tokens", "The number of used tokens", "sum"
        )
        self.token_usage = make_gauge(
            "sglang:token_usage", "The token usage", "mostrecent"
        )
        self.gen_throughput = make_gauge(
            "sglang:gen_throughput", "The generate throughput (token/s)", "sum"
        )
        self.num_queue_reqs = make_gauge(
            "sglang:num_queue_reqs",
            "The number of requests in the waiting queue",
            "sum",
        )
        self.cache_hit_rate = make_gauge(
            "sglang:cache_hit_rate", "The cache hit rate", "mostrecent"
        )

    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        # Convenience function for logging to gauge.
        gauge.labels(**self.labels).set(data)

    def log_stats(self, stats: SchedulerStats) -> None:
        """Push every field of `stats` to its corresponding gauge."""
        self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
        self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
        self._log_gauge(self.token_usage, stats.token_usage)
        self._log_gauge(self.gen_throughput, stats.gen_throughput)
        self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
        self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
class TokenizerMetricsCollector:
    """Tokenizer-side Prometheus metrics: total-token counters plus latency
    histograms (time-to-first-token, time-per-output-token, end-to-end)."""

    def __init__(self, labels: Dict[str, str]) -> None:
        """Build counters and histograms sharing one static label set.

        Args:
            labels: static label key/value pairs applied to every sample.
        """
        # We need to import prometheus_client after setting the env variable
        # `PROMETHEUS_MULTIPROC_DIR`
        from prometheus_client import Counter, Histogram

        self.labels = labels
        label_keys = labels.keys()

        # Bucket boundaries, named so each histogram below stays compact.
        ttft_buckets = [
            0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
            0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0, 25.0, 30.0,
        ]
        tpot_buckets = [
            0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075,
            0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5,
        ]
        e2e_buckets = [
            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
            20.0, 30.0, 40.0, 50.0, 60.0,
        ]

        self.prompt_tokens_total = Counter(
            name="sglang:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=label_keys,
        )
        self.generation_tokens_total = Counter(
            name="sglang:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=label_keys,
        )
        self.histogram_time_to_first_token = Histogram(
            name="sglang:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=label_keys,
            buckets=ttft_buckets,
        )
        self.histogram_time_per_output_token = Histogram(
            name="sglang:time_per_output_token_seconds",
            documentation="Histogram of time per output token in seconds.",
            labelnames=label_keys,
            buckets=tpot_buckets,
        )
        self.histogram_e2e_request_latency = Histogram(
            name="sglang:e2e_request_latency_seconds",
            documentation="Histogram of End-to-end request latency in seconds",
            labelnames=label_keys,
            buckets=e2e_buckets,
        )

    def _log_histogram(self, histogram, data: Union[int, float]) -> None:
        # Apply this collector's labels, then record one observation.
        histogram.labels(**self.labels).observe(data)

    def _log_counter(self, counter, data: Union[int, float]) -> None:
        # Apply this collector's labels, then increment.
        counter.labels(**self.labels).inc(data)

    def inc_prompt_tokens(self, value: int):
        self._log_counter(self.prompt_tokens_total, value)

    def inc_generation_tokens(self, value: int):
        self._log_counter(self.generation_tokens_total, value)

    def observe_time_to_first_token(self, value: Union[float, int]):
        self._log_histogram(self.histogram_time_to_first_token, value)

    def observe_time_per_output_token(self, value: Union[float, int]):
        self._log_histogram(self.histogram_time_per_output_token, value)

    def observe_e2e_request_latency(self, value: Union[float, int]):
        self._log_histogram(self.histogram_e2e_request_latency, value)

View File

@@ -0,0 +1,108 @@
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""
Records the latency of some functions
"""
import asyncio
import time
from functools import wraps
from typing import Any, Callable, List, Optional
# Global switch read by the timing wrappers; flipped on by enable_func_timer().
enable_metrics = False


def enable_func_timer():
    """Create the global FUNC_LATENCY histogram and turn latency recording on."""
    # We need to import prometheus_client after setting the env variable
    # `PROMETHEUS_MULTIPROC_DIR`
    from prometheus_client import Histogram

    global enable_metrics, FUNC_LATENCY
    enable_metrics = True

    FUNC_LATENCY = Histogram(
        "sglang:func_latency_seconds",
        "Function latency in seconds",
        # captures latency in range [50ms - ~50s]
        buckets=exponential_buckets(start=0.05, width=1.5, length=18),
        labelnames=["name"],
    )


# Populated by enable_func_timer(); stays None while metrics are disabled.
FUNC_LATENCY = None
def exponential_buckets(start: float, width: float, length: int) -> List[float]:
    """Return `length` geometrically growing bucket bounds: start * width**i."""
    return [start * (width**i) for i in range(length)]
def time_func_latency(
    func: Callable = None, name: Optional[str] = None
) -> Callable[..., Any]:
    """
    A decorator to observe the latency of a function's execution. Supports both
    sync and async functions. Usable bare (`@time_func_latency`) or with an
    explicit metric name (`@time_func_latency(name="...")`).

    NOTE: We use our own implementation of a timer decorator since
    prometheus_client does not support async context manager yet.

    Overhead: The overhead introduced here in case of an async function could
    likely be because of `await` introduced which will return in another
    coroutine object creation and under heavy load could see longer wall time
    (scheduling delays due to introduction of another awaitable).
    """

    def measure(func: Callable[..., Any]) -> Callable[..., Any]:
        # Resolve the metric label once, at decoration time.
        nonlocal name
        name = name or func.__name__

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            # Fast path: metrics disabled, no timing overhead.
            if not enable_metrics:
                return await func(*args, **kwargs)

            metric = FUNC_LATENCY
            start = time.monotonic()
            ret = func(*args, **kwargs)
            if isinstance(ret, asyncio.Future) or asyncio.iscoroutine(ret):
                try:
                    ret = await ret
                finally:
                    # Record even if the awaited call raises.
                    metric.labels(name=name).observe(time.monotonic() - start)
            # NOTE(review): when `ret` is not awaitable, no observation is
            # made — looks intentional for functions that only sometimes
            # return coroutines, but confirm.
            return ret

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            if not enable_metrics:
                return func(*args, **kwargs)

            metric = FUNC_LATENCY
            start = time.monotonic()
            try:
                ret = func(*args, **kwargs)
            finally:
                # Record even if the call raises.
                metric.labels(name=name).observe(time.monotonic() - start)
            return ret

        # Pick the wrapper matching the decorated function's flavor.
        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        return sync_wrapper

    # Bare decorator vs. decorator factory dispatch.
    if func:
        return measure(func)
    else:
        return measure

View File

@@ -1,388 +0,0 @@
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""Utilities for Prometheus Metrics Collection."""
import logging
from abc import ABC, abstractmethod
from typing import Counter as CollectionsCounter
from typing import Dict, List, Union
import numpy as np
from prometheus_client import Counter, Gauge, Histogram
from sglang.srt.metrics.metrics_types import Stats
class Metrics:
    """
    SGLang Metrics

    Declares every Prometheus gauge/counter/histogram used by
    PrometheusMetricsCollector. `labelnames` is applied to all metrics;
    `max_model_len` sizes the request token-count histogram buckets.
    """

    def __init__(self, labelnames: List[str], max_model_len: int):
        # Configuration Stats
        self.max_total_num_tokens = Gauge(
            name="sglang:max_total_num_tokens",
            documentation="Maximum total number of tokens",
            labelnames=labelnames,
            multiprocess_mode="min",
        )  # static across processes

        self.max_prefill_tokens = Gauge(
            name="sglang:max_prefill_tokens",
            documentation="Maximum prefill tokens",
            labelnames=labelnames,
            multiprocess_mode="min",
        )  # static across processes

        self.max_running_requests = Gauge(
            name="sglang:max_running_requests",
            documentation="Maximum running requests",
            labelnames=labelnames,
            multiprocess_mode="min",
        )  # static across processes

        self.context_len = Gauge(
            name="sglang:context_len",
            documentation="Context length",
            labelnames=labelnames,
            multiprocess_mode="min",
        )  # static across processes

        # Decode Stats
        self.num_running_sys = Gauge(
            name="sglang:num_requests_running",
            documentation="Number of requests currently running on GPU",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.num_waiting_sys = Gauge(
            name="sglang:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.gen_throughput = Gauge(
            name="sglang:gen_throughput",
            documentation="Gen token throughput (token/s)",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.token_usage = Gauge(
            name="sglang:token_usage",
            documentation="Total token usage",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )

        # System Stats
        # KV Cache Usage in %
        # self.gpu_cache_usage_sys = Gauge(
        #     "gpu_cache_usage_perc",
        #     "GPU KV-cache usage. 1 means 100 percent usage.",
        #     labelnames=labelnames,
        #     multiprocess_mode="sum")
        self.new_seq = Gauge(
            name="sglang:new_seq",
            documentation="Number of new sequences",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.new_token = Gauge(
            name="sglang:new_token",
            documentation="Number of new token",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        # Prefix caching block hit rate
        self.cached_token = Gauge(
            name="sglang:cached_token",
            documentation="Number of cached token",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.cache_hit_rate = Gauge(
            name="sglang:cache_hit_rate",
            documentation="Cache hit rate",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )
        self.queue_req = Gauge(
            name="sglang:queue_req",
            documentation="Number of queued requests",
            labelnames=labelnames,
            multiprocess_mode="sum",
        )

        # Iteration stats
        self.counter_prompt_tokens = Counter(
            name="sglang:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames,
        )
        self.counter_generation_tokens = Counter(
            name="sglang:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames,
        )
        self.histogram_time_to_first_token = Histogram(
            name="sglang:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.001,
                0.005,
                0.01,
                0.02,
                0.04,
                0.06,
                0.08,
                0.1,
                0.25,
                0.5,
                0.75,
                1.0,
                2.5,
                5.0,
                7.5,
                10.0,
                15.0,
                20.0,
                25.0,
                30.0,
            ],
        )
        self.histogram_time_per_output_token = Histogram(
            name="sglang:time_per_output_token_seconds",
            documentation="Histogram of time per output token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.005,
                0.01,
                0.015,
                0.02,
                0.025,
                0.03,
                0.04,
                0.05,
                0.075,
                0.1,
                0.15,
                0.2,
                0.3,
                0.4,
                0.5,
                0.75,
                1.0,
                2.5,
            ],
        )

        # Request Stats
        # Metadata
        # Bucket bounds scale with the model's maximum context length.
        self.num_prompt_tokens_requests = Histogram(
            name="sglang:request_prompt_tokens",
            documentation="Number of prefill tokens processed",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.num_generation_tokens_requests = Histogram(
            name="sglang:request_generation_tokens",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        # Extra "finished_reason" label beyond the shared label set.
        self.finished_reason_requests = Counter(
            name="sglang:request_success_total",
            documentation="Count of successfully processed requests.",
            labelnames=labelnames + ["finished_reason"],
        )
        self.histogram_time_e2e_requests = Histogram(
            name="sglang:e2e_request_latency_seconds",
            documentation="Histogram of End-to-end request latency in seconds",
            labelnames=labelnames,
            buckets=[
                0.3,
                0.5,
                0.8,
                1.0,
                1.5,
                2.0,
                2.5,
                5.0,
                10.0,
                15.0,
                20.0,
                30.0,
                40.0,
                50.0,
                60.0,
            ],
        )
        self.histogram_time_waiting_requests = Histogram(
            name="sglang:waiting_request_latency_seconds",
            documentation="Histogram of request waiting time in seconds",
            labelnames=labelnames,
            buckets=[
                0.3,
                0.5,
                0.8,
                1.0,
                1.5,
                2.0,
                2.5,
                5.0,
                10.0,
                15.0,
                20.0,
                30.0,
                40.0,
                50.0,
                60.0,
            ],
        )
        self.histogram_time_decode_requests = Histogram(
            name="sglang:decode_request_latency_seconds",
            documentation="Histogram of request decoding time in seconds",
            labelnames=labelnames,
            buckets=[
                0.3,
                0.5,
                0.8,
                1.0,
                1.5,
                2.0,
                2.5,
                5.0,
                10.0,
                15.0,
                20.0,
                30.0,
                40.0,
                50.0,
                60.0,
            ],
        )
class MetricsCollector(ABC):
    """
    SGLang Metrics Collector

    Abstract interface: implementations publish a Stats snapshot to a
    metrics backend via log_stats().
    """

    @abstractmethod
    def log_stats(self, stats: Stats) -> None:
        # Publish one Stats snapshot; implemented by concrete collectors.
        pass
class PrometheusMetricsCollector(MetricsCollector):
    """Prometheus-backed MetricsCollector: owns a Metrics bundle and pushes
    Stats snapshots into it under a fixed label set."""

    def __init__(self, labels: Dict[str, str], max_model_len: int) -> None:
        """Args:
        labels: static label key/value pairs applied to every sample.
        max_model_len: sizes the request token-count histogram buckets.
        """
        self.labels = labels
        self.metrics = Metrics(
            labelnames=list(labels.keys()), max_model_len=max_model_len
        )

    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        # Apply this collector's labels, then set the gauge value.
        gauge.labels(**self.labels).set(data)

    def _log_counter(self, counter, data: Union[int, float]) -> None:
        # Apply this collector's labels, then increment the counter.
        counter.labels(**self.labels).inc(data)

    def _log_counter_labels(
        self, counter, data: CollectionsCounter, label_key: str
    ) -> None:
        # Fan a Counter-of-labels out into per-label increments.
        for label, count in data.items():
            counter.labels(**{**self.labels, label_key: label}).inc(count)

    def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
        # Record every element of `data` as one histogram observation.
        for datum in data:
            histogram.labels(**self.labels).observe(datum)

    def log_stats(self, stats: Stats) -> None:
        """Push one Stats snapshot into every metric, table-driven."""
        m = self.metrics
        # (kind, metric, value) triples, in the same order as the
        # original call sequence.
        ops = [
            ("gauge", m.max_total_num_tokens, stats.max_total_num_tokens),
            ("gauge", m.max_prefill_tokens, stats.max_prefill_tokens),
            ("gauge", m.max_running_requests, stats.max_running_requests),
            ("gauge", m.context_len, stats.context_len),
            ("hist", m.num_prompt_tokens_requests, stats.num_prompt_tokens_requests),
            (
                "hist",
                m.num_generation_tokens_requests,
                stats.num_generation_tokens_requests,
            ),
            ("counter", m.counter_prompt_tokens, stats.num_prompt_tokens_iter),
            ("counter", m.counter_generation_tokens, stats.num_generation_tokens_iter),
            ("hist", m.histogram_time_to_first_token, stats.time_to_first_tokens_iter),
            (
                "hist",
                m.histogram_time_per_output_token,
                stats.time_per_output_tokens_iter,
            ),
            ("gauge", m.num_running_sys, stats.num_running_req),
            ("gauge", m.num_waiting_sys, stats.num_waiting_req),
            ("gauge", m.gen_throughput, stats.gen_throughput),
            ("gauge", m.token_usage, stats.token_usage),
            ("hist", m.histogram_time_e2e_requests, stats.time_e2e_requests),
            ("hist", m.histogram_time_waiting_requests, stats.time_waiting_requests),
            ("hist", m.histogram_time_decode_requests, stats.time_decode_requests),
            ("gauge", m.new_seq, stats.new_seq),
            ("gauge", m.new_token, stats.new_token),
            ("gauge", m.cached_token, stats.cached_token),
            ("gauge", m.cache_hit_rate, stats.cache_hit_rate),
            ("gauge", m.queue_req, stats.queue_req),
        ]
        for kind, metric, value in ops:
            if kind == "gauge":
                self._log_gauge(metric, value)
            elif kind == "counter":
                self._log_counter(metric, value)
            else:
                self._log_histogram(metric, value)
def build_1_2_5_buckets(max_value: int) -> List[int]:
    """
    Build histogram buckets of 1, 2, 5 times increasing powers of ten,
    stopping as soon as a candidate would exceed `max_value`.

    Example:
    >>> build_1_2_5_buckets(100)
    [1, 2, 5, 10, 20, 50, 100]
    """
    buckets: List[int] = []
    scale = 1
    while True:
        for mantissa in (1, 2, 5):
            candidate = mantissa * scale
            if candidate > max_value:
                return buckets
            buckets.append(candidate)
        scale *= 10

View File

@@ -1,54 +0,0 @@
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""Metrics Types"""
from dataclasses import dataclass, field
from typing import List
@dataclass
class Stats:
    """One aggregated snapshot consumed by MetricsCollector.log_stats().
    List-valued fields hold one entry per request/event since the last log."""

    # config
    max_total_num_tokens: int = 0
    max_prefill_tokens: int = 0
    max_running_requests: int = 0
    context_len: int = 0
    # request stats
    num_prompt_tokens_requests: List[int] = field(default_factory=list)
    num_generation_tokens_requests: List[int] = field(default_factory=list)
    finished_reason_requests: List[str] = field(default_factory=list)
    # decode stats
    num_running_req: int = 0
    num_waiting_req: int = 0
    gen_throughput: float = 0.0
    waiting_queue: int = 0
    time_e2e_requests: List[float] = field(default_factory=list)
    time_waiting_requests: List[float] = field(default_factory=list)
    time_decode_requests: List[float] = field(default_factory=list)
    # system stats
    token_usage: float = 0.0
    new_seq: int = 0
    new_token: int = 0
    cached_token: int = 0
    cache_hit_rate: float = 0.0
    running_req: int = 0
    queue_req: int = 0
    # Iteration stats (should have _iter suffix)
    num_prompt_tokens_iter: int = 0
    num_generation_tokens_iter: int = 0
    time_to_first_tokens_iter: List[float] = field(default_factory=list)
    time_per_output_tokens_iter: List[float] = field(default_factory=list)