Simplify prometheus metrics (#1981)
Co-authored-by: Mohit Reddy <mohitreddy1996@users.noreply.github.com>
This commit is contained in:
211
python/sglang/srt/metrics/collector.py
Normal file
211
python/sglang/srt/metrics/collector.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""
|
||||
Copyright 2023-2024 SGLang Team
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
"""Utilities for Prometheus Metrics Collection."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Union
|
||||
|
||||
|
||||
@dataclass
class SchedulerStats:
    """One snapshot of scheduler-side metrics, consumed by SchedulerMetricsCollector."""

    # Requests currently being processed (running batch).
    num_running_reqs: int = 0
    # KV-cache tokens currently occupied.
    num_used_tokens: int = 0
    # Fraction of the token pool in use — presumably in [0, 1]; TODO confirm against producer.
    token_usage: float = 0.0
    # Generation throughput in tokens per second.
    gen_throughput: float = 0.0
    # Requests still sitting in the waiting queue.
    num_queue_reqs: int = 0
    # Prefix-cache hit rate — presumably in [0, 1]; TODO confirm against producer.
    cache_hit_rate: float = 0.0
|
||||
|
||||
|
||||
class SchedulerMetricsCollector:
    """Publishes `SchedulerStats` fields as Prometheus gauges.

    One gauge is created per stats field; all gauges share the label set
    passed at construction time.
    """

    def __init__(self, labels: Dict[str, str]) -> None:
        # prometheus_client must be imported only after the env variable
        # `PROMETHEUS_MULTIPROC_DIR` has been set, hence the local import.
        from prometheus_client import Gauge

        self.labels = labels

        def new_gauge(name: str, documentation: str, mode: str):
            # `mode` selects how values from multiple worker processes are
            # merged ("sum" for additive quantities, "mostrecent" for ratios).
            return Gauge(
                name=name,
                documentation=documentation,
                labelnames=labels.keys(),
                multiprocess_mode=mode,
            )

        self.num_running_reqs = new_gauge(
            "sglang:num_running_reqs", "The number of running requests", "sum"
        )
        self.num_used_tokens = new_gauge(
            "sglang:num_used_tokens", "The number of used tokens", "sum"
        )
        self.token_usage = new_gauge(
            "sglang:token_usage", "The token usage", "mostrecent"
        )
        self.gen_throughput = new_gauge(
            "sglang:gen_throughput", "The generate throughput (token/s)", "sum"
        )
        self.num_queue_reqs = new_gauge(
            "sglang:num_queue_reqs",
            "The number of requests in the waiting queue",
            "sum",
        )
        self.cache_hit_rate = new_gauge(
            "sglang:cache_hit_rate", "The cache hit rate", "mostrecent"
        )

    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        # Apply this collector's label values, then set the gauge.
        gauge.labels(**self.labels).set(data)

    def log_stats(self, stats: SchedulerStats) -> None:
        """Push one `SchedulerStats` snapshot into the corresponding gauges."""
        for gauge, value in (
            (self.num_running_reqs, stats.num_running_reqs),
            (self.num_used_tokens, stats.num_used_tokens),
            (self.token_usage, stats.token_usage),
            (self.gen_throughput, stats.gen_throughput),
            (self.num_queue_reqs, stats.num_queue_reqs),
            (self.cache_hit_rate, stats.cache_hit_rate),
        ):
            self._log_gauge(gauge, value)
|
||||
|
||||
|
||||
class TokenizerMetricsCollector:
    """Publishes tokenizer-side token counters and latency histograms to Prometheus."""

    def __init__(self, labels: Dict[str, str]) -> None:
        # prometheus_client must be imported only after the env variable
        # `PROMETHEUS_MULTIPROC_DIR` has been set, hence the local import.
        from prometheus_client import Counter, Histogram

        self.labels = labels
        label_keys = labels.keys()

        self.prompt_tokens_total = Counter(
            name="sglang:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=label_keys,
        )

        self.generation_tokens_total = Counter(
            name="sglang:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=label_keys,
        )

        # Fine-grained low end (sub-100ms) to resolve fast first tokens,
        # coarser tail out to 30s.
        ttft_buckets = [
            0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
            0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0, 25.0, 30.0,
        ]
        self.histogram_time_to_first_token = Histogram(
            name="sglang:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=label_keys,
            buckets=ttft_buckets,
        )

        # Per-token decode latency; resolution centered on the 5ms-500ms range.
        tpot_buckets = [
            0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075,
            0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5,
        ]
        self.histogram_time_per_output_token = Histogram(
            name="sglang:time_per_output_token_seconds",
            documentation="Histogram of time per output token in seconds.",
            labelnames=label_keys,
            buckets=tpot_buckets,
        )

        # Whole-request latency, sub-second up to one minute.
        e2e_buckets = [
            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
            20.0, 30.0, 40.0, 50.0, 60.0,
        ]
        self.histogram_e2e_request_latency = Histogram(
            name="sglang:e2e_request_latency_seconds",
            documentation="Histogram of End-to-end request latency in seconds",
            labelnames=label_keys,
            buckets=e2e_buckets,
        )

    def _log_histogram(self, histogram, data: Union[int, float]) -> None:
        # Apply this collector's label values, then record one observation.
        histogram.labels(**self.labels).observe(data)

    def _log_counter(self, counter, data: Union[int, float]) -> None:
        # Apply this collector's label values, then increment the counter.
        counter.labels(**self.labels).inc(data)

    def inc_prompt_tokens(self, value: int):
        """Add `value` to the prefill-token counter."""
        self._log_counter(self.prompt_tokens_total, value)

    def inc_generation_tokens(self, value: int):
        """Add `value` to the generation-token counter."""
        self._log_counter(self.generation_tokens_total, value)

    def observe_time_to_first_token(self, value: Union[float, int]):
        """Record one time-to-first-token observation (seconds)."""
        self._log_histogram(self.histogram_time_to_first_token, value)

    def observe_time_per_output_token(self, value: Union[float, int]):
        """Record one per-output-token latency observation (seconds)."""
        self._log_histogram(self.histogram_time_per_output_token, value)

    def observe_e2e_request_latency(self, value: Union[float, int]):
        """Record one end-to-end request latency observation (seconds)."""
        self._log_histogram(self.histogram_e2e_request_latency, value)
|
||||
108
python/sglang/srt/metrics/func_timer.py
Normal file
108
python/sglang/srt/metrics/func_timer.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""
|
||||
Copyright 2023-2024 SGLang Team
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
"""
|
||||
Records the latency of some functions
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from functools import wraps
|
||||
from typing import Any, Callable, List, Optional
|
||||
|
||||
# Global flag checked by the wrappers produced by `time_func_latency`;
# flipped on by `enable_func_timer()`.
enable_metrics = False


def enable_func_timer():
    """Enable function-latency metrics and create the shared histogram.

    Must be called before any timed function runs if latency recording is
    desired; until then `enable_metrics` is False and `time_func_latency`
    wrappers are pass-throughs.
    """
    # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
    from prometheus_client import Histogram

    global enable_metrics, FUNC_LATENCY
    enable_metrics = True

    FUNC_LATENCY = Histogram(
        "sglang:func_latency_seconds",
        "Function latency in seconds",
        # captures latency in range [50ms - ~50s]
        buckets=exponential_buckets(start=0.05, width=1.5, length=18),
        labelnames=["name"],
    )


# Shared Histogram instance; stays None until `enable_func_timer` is called.
FUNC_LATENCY = None
|
||||
|
||||
|
||||
def exponential_buckets(start: float, width: float, length: int) -> List[float]:
    """Return `length` histogram bucket bounds growing geometrically from
    `start` by a factor of `width` per step."""
    return [start * (width**i) for i in range(length)]
|
||||
|
||||
|
||||
def time_func_latency(
    func: Callable = None, name: Optional[str] = None
) -> Callable[..., Any]:
    """
    A decorator to observe the latency of a function's execution. Supports both sync and async functions.

    Usable both bare (`@time_func_latency`) and with arguments
    (`@time_func_latency(name="...")`); `name` defaults to the wrapped
    function's __name__ and becomes the "name" label on FUNC_LATENCY.

    NOTE: We use our own implementation of a timer decorator since prometheus_client does not support async
    context manager yet.

    Overhead: The overhead introduced here in case of an async function could likely be because of `await` introduced
    which will return in another coroutine object creation and under heavy load could see longer wall time
    (scheduling delays due to introduction of another awaitable).
    """

    def measure(func: Callable[..., Any]) -> Callable[..., Any]:
        nonlocal name

        # Resolve the metric label once, at decoration time.
        name = name or func.__name__

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            # Fast path: metrics disabled -> no timing overhead.
            if not enable_metrics:
                return await func(*args, **kwargs)

            metric = FUNC_LATENCY
            start = time.monotonic()
            ret = func(*args, **kwargs)
            if isinstance(ret, asyncio.Future) or asyncio.iscoroutine(ret):
                try:
                    ret = await ret
                finally:
                    # Observe even if awaiting raises.
                    metric.labels(name=name).observe(time.monotonic() - start)
            # NOTE(review): if the call returns a plain (non-awaitable) value,
            # no latency is observed on this path — confirm this is intended.
            return ret

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            # Fast path: metrics disabled -> no timing overhead.
            if not enable_metrics:
                return func(*args, **kwargs)

            metric = FUNC_LATENCY
            start = time.monotonic()
            try:
                ret = func(*args, **kwargs)
            finally:
                # Observe even if the call raises.
                metric.labels(name=name).observe(time.monotonic() - start)
            return ret

        # Pick the wrapper matching the wrapped function's kind.
        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        return sync_wrapper

    # Bare-decorator form vs. parameterized form.
    if func:
        return measure(func)
    else:
        return measure
|
||||
@@ -1,388 +0,0 @@
|
||||
"""
|
||||
Copyright 2023-2024 SGLang Team
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
"""Utilities for Prometheus Metrics Collection."""
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Counter as CollectionsCounter
|
||||
from typing import Dict, List, Union
|
||||
|
||||
import numpy as np
|
||||
from prometheus_client import Counter, Gauge, Histogram
|
||||
|
||||
from sglang.srt.metrics.metrics_types import Stats
|
||||
|
||||
|
||||
class Metrics:
|
||||
"""
|
||||
SGLang Metrics
|
||||
"""
|
||||
|
||||
def __init__(self, labelnames: List[str], max_model_len):
|
||||
|
||||
# Configuration Stats
|
||||
self.max_total_num_tokens = Gauge(
|
||||
name="sglang:max_total_num_tokens",
|
||||
documentation="Maximum total number of tokens",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="min",
|
||||
) # static across processes
|
||||
|
||||
self.max_prefill_tokens = Gauge(
|
||||
name="sglang:max_prefill_tokens",
|
||||
documentation="Maximum prefill tokens",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="min",
|
||||
) # static across processes
|
||||
|
||||
self.max_running_requests = Gauge(
|
||||
name="sglang:max_running_requests",
|
||||
documentation="Maximum running requests",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="min",
|
||||
) # static across processes
|
||||
|
||||
self.context_len = Gauge(
|
||||
name="sglang:context_len",
|
||||
documentation="Context length",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="min",
|
||||
) # static across processes
|
||||
# Decode Stats
|
||||
self.num_running_sys = Gauge(
|
||||
name="sglang:num_requests_running",
|
||||
documentation="Number of requests currently running on GPU",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="sum",
|
||||
)
|
||||
self.num_waiting_sys = Gauge(
|
||||
name="sglang:num_requests_waiting",
|
||||
documentation="Number of requests waiting to be processed.",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="sum",
|
||||
)
|
||||
self.gen_throughput = Gauge(
|
||||
name="sglang:gen_throughput",
|
||||
documentation="Gen token throughput (token/s)",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="sum",
|
||||
)
|
||||
self.token_usage = Gauge(
|
||||
name="sglang:token_usage",
|
||||
documentation="Total token usage",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="sum",
|
||||
)
|
||||
# System Stats
|
||||
# KV Cache Usage in %
|
||||
# self.gpu_cache_usage_sys = Gauge(
|
||||
# "gpu_cache_usage_perc",
|
||||
# "GPU KV-cache usage. 1 means 100 percent usage.",
|
||||
# labelnames=labelnames,
|
||||
# multiprocess_mode="sum")
|
||||
|
||||
self.new_seq = Gauge(
|
||||
name="sglang:new_seq",
|
||||
documentation="Number of new sequences",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="sum",
|
||||
)
|
||||
self.new_token = Gauge(
|
||||
name="sglang:new_token",
|
||||
documentation="Number of new token",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="sum",
|
||||
)
|
||||
# Prefix caching block hit rate
|
||||
self.cached_token = Gauge(
|
||||
name="sglang:cached_token",
|
||||
documentation="Number of cached token",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="sum",
|
||||
)
|
||||
self.cache_hit_rate = Gauge(
|
||||
name="sglang:cache_hit_rate",
|
||||
documentation="Cache hit rate",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="sum",
|
||||
)
|
||||
self.queue_req = Gauge(
|
||||
name="sglang:queue_req",
|
||||
documentation="Number of queued requests",
|
||||
labelnames=labelnames,
|
||||
multiprocess_mode="sum",
|
||||
)
|
||||
|
||||
# Iteration stats
|
||||
self.counter_prompt_tokens = Counter(
|
||||
name="sglang:prompt_tokens_total",
|
||||
documentation="Number of prefill tokens processed.",
|
||||
labelnames=labelnames,
|
||||
)
|
||||
self.counter_generation_tokens = Counter(
|
||||
name="sglang:generation_tokens_total",
|
||||
documentation="Number of generation tokens processed.",
|
||||
labelnames=labelnames,
|
||||
)
|
||||
self.histogram_time_to_first_token = Histogram(
|
||||
name="sglang:time_to_first_token_seconds",
|
||||
documentation="Histogram of time to first token in seconds.",
|
||||
labelnames=labelnames,
|
||||
buckets=[
|
||||
0.001,
|
||||
0.005,
|
||||
0.01,
|
||||
0.02,
|
||||
0.04,
|
||||
0.06,
|
||||
0.08,
|
||||
0.1,
|
||||
0.25,
|
||||
0.5,
|
||||
0.75,
|
||||
1.0,
|
||||
2.5,
|
||||
5.0,
|
||||
7.5,
|
||||
10.0,
|
||||
15.0,
|
||||
20.0,
|
||||
25.0,
|
||||
30.0,
|
||||
],
|
||||
)
|
||||
self.histogram_time_per_output_token = Histogram(
|
||||
name="sglang:time_per_output_token_seconds",
|
||||
documentation="Histogram of time per output token in seconds.",
|
||||
labelnames=labelnames,
|
||||
buckets=[
|
||||
0.005,
|
||||
0.01,
|
||||
0.015,
|
||||
0.02,
|
||||
0.025,
|
||||
0.03,
|
||||
0.04,
|
||||
0.05,
|
||||
0.075,
|
||||
0.1,
|
||||
0.15,
|
||||
0.2,
|
||||
0.3,
|
||||
0.4,
|
||||
0.5,
|
||||
0.75,
|
||||
1.0,
|
||||
2.5,
|
||||
],
|
||||
)
|
||||
|
||||
# Request Stats
|
||||
# Metadata
|
||||
self.num_prompt_tokens_requests = Histogram(
|
||||
name="sglang:request_prompt_tokens",
|
||||
documentation="Number of prefill tokens processed",
|
||||
labelnames=labelnames,
|
||||
buckets=build_1_2_5_buckets(max_model_len),
|
||||
)
|
||||
self.num_generation_tokens_requests = Histogram(
|
||||
name="sglang:request_generation_tokens",
|
||||
documentation="Number of generation tokens processed.",
|
||||
labelnames=labelnames,
|
||||
buckets=build_1_2_5_buckets(max_model_len),
|
||||
)
|
||||
self.finished_reason_requests = Counter(
|
||||
name="sglang:request_success_total",
|
||||
documentation="Count of successfully processed requests.",
|
||||
labelnames=labelnames + ["finished_reason"],
|
||||
)
|
||||
self.histogram_time_e2e_requests = Histogram(
|
||||
name="sglang:e2e_request_latency_seconds",
|
||||
documentation="Histogram of End-to-end request latency in seconds",
|
||||
labelnames=labelnames,
|
||||
buckets=[
|
||||
0.3,
|
||||
0.5,
|
||||
0.8,
|
||||
1.0,
|
||||
1.5,
|
||||
2.0,
|
||||
2.5,
|
||||
5.0,
|
||||
10.0,
|
||||
15.0,
|
||||
20.0,
|
||||
30.0,
|
||||
40.0,
|
||||
50.0,
|
||||
60.0,
|
||||
],
|
||||
)
|
||||
self.histogram_time_waiting_requests = Histogram(
|
||||
name="sglang:waiting_request_latency_seconds",
|
||||
documentation="Histogram of request waiting time in seconds",
|
||||
labelnames=labelnames,
|
||||
buckets=[
|
||||
0.3,
|
||||
0.5,
|
||||
0.8,
|
||||
1.0,
|
||||
1.5,
|
||||
2.0,
|
||||
2.5,
|
||||
5.0,
|
||||
10.0,
|
||||
15.0,
|
||||
20.0,
|
||||
30.0,
|
||||
40.0,
|
||||
50.0,
|
||||
60.0,
|
||||
],
|
||||
)
|
||||
self.histogram_time_decode_requests = Histogram(
|
||||
name="sglang:decode_request_latency_seconds",
|
||||
documentation="Histogram of request decoding time in seconds",
|
||||
labelnames=labelnames,
|
||||
buckets=[
|
||||
0.3,
|
||||
0.5,
|
||||
0.8,
|
||||
1.0,
|
||||
1.5,
|
||||
2.0,
|
||||
2.5,
|
||||
5.0,
|
||||
10.0,
|
||||
15.0,
|
||||
20.0,
|
||||
30.0,
|
||||
40.0,
|
||||
50.0,
|
||||
60.0,
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
class MetricsCollector(ABC):
|
||||
"""
|
||||
SGLang Metrics Collector
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def log_stats(self, stats: Stats) -> None:
|
||||
pass
|
||||
|
||||
|
||||
class PrometheusMetricsCollector(MetricsCollector):
|
||||
"""
|
||||
SGLang Metrics Collector
|
||||
"""
|
||||
|
||||
def __init__(self, labels: Dict[str, str], max_model_len: int) -> None:
|
||||
self.labels = labels
|
||||
self.metrics = Metrics(
|
||||
labelnames=list(labels.keys()), max_model_len=max_model_len
|
||||
)
|
||||
|
||||
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
|
||||
# Convenience function for logging to gauge.
|
||||
gauge.labels(**self.labels).set(data)
|
||||
|
||||
def _log_counter(self, counter, data: Union[int, float]) -> None:
|
||||
# Convenience function for logging to counter.
|
||||
counter.labels(**self.labels).inc(data)
|
||||
|
||||
def _log_counter_labels(
|
||||
self, counter, data: CollectionsCounter, label_key: str
|
||||
) -> None:
|
||||
# Convenience function for collection counter of labels.
|
||||
for label, count in data.items():
|
||||
counter.labels(**{**self.labels, label_key: label}).inc(count)
|
||||
|
||||
def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
|
||||
# Convenience function for logging list to histogram.
|
||||
for datum in data:
|
||||
histogram.labels(**self.labels).observe(datum)
|
||||
|
||||
def log_stats(self, stats: Stats) -> None:
|
||||
self._log_gauge(self.metrics.max_total_num_tokens, stats.max_total_num_tokens)
|
||||
self._log_gauge(self.metrics.max_prefill_tokens, stats.max_prefill_tokens)
|
||||
self._log_gauge(self.metrics.max_running_requests, stats.max_running_requests)
|
||||
self._log_gauge(self.metrics.context_len, stats.context_len)
|
||||
self._log_histogram(
|
||||
self.metrics.num_prompt_tokens_requests, stats.num_prompt_tokens_requests
|
||||
)
|
||||
self._log_histogram(
|
||||
self.metrics.num_generation_tokens_requests,
|
||||
stats.num_generation_tokens_requests,
|
||||
)
|
||||
|
||||
self._log_counter(
|
||||
self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
|
||||
)
|
||||
self._log_counter(
|
||||
self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter
|
||||
)
|
||||
self._log_histogram(
|
||||
self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter
|
||||
)
|
||||
self._log_histogram(
|
||||
self.metrics.histogram_time_per_output_token,
|
||||
stats.time_per_output_tokens_iter,
|
||||
)
|
||||
|
||||
# self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys)
|
||||
self._log_gauge(self.metrics.num_running_sys, stats.num_running_req)
|
||||
self._log_gauge(self.metrics.num_waiting_sys, stats.num_waiting_req)
|
||||
self._log_gauge(self.metrics.gen_throughput, stats.gen_throughput)
|
||||
self._log_gauge(self.metrics.token_usage, stats.token_usage)
|
||||
self._log_histogram(
|
||||
self.metrics.histogram_time_e2e_requests, stats.time_e2e_requests
|
||||
)
|
||||
self._log_histogram(
|
||||
self.metrics.histogram_time_waiting_requests, stats.time_waiting_requests
|
||||
)
|
||||
self._log_histogram(
|
||||
self.metrics.histogram_time_decode_requests, stats.time_decode_requests
|
||||
)
|
||||
self._log_gauge(self.metrics.new_seq, stats.new_seq)
|
||||
self._log_gauge(self.metrics.new_token, stats.new_token)
|
||||
self._log_gauge(self.metrics.cached_token, stats.cached_token)
|
||||
self._log_gauge(self.metrics.cache_hit_rate, stats.cache_hit_rate)
|
||||
self._log_gauge(self.metrics.queue_req, stats.queue_req)
|
||||
|
||||
|
||||
def build_1_2_5_buckets(max_value: int) -> List[int]:
|
||||
"""
|
||||
Builds a list of buckets with increasing powers of 10 multiplied by
|
||||
mantissa values (1, 2, 5) until the value exceeds the specified maximum.
|
||||
|
||||
Example:
|
||||
>>> build_1_2_5_buckets(100)
|
||||
[1, 2, 5, 10, 20, 50, 100]
|
||||
"""
|
||||
mantissa_lst = [1, 2, 5]
|
||||
exponent = 0
|
||||
buckets: List[int] = []
|
||||
while True:
|
||||
for m in mantissa_lst:
|
||||
value = m * 10**exponent
|
||||
if value <= max_value:
|
||||
buckets.append(value)
|
||||
else:
|
||||
return buckets
|
||||
exponent += 1
|
||||
@@ -1,54 +0,0 @@
|
||||
"""
|
||||
Copyright 2023-2024 SGLang Team
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
"""Metrics Types"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
|
||||
class Stats:
|
||||
# config
|
||||
max_total_num_tokens: int = 0
|
||||
max_prefill_tokens: int = 0
|
||||
max_running_requests: int = 0
|
||||
context_len: int = 0
|
||||
# request stats
|
||||
num_prompt_tokens_requests: List[int] = field(default_factory=list)
|
||||
num_generation_tokens_requests: List[int] = field(default_factory=list)
|
||||
finished_reason_requests: List[str] = field(default_factory=list)
|
||||
# decode stats
|
||||
num_running_req: int = 0
|
||||
num_waiting_req: int = 0
|
||||
gen_throughput: float = 0.0
|
||||
waiting_queue: int = 0
|
||||
time_e2e_requests: List[float] = field(default_factory=list)
|
||||
time_waiting_requests: List[float] = field(default_factory=list)
|
||||
time_decode_requests: List[float] = field(default_factory=list)
|
||||
# system stats
|
||||
token_usage: float = 0.0
|
||||
new_seq: int = 0
|
||||
new_token: int = 0
|
||||
cached_token: int = 0
|
||||
cache_hit_rate: float = 0.0
|
||||
running_req: int = 0
|
||||
queue_req: int = 0
|
||||
|
||||
# Iteration stats (should have _iter suffix)
|
||||
num_prompt_tokens_iter: int = 0
|
||||
num_generation_tokens_iter: int = 0
|
||||
time_to_first_tokens_iter: List[float] = field(default_factory=list)
|
||||
time_per_output_tokens_iter: List[float] = field(default_factory=list)
|
||||
Reference in New Issue
Block a user