70 lines
3.0 KiB
Python
70 lines
3.0 KiB
Python
from vllm.engine.metrics_types import (StatLoggerBase, Stats)
|
|
import vllm_vacc.vllm.model_executor.models.vars as global_vars
|
|
|
|
class LoggingStatLogger(StatLoggerBase):
    """LoggingStatLogger is used in LLMEngine to log to Stdout."""

    def log(self, stats: Stats) -> None:
        """Called by LLMEngine.

        Logs to Stdout every self.local_interval seconds."""
        # NOTE: docstring must be the first statement in the function body;
        # in the previous version it appeared after this import and was
        # therefore a dead string expression, not a docstring.
        # Deferred import — NOTE(review): presumably avoids a circular
        # import between this module and vllm.engine.metrics; confirm.
        from vllm.engine.metrics import (get_throughput,
                                         local_interval_elapsed, logger)

        # Save tracked stats for token counters.
        self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
        self.num_generation_tokens.append(stats.num_generation_tokens_iter)

        # Update spec decode metrics
        self.maybe_update_spec_decode_metrics(stats)

        # Log locally every local_interval seconds.
        if local_interval_elapsed(stats.now, self.last_local_log,
                                  self.local_interval):
            # Compute summary metrics for tracked stats (and log them
            # to prometheus if applicable).
            prompt_throughput = get_throughput(self.num_prompt_tokens,
                                               now=stats.now,
                                               last_log=self.last_local_log)
            generation_throughput = get_throughput(
                self.num_generation_tokens,
                now=stats.now,
                last_log=self.last_local_log)

            log_fn = logger.info
            if not any((prompt_throughput, generation_throughput,
                        self.last_prompt_throughput,
                        self.last_generation_throughput)):
                # Avoid log noise on an idle production system
                log_fn = logger.debug

            # Lazy %-style args keep formatting off the hot path when the
            # log level filters the record out.
            log_fn(
                "Avg prompt throughput: %.1f tokens/s, "
                "Avg generation throughput: %.1f tokens/s, "
                "Running: %d reqs, Swapped: %d reqs, "
                "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
                "CPU KV cache usage: %.1f%%., "
                "Do sequences length: %s",
                prompt_throughput,
                generation_throughput,
                stats.num_running_sys,
                stats.num_swapped_sys,
                stats.num_waiting_sys,
                stats.gpu_cache_usage_sys * 100,
                stats.cpu_cache_usage_sys * 100,
                str(global_vars.DO_SEQ_LENS)
            )
            # Negative hit rates signal "prefix caching disabled"; only log
            # when at least one cache reports a real (>= 0) rate.
            if (stats.cpu_prefix_cache_hit_rate >= 0
                    or stats.gpu_prefix_cache_hit_rate >= 0):
                log_fn(
                    "Prefix cache hit rate: GPU: %.2f%%, CPU: %.2f%%",
                    stats.gpu_prefix_cache_hit_rate * 100,
                    stats.cpu_prefix_cache_hit_rate * 100,
                )
            if self.spec_decode_metrics is not None:
                logger.debug(
                    self._format_spec_decode_metrics_str(
                        self.spec_decode_metrics))

            # Reset interval-tracked state for the next logging window.
            self._reset(stats, prompt_throughput, generation_throughput)
|
|
|
|
|