init
This commit is contained in:
69
vllm_vacc/vllm/engine/metrics.py
Normal file
69
vllm_vacc/vllm/engine/metrics.py
Normal file
@@ -0,0 +1,69 @@
|
||||
from vllm.engine.metrics_types import (StatLoggerBase, Stats)
|
||||
import vllm_vacc.vllm.model_executor.models.vars as global_vars
|
||||
|
||||
class LoggingStatLogger(StatLoggerBase):
    """LoggingStatLogger is used in LLMEngine to log to Stdout.

    In addition to the throughput, queue and KV-cache figures, each summary
    line also reports the current value of ``global_vars.DO_SEQ_LENS``.
    """

    def log(self, stats: Stats) -> None:
        """Record this iteration's token counts and periodically log a summary.

        Called by LLMEngine. Logs to Stdout every ``self.local_interval``
        seconds.

        Args:
            stats: Engine statistics for the current iteration; supplies the
                per-iteration token counts, the timestamp (``stats.now``),
                queue sizes and cache usage / hit-rate figures that are
                logged.
        """
        # Imported at call time rather than at module level.
        # NOTE(review): presumably to avoid a circular import with
        # vllm.engine.metrics -- confirm before hoisting to the top of file.
        from vllm.engine.metrics import (get_throughput,
                                         local_interval_elapsed, logger)

        # Save tracked stats for token counters.
        self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
        self.num_generation_tokens.append(stats.num_generation_tokens_iter)

        # Update spec decode metrics
        self.maybe_update_spec_decode_metrics(stats)

        # Log locally every local_interval seconds; nothing else to do until
        # the interval has elapsed.
        if not local_interval_elapsed(stats.now, self.last_local_log,
                                      self.local_interval):
            return

        # Compute summary metrics for tracked stats (and log them
        # to Prometheus if applicable).
        prompt_throughput = get_throughput(self.num_prompt_tokens,
                                           now=stats.now,
                                           last_log=self.last_local_log)
        generation_throughput = get_throughput(
            self.num_generation_tokens,
            now=stats.now,
            last_log=self.last_local_log)

        log_fn = logger.info
        if not any((prompt_throughput, generation_throughput,
                    self.last_prompt_throughput,
                    self.last_generation_throughput)):
            # Avoid log noise on an idle production system: when both the
            # current and the previous throughput figures are zero, demote
            # the summary to debug level.
            log_fn = logger.debug

        log_fn(
            "Avg prompt throughput: %.1f tokens/s, "
            "Avg generation throughput: %.1f tokens/s, "
            "Running: %d reqs, Swapped: %d reqs, "
            "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
            "CPU KV cache usage: %.1f%%., "
            "Do sequences length: %s",
            prompt_throughput,
            generation_throughput,
            stats.num_running_sys,
            stats.num_swapped_sys,
            stats.num_waiting_sys,
            stats.gpu_cache_usage_sys * 100,
            stats.cpu_cache_usage_sys * 100,
            str(global_vars.DO_SEQ_LENS),
        )

        # Only report prefix-cache hit rates when at least one of them is
        # non-negative.
        if (stats.cpu_prefix_cache_hit_rate >= 0
                or stats.gpu_prefix_cache_hit_rate >= 0):
            log_fn(
                "Prefix cache hit rate: GPU: %.2f%%, CPU: %.2f%%",
                stats.gpu_prefix_cache_hit_rate * 100,
                stats.cpu_prefix_cache_hit_rate * 100,
            )

        # Spec-decode metrics are always emitted at debug level, independent
        # of the level chosen for the summary line above.
        if self.spec_decode_metrics is not None:
            logger.debug(
                self._format_spec_decode_metrics_str(
                    self.spec_decode_metrics))

        # Start a new logging window and remember the throughputs just
        # reported.
        self._reset(stats, prompt_throughput, generation_throughput)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user