49 lines
1.7 KiB
Python
49 lines
1.7 KiB
Python
|
|
import time
|
|
|
|
from vllm.v1.metrics.loggers import StatLoggerBase
|
|
from vllm.v1.metrics.loggers import logger
|
|
|
|
class LoggingStatLogger(StatLoggerBase):

    def log(self):
        """Emit one periodic stats line for this engine.

        Computes the average prompt/generation throughput for the
        interval since the last reset, then logs throughput, running /
        waiting request counts, GPU KV-cache usage and the prefix-cache
        hit rate.  When both this interval and the previous one saw zero
        throughput, the line is demoted to DEBUG level so an idle
        production system does not fill the log.
        """
        timestamp = time.monotonic()
        prompt_tps = self._get_throughput(self.num_prompt_tokens, timestamp)
        generation_tps = self._get_throughput(self.num_generation_tokens,
                                              timestamp)

        self._reset(timestamp)

        stats = self.last_scheduler_stats

        # Demote to DEBUG only when this interval *and* the previous one
        # were both idle — avoids log noise on an idle production system.
        idle = not (prompt_tps or generation_tps
                    or self.last_prompt_throughput
                    or self.last_generation_throughput)
        log_fn = logger.debug if idle else logger.info

        # Roll the current interval's throughput into "last" for the
        # next idle check.
        self.last_prompt_throughput = prompt_tps
        self.last_generation_throughput = generation_tps

        # Format and print output.
        log_fn(
            "Engine %03d: "
            "Avg prompt throughput: %.1f tokens/s, "
            "Avg generation throughput: %.1f tokens/s, "
            "Running: %d reqs, Waiting: %d reqs, "
            "GPU KV cache usage: %.1f%%, "
            "Prefix cache hit rate: %.1f%%, "
            "running seqlens: %s ",
            self.engine_index,
            prompt_tps,
            generation_tps,
            stats.num_running_reqs,
            stats.num_waiting_reqs,
            stats.kv_cache_usage * 100,
            self.prefix_caching_metrics.hit_rate * 100,
            str(stats.running_seqlens),
        )
        self.spec_decoding_logging.log(log_fn=log_fn)
        self.kv_transfer_logging.log(log_fn=log_fn)
|
|
|