import time

from vllm.v1.metrics.loggers import StatLoggerBase
from vllm.v1.metrics.loggers import logger


class LoggingStatLogger(StatLoggerBase):
    """Stat logger that writes one summary line per ``log()`` call.

    The summary line additionally reports the running sequence lengths
    taken from the most recent scheduler stats.
    """

    def log(self):
        """Log throughput, scheduler and cache statistics for this engine.

        Throughputs are computed against the current monotonic clock, the
        accumulators are reset, and a single formatted line is emitted.
        The same log function is then handed to the speculative-decoding
        and KV-transfer loggers so they emit at the same level.
        """
        timestamp = time.monotonic()
        prompt_tps = self._get_throughput(self.num_prompt_tokens, timestamp)
        generation_tps = self._get_throughput(self.num_generation_tokens,
                                              timestamp)

        self._reset(timestamp)

        stats = self.last_scheduler_stats

        # When both the current and the previously recorded throughputs are
        # zero, demote the message to debug level so an idle system does not
        # flood the log.
        throughputs = (
            prompt_tps,
            generation_tps,
            self.last_prompt_throughput,
            self.last_generation_throughput,
        )
        is_idle = not any(throughputs)
        emit = logger.debug if is_idle else logger.info

        # Remember this interval's values for the next idle check.
        self.last_generation_throughput = generation_tps
        self.last_prompt_throughput = prompt_tps

        # Adjacent literals are concatenated into one %-style template; the
        # values are passed separately so the logger formats them itself.
        emit(
            "Engine %03d: "
            "Avg prompt throughput: %.1f tokens/s, "
            "Avg generation throughput: %.1f tokens/s, "
            "Running: %d reqs, Waiting: %d reqs, "
            "GPU KV cache usage: %.1f%%, "
            "Prefix cache hit rate: %.1f%%, "
            "running seqlens: %s ",
            self.engine_index,
            prompt_tps,
            generation_tps,
            stats.num_running_reqs,
            stats.num_waiting_reqs,
            stats.kv_cache_usage * 100,
            self.prefix_caching_metrics.hit_rate * 100,
            str(stats.running_seqlens),
        )

        # Sub-loggers reuse the chosen level.
        self.spec_decoding_logging.log(log_fn=emit)
        self.kv_transfer_logging.log(log_fn=emit)