from vllm.engine.metrics_types import (StatLoggerBase, Stats)

import vllm_vacc.vllm.model_executor.models.vars as global_vars


class LoggingStatLogger(StatLoggerBase):
    """LoggingStatLogger is used in LLMEngine to log to Stdout."""

    def log(self, stats: Stats) -> None:
        """Called by LLMEngine. Logs to Stdout every self.local_interval seconds."""
        # NOTE: docstring moved above this import — it previously followed the
        # import statement, so it was a dead string expression and never set
        # log.__doc__.
        # Deferred import: presumably avoids a circular import with
        # vllm.engine.metrics — TODO confirm against the upstream layout.
        from vllm.engine.metrics import (local_interval_elapsed, get_throughput,
                                         logger)

        # Save tracked stats for token counters.
        self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
        self.num_generation_tokens.append(stats.num_generation_tokens_iter)

        # Update spec decode metrics
        self.maybe_update_spec_decode_metrics(stats)

        # Log locally every local_interval seconds.
        if local_interval_elapsed(stats.now, self.last_local_log,
                                  self.local_interval):
            # Compute summary metrics for tracked stats (and log them
            # to prometheus if applicable).
            prompt_throughput = get_throughput(self.num_prompt_tokens,
                                               now=stats.now,
                                               last_log=self.last_local_log)
            generation_throughput = get_throughput(
                self.num_generation_tokens,
                now=stats.now,
                last_log=self.last_local_log)

            log_fn = logger.info
            if not any((prompt_throughput, generation_throughput,
                        self.last_prompt_throughput,
                        self.last_generation_throughput)):
                # Avoid log noise on an idle production system
                log_fn = logger.debug

            log_fn(
                "Avg prompt throughput: %.1f tokens/s, "
                "Avg generation throughput: %.1f tokens/s, "
                "Running: %d reqs, Swapped: %d reqs, "
                "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
                "CPU KV cache usage: %.1f%%., "
                "Do sequences length: %s",
                prompt_throughput,
                generation_throughput,
                stats.num_running_sys,
                stats.num_swapped_sys,
                stats.num_waiting_sys,
                stats.gpu_cache_usage_sys * 100,
                stats.cpu_cache_usage_sys * 100,
                str(global_vars.DO_SEQ_LENS),
            )
            if (stats.cpu_prefix_cache_hit_rate >= 0
                    or stats.gpu_prefix_cache_hit_rate >= 0):
                log_fn(
                    "Prefix cache hit rate: GPU: %.2f%%, CPU: %.2f%%",
                    stats.gpu_prefix_cache_hit_rate * 100,
                    stats.cpu_prefix_cache_hit_rate * 100,
                )

            if self.spec_decode_metrics is not None:
                logger.debug(
                    self._format_spec_decode_metrics_str(
                        self.spec_decode_metrics))

            # Reset tracked stats (and remember last throughputs so the
            # idle-system debug downgrade above works on the next interval).
            self._reset(stats, prompt_throughput, generation_throughput)