diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py index 60e0758ea..bd3204079 100644 --- a/python/sglang/srt/metrics/collector.py +++ b/python/sglang/srt/metrics/collector.py @@ -118,6 +118,7 @@ class SchedulerStats: num_running_reqs: int = 0 num_used_tokens: int = 0 token_usage: float = 0.0 + pending_prealloc_token_usage: float = 0.0 swa_token_usage: float = 0.0 gen_throughput: float = 0.0 num_queue_reqs: int = 0 @@ -177,6 +178,12 @@ class SchedulerMetricsCollector: labelnames=labels.keys(), multiprocess_mode="mostrecent", ) + self.pending_prealloc_token_usage = Gauge( + name="sglang:pending_prealloc_token_usage", + documentation="The token usage for pending preallocated tokens (not preallocated yet).", + labelnames=labels.keys(), + multiprocess_mode="mostrecent", + ) self.swa_token_usage = Gauge( name="sglang:swa_token_usage", documentation="The token usage for SWA layers.", @@ -516,6 +523,9 @@ class SchedulerMetricsCollector: self._log_gauge(self.num_running_reqs, stats.num_running_reqs) self._log_gauge(self.num_used_tokens, stats.num_used_tokens) self._log_gauge(self.token_usage, stats.token_usage) + self._log_gauge( + self.pending_prealloc_token_usage, stats.pending_prealloc_token_usage + ) self._log_gauge(self.swa_token_usage, stats.swa_token_usage) self._log_gauge(self.gen_throughput, stats.gen_throughput) self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)