diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index ab966f924..874ed60f0 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -675,6 +675,7 @@ class Scheduler( self.spec_num_total_forward_ct = 0 self.cum_spec_accept_length = 0 self.cum_spec_accept_count = 0 + self.total_retracted_reqs = 0 self.stats = SchedulerStats() if self.enable_metrics: engine_type = "unified" @@ -1477,6 +1478,7 @@ class Scheduler( self.stats.num_queue_reqs = len(self.waiting_queue) self.stats.num_grammar_queue_reqs = len(self.grammar_queue) self.stats.spec_accept_length = spec_accept_length + self.stats.total_retracted_reqs = self.total_retracted_reqs self.metrics_collector.log_stats(self.stats) self._emit_kv_metrics() self._publish_kv_events() @@ -1824,14 +1826,17 @@ class Scheduler( old_ratio = self.new_token_ratio retracted_reqs, new_token_ratio = batch.retract_decode(self.server_args) + num_retracted_reqs = len(retracted_reqs) self.new_token_ratio = new_token_ratio logger.info( "KV cache pool is full. Retract requests. " - f"#retracted_reqs: {len(retracted_reqs)}, " + f"#retracted_reqs: {num_retracted_reqs}, " f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}" ) + self._extend_requests_to_queue(retracted_reqs, is_retracted=True) + self.total_retracted_reqs += num_retracted_reqs else: self.new_token_ratio = max( self.new_token_ratio - self.new_token_ratio_decay, diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py index f8dac4472..4c32b8fc6 100644 --- a/python/sglang/srt/metrics/collector.py +++ b/python/sglang/srt/metrics/collector.py @@ -145,6 +145,7 @@ class SchedulerStats: num_prefill_infight_queue_reqs: int = 0 num_decode_prealloc_queue_reqs: int = 0 num_decode_transfer_queue_reqs: int = 0 + total_retracted_reqs: int = 0 class SchedulerMetricsCollector: @@ -219,6 +220,13 @@ class SchedulerMetricsCollector: multiprocess_mode="mostrecent", ) + self.total_retracted_reqs = Gauge( + name="sglang:total_retracted_reqs", + documentation="The total number of retracted requests due to kvcache full.", + labelnames=labels.keys(), + multiprocess_mode="mostrecent", + ) + # Disaggregation queue metrics self.num_prefill_prealloc_queue_reqs = Gauge( name="sglang:num_prefill_prealloc_queue_reqs", @@ -279,6 +287,7 @@ class SchedulerMetricsCollector: self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs) self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate) self._log_gauge(self.spec_accept_length, stats.spec_accept_length) + self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs) # Disaggregation metrics self._log_gauge(