feat: add production metric for retracted requests due to insufficient kvcache (#7030)
Signed-off-by: Zhao Chen <zhaochen.zju@gmail.com>
This commit is contained in:
@@ -675,6 +675,7 @@ class Scheduler(
|
|||||||
self.spec_num_total_forward_ct = 0
|
self.spec_num_total_forward_ct = 0
|
||||||
self.cum_spec_accept_length = 0
|
self.cum_spec_accept_length = 0
|
||||||
self.cum_spec_accept_count = 0
|
self.cum_spec_accept_count = 0
|
||||||
|
self.total_retracted_reqs = 0
|
||||||
self.stats = SchedulerStats()
|
self.stats = SchedulerStats()
|
||||||
if self.enable_metrics:
|
if self.enable_metrics:
|
||||||
engine_type = "unified"
|
engine_type = "unified"
|
||||||
@@ -1477,6 +1478,7 @@ class Scheduler(
|
|||||||
self.stats.num_queue_reqs = len(self.waiting_queue)
|
self.stats.num_queue_reqs = len(self.waiting_queue)
|
||||||
self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
|
self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
|
||||||
self.stats.spec_accept_length = spec_accept_length
|
self.stats.spec_accept_length = spec_accept_length
|
||||||
|
self.stats.total_retracted_reqs = self.total_retracted_reqs
|
||||||
self.metrics_collector.log_stats(self.stats)
|
self.metrics_collector.log_stats(self.stats)
|
||||||
self._emit_kv_metrics()
|
self._emit_kv_metrics()
|
||||||
self._publish_kv_events()
|
self._publish_kv_events()
|
||||||
@@ -1824,14 +1826,17 @@ class Scheduler(
|
|||||||
old_ratio = self.new_token_ratio
|
old_ratio = self.new_token_ratio
|
||||||
|
|
||||||
retracted_reqs, new_token_ratio = batch.retract_decode(self.server_args)
|
retracted_reqs, new_token_ratio = batch.retract_decode(self.server_args)
|
||||||
|
num_retracted_reqs = len(retracted_reqs)
|
||||||
self.new_token_ratio = new_token_ratio
|
self.new_token_ratio = new_token_ratio
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"KV cache pool is full. Retract requests. "
|
"KV cache pool is full. Retract requests. "
|
||||||
f"#retracted_reqs: {len(retracted_reqs)}, "
|
f"#retracted_reqs: {num_retracted_reqs}, "
|
||||||
f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
|
f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
|
||||||
)
|
)
|
||||||
|
|
||||||
self._extend_requests_to_queue(retracted_reqs, is_retracted=True)
|
self._extend_requests_to_queue(retracted_reqs, is_retracted=True)
|
||||||
|
self.total_retracted_reqs += num_retracted_reqs
|
||||||
else:
|
else:
|
||||||
self.new_token_ratio = max(
|
self.new_token_ratio = max(
|
||||||
self.new_token_ratio - self.new_token_ratio_decay,
|
self.new_token_ratio - self.new_token_ratio_decay,
|
||||||
|
|||||||
@@ -145,6 +145,7 @@ class SchedulerStats:
|
|||||||
num_prefill_infight_queue_reqs: int = 0
|
num_prefill_infight_queue_reqs: int = 0
|
||||||
num_decode_prealloc_queue_reqs: int = 0
|
num_decode_prealloc_queue_reqs: int = 0
|
||||||
num_decode_transfer_queue_reqs: int = 0
|
num_decode_transfer_queue_reqs: int = 0
|
||||||
|
total_retracted_reqs: int = 0
|
||||||
|
|
||||||
|
|
||||||
class SchedulerMetricsCollector:
|
class SchedulerMetricsCollector:
|
||||||
@@ -219,6 +220,13 @@ class SchedulerMetricsCollector:
|
|||||||
multiprocess_mode="mostrecent",
|
multiprocess_mode="mostrecent",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.total_retracted_reqs = Gauge(
|
||||||
|
name="sglang:total_retracted_reqs",
|
||||||
|
documentation="The total number of retracted requests due to kvcache full.",
|
||||||
|
labelnames=labels.keys(),
|
||||||
|
multiprocess_mode="mostrecent",
|
||||||
|
)
|
||||||
|
|
||||||
# Disaggregation queue metrics
|
# Disaggregation queue metrics
|
||||||
self.num_prefill_prealloc_queue_reqs = Gauge(
|
self.num_prefill_prealloc_queue_reqs = Gauge(
|
||||||
name="sglang:num_prefill_prealloc_queue_reqs",
|
name="sglang:num_prefill_prealloc_queue_reqs",
|
||||||
@@ -279,6 +287,7 @@ class SchedulerMetricsCollector:
|
|||||||
self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
|
self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
|
||||||
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
|
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
|
||||||
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
|
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
|
||||||
|
self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
|
||||||
|
|
||||||
# Disaggregation metrics
|
# Disaggregation metrics
|
||||||
self._log_gauge(
|
self._log_gauge(
|
||||||
|
|||||||
Reference in New Issue
Block a user