[PD] Improve disaggregation metrics output: update the metrics to keep reflecting real stats (#7317)

This commit is contained in:
SCDESPERTATE
2025-08-25 14:16:43 +08:00
committed by GitHub
parent ca4b86c564
commit b5c6529e17
4 changed files with 28 additions and 5 deletions

View File

@@ -334,6 +334,8 @@ class DecodePreallocQueue:
error_message,
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
)
if self.scheduler.enable_metrics:
self.scheduler.metrics_collector.increment_bootstrap_failed_reqs()
else:
raise ValueError(f"Unexpected poll case: {poll}")
@@ -595,6 +597,8 @@ class DecodeTransferQueue:
# unlock the kv cache, otherwise it will leak memory
self.tree_cache.cache_finished_req(decode_req.req)
indices_to_remove.add(i)
if self.scheduler.enable_metrics:
self.scheduler.metrics_collector.increment_transfer_failed_reqs()
continue
elif poll == KVPoll.Success:

View File

@@ -238,6 +238,8 @@ class PrefillBootstrapQueue:
self.scheduler.stream_output([req], req.return_logprob)
indices_to_remove.add(i)
failed_reqs.append(req)
if self.scheduler.enable_metrics:
self.scheduler.metrics_collector.increment_bootstrap_failed_reqs()
continue
# KV.WaitingForInput - init here
@@ -522,6 +524,8 @@ class SchedulerDisaggregationPrefillMixin:
req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
)
done_reqs.append(req)
if self.enable_metrics:
self.metrics_collector.increment_transfer_failed_reqs()
else:
assert False, f"Unexpected polling state {poll=}"