[PD] Improve disaggregation metrics output: update the metrics so they keep reflecting real stats (#7317)
This commit is contained in:
@@ -334,6 +334,8 @@ class DecodePreallocQueue:
|
||||
error_message,
|
||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
|
||||
)
|
||||
if self.scheduler.enable_metrics:
|
||||
self.scheduler.metrics_collector.increment_bootstrap_failed_reqs()
|
||||
else:
|
||||
raise ValueError(f"Unexpected poll case: {poll}")
|
||||
|
||||
@@ -595,6 +597,8 @@ class DecodeTransferQueue:
|
||||
# unlock the kv cache, otherwise it will leak memory
|
||||
self.tree_cache.cache_finished_req(decode_req.req)
|
||||
indices_to_remove.add(i)
|
||||
if self.scheduler.enable_metrics:
|
||||
self.scheduler.metrics_collector.increment_transfer_failed_reqs()
|
||||
continue
|
||||
elif poll == KVPoll.Success:
|
||||
|
||||
|
||||
@@ -238,6 +238,8 @@ class PrefillBootstrapQueue:
|
||||
self.scheduler.stream_output([req], req.return_logprob)
|
||||
indices_to_remove.add(i)
|
||||
failed_reqs.append(req)
|
||||
if self.scheduler.enable_metrics:
|
||||
self.scheduler.metrics_collector.increment_bootstrap_failed_reqs()
|
||||
continue
|
||||
|
||||
# KV.WaitingForInput - init here
|
||||
@@ -522,6 +524,8 @@ class SchedulerDisaggregationPrefillMixin:
|
||||
req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
|
||||
)
|
||||
done_reqs.append(req)
|
||||
if self.enable_metrics:
|
||||
self.metrics_collector.increment_transfer_failed_reqs()
|
||||
else:
|
||||
assert False, f"Unexpected polling state {poll=}"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user