metrics: support customer labels specified in request header (#10143)

This commit is contained in:
Yingchun Lai
2025-09-15 11:00:08 +08:00
committed by GitHub
parent 8f6a175803
commit fc2c3a3d8e
11 changed files with 126 additions and 28 deletions

View File

@@ -306,12 +306,16 @@ class TokenizerManager(TokenizerCommunicatorMixin):
# Metrics
if self.enable_metrics:
labels = {
"model_name": self.server_args.served_model_name,
# TODO: Add lora name/path in the future,
}
if server_args.tokenizer_metrics_allowed_customer_labels:
for label in server_args.tokenizer_metrics_allowed_customer_labels:
labels[label] = ""
self.metrics_collector = TokenizerMetricsCollector(
server_args=server_args,
labels={
"model_name": self.server_args.served_model_name,
# TODO: Add lora name/path in the future,
},
labels=labels,
bucket_time_to_first_token=self.server_args.bucket_time_to_first_token,
bucket_e2e_request_latency=self.server_args.bucket_e2e_request_latency,
bucket_inter_token_latency=self.server_args.bucket_inter_token_latency,
@@ -1036,7 +1040,6 @@ class TokenizerManager(TokenizerCommunicatorMixin):
return
req = AbortReq(rid, abort_all)
self.send_to_scheduler.send_pyobj(req)
if self.enable_metrics:
self.metrics_collector.observe_one_aborted_request()
@@ -1616,6 +1619,12 @@ class TokenizerManager(TokenizerCommunicatorMixin):
else 0
)
customer_labels = getattr(state.obj, "customer_labels", None)
labels = (
{**self.metrics_collector.labels, **customer_labels}
if customer_labels
else self.metrics_collector.labels
)
if (
state.first_token_time == 0.0
and self.disaggregation_mode != DisaggregationMode.PREFILL
@@ -1623,7 +1632,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
state.first_token_time = state.last_time = time.time()
state.last_completion_tokens = completion_tokens
self.metrics_collector.observe_time_to_first_token(
state.first_token_time - state.created_time
labels, state.first_token_time - state.created_time
)
else:
num_new_tokens = completion_tokens - state.last_completion_tokens
@@ -1631,6 +1640,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
new_time = time.time()
interval = new_time - state.last_time
self.metrics_collector.observe_inter_token_latency(
labels,
interval,
num_new_tokens,
)
@@ -1645,6 +1655,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
or state.obj.sampling_params.get("structural_tag", None)
)
self.metrics_collector.observe_one_finished_request(
labels,
recv_obj.prompt_tokens[i],
completion_tokens,
recv_obj.cached_tokens[i],