[router] add tokenizer metrics (#9307)

Co-authored-by: Chang Su <chang.s.su@oracle.com>
This commit is contained in:
Simo Lin
2025-08-18 09:25:51 -07:00
committed by GitHub
parent 4c0bb411e5
commit 24247b4168
5 changed files with 344 additions and 11 deletions

View File

@@ -148,6 +148,94 @@ pub fn init_metrics() {
"sgl_router_running_requests",
"Number of running requests per worker"
);
// Tokenizer metrics
describe_histogram!(
"sgl_tokenizer_encode_duration_seconds",
"Time to encode text to tokens"
);
describe_histogram!(
"sgl_tokenizer_decode_duration_seconds",
"Time to decode tokens to text"
);
describe_histogram!(
"sgl_tokenizer_encode_batch_duration_seconds",
"Time to encode a batch of texts"
);
describe_counter!(
"sgl_tokenizer_encode_requests_total",
"Total number of encode requests by tokenizer type"
);
describe_counter!(
"sgl_tokenizer_decode_requests_total",
"Total number of decode requests by tokenizer type"
);
describe_counter!(
"sgl_tokenizer_encode_errors_total",
"Total number of encode errors by error type"
);
describe_counter!(
"sgl_tokenizer_decode_errors_total",
"Total number of decode errors by error type"
);
describe_histogram!(
"sgl_tokenizer_tokens_per_encode",
"Number of tokens produced per encode operation"
);
describe_histogram!(
"sgl_tokenizer_chars_per_encode",
"Number of characters in input text per encode"
);
describe_histogram!(
"sgl_tokenizer_tokens_per_decode",
"Number of tokens decoded per operation"
);
describe_gauge!(
"sgl_tokenizer_vocab_size",
"Vocabulary size of the loaded tokenizer"
);
// Stop sequence detection metrics
describe_counter!(
"sgl_tokenizer_stop_sequences_detected_total",
"Total stop sequences detected by type"
);
describe_counter!(
"sgl_tokenizer_partial_matches_total",
"Total partial stop sequence matches (jailed text)"
);
describe_histogram!(
"sgl_tokenizer_stop_detection_duration_seconds",
"Time to check for stop sequences per token"
);
// Streaming decode metrics
describe_counter!(
"sgl_tokenizer_stream_tokens_total",
"Total tokens processed in streaming decode"
);
describe_counter!(
"sgl_tokenizer_stream_incomplete_utf8_total",
"Total incomplete UTF-8 sequences detected"
);
describe_histogram!(
"sgl_tokenizer_stream_step_duration_seconds",
"Time per streaming decode step"
);
// Factory metrics
describe_counter!(
"sgl_tokenizer_factory_loads_total",
"Total tokenizer loads by file type"
);
describe_counter!(
"sgl_tokenizer_factory_errors_total",
"Total tokenizer loading errors by type"
);
describe_histogram!(
"sgl_tokenizer_factory_load_duration_seconds",
"Time to load and initialize tokenizer"
);
}
pub fn start_prometheus(config: PrometheusConfig) {
@@ -177,6 +265,8 @@ pub fn start_prometheus(config: PrometheusConfig) {
/// Unit struct used as a namespace for router metric helpers, which are
/// invoked as associated functions (e.g. `RouterMetrics::record_request`).
pub struct RouterMetrics;
/// Unit struct used as a namespace for tokenizer metric helpers. It carries
/// no state; its helpers are associated functions that take no `self`.
pub struct TokenizerMetrics;
impl RouterMetrics {
// Request metrics
pub fn record_request(route: &str) {
@@ -384,6 +474,122 @@ impl RouterMetrics {
}
}
/// Recording helpers for the `sgl_tokenizer_*` metrics.
///
/// Every helper is an associated function (no `self`), so callers write
/// `TokenizerMetrics::record_…(…)` without constructing a value.
impl TokenizerMetrics {
    // Encoding metrics

    /// Increments `sgl_tokenizer_encode_requests_total` by one, labelled with
    /// `tokenizer_type`.
    pub fn record_encode_request(tokenizer_type: &str) {
        counter!("sgl_tokenizer_encode_requests_total",
            "tokenizer_type" => tokenizer_type.to_string()
        )
        .increment(1);
    }

    /// Records `duration`, in fractional seconds, into
    /// `sgl_tokenizer_encode_duration_seconds`.
    pub fn record_encode_duration(duration: Duration) {
        histogram!("sgl_tokenizer_encode_duration_seconds").record(duration.as_secs_f64());
    }

    /// Increments `sgl_tokenizer_encode_errors_total` by one, labelled with
    /// `error_type`.
    pub fn record_encode_error(error_type: &str) {
        counter!("sgl_tokenizer_encode_errors_total",
            "error_type" => error_type.to_string()
        )
        .increment(1);
    }

    /// Records `token_count` into `sgl_tokenizer_tokens_per_encode`.
    pub fn record_tokens_per_encode(token_count: usize) {
        histogram!("sgl_tokenizer_tokens_per_encode").record(token_count as f64);
    }

    /// Records `char_count` into `sgl_tokenizer_chars_per_encode`.
    pub fn record_chars_per_encode(char_count: usize) {
        histogram!("sgl_tokenizer_chars_per_encode").record(char_count as f64);
    }

    // Decoding metrics

    /// Increments `sgl_tokenizer_decode_requests_total` by one, labelled with
    /// `tokenizer_type`.
    pub fn record_decode_request(tokenizer_type: &str) {
        counter!("sgl_tokenizer_decode_requests_total",
            "tokenizer_type" => tokenizer_type.to_string()
        )
        .increment(1);
    }

    /// Records `duration`, in fractional seconds, into
    /// `sgl_tokenizer_decode_duration_seconds`.
    pub fn record_decode_duration(duration: Duration) {
        histogram!("sgl_tokenizer_decode_duration_seconds").record(duration.as_secs_f64());
    }

    /// Increments `sgl_tokenizer_decode_errors_total` by one, labelled with
    /// `error_type`.
    pub fn record_decode_error(error_type: &str) {
        counter!("sgl_tokenizer_decode_errors_total",
            "error_type" => error_type.to_string()
        )
        .increment(1);
    }

    /// Records `token_count` into `sgl_tokenizer_tokens_per_decode`.
    pub fn record_tokens_per_decode(token_count: usize) {
        histogram!("sgl_tokenizer_tokens_per_decode").record(token_count as f64);
    }

    // Batch encoding metrics

    /// Maps a batch size onto one of a small, fixed set of range labels.
    ///
    /// Using the raw size as a label value would create a separate histogram
    /// series for every distinct batch size ever observed; bucketing keeps the
    /// number of series bounded regardless of what callers pass in.
    fn batch_size_bucket(batch_size: usize) -> &'static str {
        match batch_size {
            0 => "0",
            1 => "1",
            2..=4 => "2-4",
            5..=8 => "5-8",
            9..=16 => "9-16",
            17..=32 => "17-32",
            33..=64 => "33-64",
            65..=128 => "65-128",
            _ => "129+",
        }
    }

    /// Records `duration`, in fractional seconds, into
    /// `sgl_tokenizer_encode_batch_duration_seconds`.
    ///
    /// The `batch_size` label carries the size range that `batch_size` falls
    /// into (for example `"9-16"`), not the exact number, so label cardinality
    /// stays bounded.
    pub fn record_encode_batch_duration(duration: Duration, batch_size: usize) {
        histogram!("sgl_tokenizer_encode_batch_duration_seconds",
            "batch_size" => Self::batch_size_bucket(batch_size)
        )
        .record(duration.as_secs_f64());
    }

    // Stop sequence detection metrics

    /// Increments `sgl_tokenizer_stop_sequences_detected_total` by one,
    /// labelled `type` = `stop_type`.
    pub fn record_stop_sequence_detected(stop_type: &str) {
        counter!("sgl_tokenizer_stop_sequences_detected_total",
            "type" => stop_type.to_string()
        )
        .increment(1);
    }

    /// Increments `sgl_tokenizer_partial_matches_total` by one.
    pub fn record_partial_match() {
        counter!("sgl_tokenizer_partial_matches_total").increment(1);
    }

    /// Records `duration`, in fractional seconds, into
    /// `sgl_tokenizer_stop_detection_duration_seconds`.
    pub fn record_stop_detection_duration(duration: Duration) {
        histogram!("sgl_tokenizer_stop_detection_duration_seconds").record(duration.as_secs_f64());
    }

    // Streaming decode metrics

    /// Increments `sgl_tokenizer_stream_tokens_total` by one.
    pub fn record_stream_token() {
        counter!("sgl_tokenizer_stream_tokens_total").increment(1);
    }

    /// Increments `sgl_tokenizer_stream_incomplete_utf8_total` by one.
    pub fn record_incomplete_utf8() {
        counter!("sgl_tokenizer_stream_incomplete_utf8_total").increment(1);
    }

    /// Records `duration`, in fractional seconds, into
    /// `sgl_tokenizer_stream_step_duration_seconds`.
    pub fn record_stream_step_duration(duration: Duration) {
        histogram!("sgl_tokenizer_stream_step_duration_seconds").record(duration.as_secs_f64());
    }

    // Factory metrics

    /// Increments `sgl_tokenizer_factory_loads_total` by one, labelled with
    /// `file_type`.
    pub fn record_factory_load(file_type: &str) {
        counter!("sgl_tokenizer_factory_loads_total",
            "file_type" => file_type.to_string()
        )
        .increment(1);
    }

    /// Increments `sgl_tokenizer_factory_errors_total` by one, labelled with
    /// `error_type`.
    pub fn record_factory_error(error_type: &str) {
        counter!("sgl_tokenizer_factory_errors_total",
            "error_type" => error_type.to_string()
        )
        .increment(1);
    }

    /// Records `duration`, in fractional seconds, into
    /// `sgl_tokenizer_factory_load_duration_seconds`.
    pub fn record_factory_load_duration(duration: Duration) {
        histogram!("sgl_tokenizer_factory_load_duration_seconds").record(duration.as_secs_f64());
    }

    // Vocabulary metrics

    /// Sets the `sgl_tokenizer_vocab_size` gauge, labelled with
    /// `tokenizer_type`, to `size`.
    pub fn set_vocab_size(tokenizer_type: &str, size: usize) {
        gauge!("sgl_tokenizer_vocab_size",
            "tokenizer_type" => tokenizer_type.to_string()
        )
        .set(size as f64);
    }
}
#[cfg(test)]
mod tests {
use super::*;
@@ -646,6 +852,46 @@ mod tests {
RouterMetrics::set_running_requests("http://worker1", 15);
}
#[test]
fn test_tokenizer_metrics_static_methods() {
    // Smoke test: each TokenizerMetrics helper is invoked once (twice for the
    // stop-sequence counter); the test passes as long as none of them panics.
    let tokenizer_kind = "huggingface";

    // Encode path
    let encode_elapsed = Duration::from_millis(10);
    TokenizerMetrics::record_encode_request(tokenizer_kind);
    TokenizerMetrics::record_encode_duration(encode_elapsed);
    TokenizerMetrics::record_encode_error("invalid_input");
    TokenizerMetrics::record_tokens_per_encode(100);
    TokenizerMetrics::record_chars_per_encode(500);

    // Decode path
    let decode_elapsed = Duration::from_millis(5);
    TokenizerMetrics::record_decode_request(tokenizer_kind);
    TokenizerMetrics::record_decode_duration(decode_elapsed);
    TokenizerMetrics::record_decode_error("invalid_tokens");
    TokenizerMetrics::record_tokens_per_decode(50);

    // Batched encode
    let batch_elapsed = Duration::from_millis(100);
    TokenizerMetrics::record_encode_batch_duration(batch_elapsed, 10);

    // Stop-sequence detection, once per stop type
    for stop_kind in ["token", "string"] {
        TokenizerMetrics::record_stop_sequence_detected(stop_kind);
    }
    TokenizerMetrics::record_partial_match();
    TokenizerMetrics::record_stop_detection_duration(Duration::from_micros(100));

    // Streaming decode
    TokenizerMetrics::record_stream_token();
    TokenizerMetrics::record_incomplete_utf8();
    TokenizerMetrics::record_stream_step_duration(Duration::from_micros(50));

    // Tokenizer factory
    let load_elapsed = Duration::from_millis(200);
    TokenizerMetrics::record_factory_load("json");
    TokenizerMetrics::record_factory_error("unsupported_format");
    TokenizerMetrics::record_factory_load_duration(load_elapsed);

    // Vocabulary gauge
    TokenizerMetrics::set_vocab_size(tokenizer_kind, 50000);
}
// ============= Port Availability Tests =============
#[test]