[router] add tokenizer metrics (#9307)

Co-authored-by: Chang Su <chang.s.su@oracle.com>
2025-08-18 09:25:51 -07:00
parent 4c0bb411e5
commit 24247b4168
5 changed files with 344 additions and 11 deletions
--- a/sgl-router/src/metrics.rs
+++ b/sgl-router/src/metrics.rs
@@ -148,6 +148,94 @@ pub fn init_metrics() {
        "sgl_router_running_requests",
        "Number of running requests per worker"
    );
+
+    // Tokenizer metrics
+    describe_histogram!(
+        "sgl_tokenizer_encode_duration_seconds",
+        "Time to encode text to tokens"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_decode_duration_seconds",
+        "Time to decode tokens to text"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_encode_batch_duration_seconds",
+        "Time to encode a batch of texts"
+    );
+    describe_counter!(
+        "sgl_tokenizer_encode_requests_total",
+        "Total number of encode requests by tokenizer type"
+    );
+    describe_counter!(
+        "sgl_tokenizer_decode_requests_total",
+        "Total number of decode requests by tokenizer type"
+    );
+    describe_counter!(
+        "sgl_tokenizer_encode_errors_total",
+        "Total number of encode errors by error type"
+    );
+    describe_counter!(
+        "sgl_tokenizer_decode_errors_total",
+        "Total number of decode errors by error type"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_tokens_per_encode",
+        "Number of tokens produced per encode operation"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_chars_per_encode",
+        "Number of characters in input text per encode"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_tokens_per_decode",
+        "Number of tokens decoded per operation"
+    );
+    describe_gauge!(
+        "sgl_tokenizer_vocab_size",
+        "Vocabulary size of the loaded tokenizer"
+    );
+
+    // Stop sequence detection metrics
+    describe_counter!(
+        "sgl_tokenizer_stop_sequences_detected_total",
+        "Total stop sequences detected by type"
+    );
+    describe_counter!(
+        "sgl_tokenizer_partial_matches_total",
+        "Total partial stop sequence matches (jailed text)"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_stop_detection_duration_seconds",
+        "Time to check for stop sequences per token"
+    );
+
+    // Streaming decode metrics
+    describe_counter!(
+        "sgl_tokenizer_stream_tokens_total",
+        "Total tokens processed in streaming decode"
+    );
+    describe_counter!(
+        "sgl_tokenizer_stream_incomplete_utf8_total",
+        "Total incomplete UTF-8 sequences detected"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_stream_step_duration_seconds",
+        "Time per streaming decode step"
+    );
+
+    // Factory metrics
+    describe_counter!(
+        "sgl_tokenizer_factory_loads_total",
+        "Total tokenizer loads by file type"
+    );
+    describe_counter!(
+        "sgl_tokenizer_factory_errors_total",
+        "Total tokenizer loading errors by type"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_factory_load_duration_seconds",
+        "Time to load and initialize tokenizer"
+    );
 }

 pub fn start_prometheus(config: PrometheusConfig) {
@@ -177,6 +265,8 @@ pub fn start_prometheus(config: PrometheusConfig) {

 pub struct RouterMetrics;

+/// Unit struct that serves as a namespace for the tokenizer metric helpers.
+///
+/// It holds no data; the recording functions are associated functions
+/// defined in `impl TokenizerMetrics`.
+pub struct TokenizerMetrics;
+
 impl RouterMetrics {
    // Request metrics
    pub fn record_request(route: &str) {
@@ -384,6 +474,122 @@ impl RouterMetrics {
    }
 }

+/// Recording helpers for the `sgl_tokenizer_*` metrics.
+///
+/// Every helper is an associated function (no `self`) that forwards to the
+/// `counter!`, `histogram!` or `gauge!` macro for a single metric name.
+impl TokenizerMetrics {
+    // Encoding metrics
+
+    /// Increments `sgl_tokenizer_encode_requests_total` by one, labelled with
+    /// `tokenizer_type`.
+    pub fn record_encode_request(tokenizer_type: &str) {
+        counter!("sgl_tokenizer_encode_requests_total",
+            "tokenizer_type" => tokenizer_type.to_string()
+        )
+        .increment(1);
+    }
+
+    /// Records `duration`, in seconds, into the
+    /// `sgl_tokenizer_encode_duration_seconds` histogram.
+    pub fn record_encode_duration(duration: Duration) {
+        histogram!("sgl_tokenizer_encode_duration_seconds").record(duration.as_secs_f64());
+    }
+
+    /// Increments `sgl_tokenizer_encode_errors_total` by one, labelled with
+    /// `error_type`.
+    pub fn record_encode_error(error_type: &str) {
+        counter!("sgl_tokenizer_encode_errors_total",
+            "error_type" => error_type.to_string()
+        )
+        .increment(1);
+    }
+
+    /// Records `token_count` into the `sgl_tokenizer_tokens_per_encode`
+    /// histogram.
+    pub fn record_tokens_per_encode(token_count: usize) {
+        histogram!("sgl_tokenizer_tokens_per_encode").record(token_count as f64);
+    }
+
+    /// Records `char_count` into the `sgl_tokenizer_chars_per_encode`
+    /// histogram.
+    pub fn record_chars_per_encode(char_count: usize) {
+        histogram!("sgl_tokenizer_chars_per_encode").record(char_count as f64);
+    }
+
+    // Decoding metrics
+
+    /// Increments `sgl_tokenizer_decode_requests_total` by one, labelled with
+    /// `tokenizer_type`.
+    pub fn record_decode_request(tokenizer_type: &str) {
+        counter!("sgl_tokenizer_decode_requests_total",
+            "tokenizer_type" => tokenizer_type.to_string()
+        )
+        .increment(1);
+    }
+
+    /// Records `duration`, in seconds, into the
+    /// `sgl_tokenizer_decode_duration_seconds` histogram.
+    pub fn record_decode_duration(duration: Duration) {
+        histogram!("sgl_tokenizer_decode_duration_seconds").record(duration.as_secs_f64());
+    }
+
+    /// Increments `sgl_tokenizer_decode_errors_total` by one, labelled with
+    /// `error_type`.
+    pub fn record_decode_error(error_type: &str) {
+        counter!("sgl_tokenizer_decode_errors_total",
+            "error_type" => error_type.to_string()
+        )
+        .increment(1);
+    }
+
+    /// Records `token_count` into the `sgl_tokenizer_tokens_per_decode`
+    /// histogram.
+    pub fn record_tokens_per_decode(token_count: usize) {
+        histogram!("sgl_tokenizer_tokens_per_decode").record(token_count as f64);
+    }
+
+    // Batch encoding metrics
+
+    /// Maps a batch size onto one of a fixed set of range labels.
+    ///
+    /// Using the raw size as a label value would create a separate histogram
+    /// series for every distinct batch size ever observed. A closed set of
+    /// ranges keeps the number of series bounded, and the `&'static str`
+    /// labels avoid allocating a `String` per call.
+    fn batch_size_bucket(batch_size: usize) -> &'static str {
+        match batch_size {
+            0 => "0",
+            1 => "1",
+            2..=4 => "2-4",
+            5..=8 => "5-8",
+            9..=16 => "9-16",
+            17..=32 => "17-32",
+            33..=64 => "33-64",
+            65..=128 => "65-128",
+            129..=256 => "129-256",
+            _ => "257+",
+        }
+    }
+
+    /// Records `duration`, in seconds, into the
+    /// `sgl_tokenizer_encode_batch_duration_seconds` histogram.
+    ///
+    /// The `batch_size` label carries a size range (for example `"9-16"`)
+    /// rather than the exact size, so the label cardinality stays bounded.
+    pub fn record_encode_batch_duration(duration: Duration, batch_size: usize) {
+        histogram!("sgl_tokenizer_encode_batch_duration_seconds",
+            "batch_size" => Self::batch_size_bucket(batch_size)
+        )
+        .record(duration.as_secs_f64());
+    }
+
+    // Stop sequence detection metrics
+
+    /// Increments `sgl_tokenizer_stop_sequences_detected_total` by one,
+    /// labelled `type` = `stop_type`.
+    pub fn record_stop_sequence_detected(stop_type: &str) {
+        counter!("sgl_tokenizer_stop_sequences_detected_total",
+            "type" => stop_type.to_string()
+        )
+        .increment(1);
+    }
+
+    /// Increments `sgl_tokenizer_partial_matches_total` by one.
+    pub fn record_partial_match() {
+        counter!("sgl_tokenizer_partial_matches_total").increment(1);
+    }
+
+    /// Records `duration`, in seconds, into the
+    /// `sgl_tokenizer_stop_detection_duration_seconds` histogram.
+    pub fn record_stop_detection_duration(duration: Duration) {
+        histogram!("sgl_tokenizer_stop_detection_duration_seconds").record(duration.as_secs_f64());
+    }
+
+    // Streaming decode metrics
+
+    /// Increments `sgl_tokenizer_stream_tokens_total` by one.
+    pub fn record_stream_token() {
+        counter!("sgl_tokenizer_stream_tokens_total").increment(1);
+    }
+
+    /// Increments `sgl_tokenizer_stream_incomplete_utf8_total` by one.
+    pub fn record_incomplete_utf8() {
+        counter!("sgl_tokenizer_stream_incomplete_utf8_total").increment(1);
+    }
+
+    /// Records `duration`, in seconds, into the
+    /// `sgl_tokenizer_stream_step_duration_seconds` histogram.
+    pub fn record_stream_step_duration(duration: Duration) {
+        histogram!("sgl_tokenizer_stream_step_duration_seconds").record(duration.as_secs_f64());
+    }
+
+    // Factory metrics
+
+    /// Increments `sgl_tokenizer_factory_loads_total` by one, labelled with
+    /// `file_type`.
+    pub fn record_factory_load(file_type: &str) {
+        counter!("sgl_tokenizer_factory_loads_total",
+            "file_type" => file_type.to_string()
+        )
+        .increment(1);
+    }
+
+    /// Increments `sgl_tokenizer_factory_errors_total` by one, labelled with
+    /// `error_type`.
+    pub fn record_factory_error(error_type: &str) {
+        counter!("sgl_tokenizer_factory_errors_total",
+            "error_type" => error_type.to_string()
+        )
+        .increment(1);
+    }
+
+    /// Records `duration`, in seconds, into the
+    /// `sgl_tokenizer_factory_load_duration_seconds` histogram.
+    pub fn record_factory_load_duration(duration: Duration) {
+        histogram!("sgl_tokenizer_factory_load_duration_seconds").record(duration.as_secs_f64());
+    }
+
+    // Vocabulary metrics
+
+    /// Sets the `sgl_tokenizer_vocab_size` gauge for `tokenizer_type` to
+    /// `size`.
+    pub fn set_vocab_size(tokenizer_type: &str, size: usize) {
+        gauge!("sgl_tokenizer_vocab_size",
+            "tokenizer_type" => tokenizer_type.to_string()
+        )
+        .set(size as f64);
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -646,6 +852,46 @@ mod tests {
        RouterMetrics::set_running_requests("http://worker1", 15);
    }

+    #[test]
+    fn test_tokenizer_metrics_static_methods() {
+        // Smoke test: each TokenizerMetrics helper is invoked once (twice for
+        // stop-sequence detection) and must return without panicking.
+        let tokenizer = "huggingface";
+
+        // Encode path
+        TokenizerMetrics::record_encode_request(tokenizer);
+        TokenizerMetrics::record_encode_duration(Duration::from_millis(10));
+        TokenizerMetrics::record_encode_error("invalid_input");
+        TokenizerMetrics::record_tokens_per_encode(100);
+        TokenizerMetrics::record_chars_per_encode(500);
+
+        // Decode path
+        TokenizerMetrics::record_decode_request(tokenizer);
+        TokenizerMetrics::record_decode_duration(Duration::from_millis(5));
+        TokenizerMetrics::record_decode_error("invalid_tokens");
+        TokenizerMetrics::record_tokens_per_decode(50);
+
+        // Batched encode
+        let batch_elapsed = Duration::from_millis(100);
+        TokenizerMetrics::record_encode_batch_duration(batch_elapsed, 10);
+
+        // Stop sequence detection, one call per stop type
+        for stop_type in ["token", "string"] {
+            TokenizerMetrics::record_stop_sequence_detected(stop_type);
+        }
+        TokenizerMetrics::record_partial_match();
+        TokenizerMetrics::record_stop_detection_duration(Duration::from_micros(100));
+
+        // Streaming decode
+        TokenizerMetrics::record_stream_token();
+        TokenizerMetrics::record_incomplete_utf8();
+        TokenizerMetrics::record_stream_step_duration(Duration::from_micros(50));
+
+        // Tokenizer factory
+        TokenizerMetrics::record_factory_load("json");
+        TokenizerMetrics::record_factory_error("unsupported_format");
+        TokenizerMetrics::record_factory_load_duration(Duration::from_millis(200));
+
+        // Vocabulary size gauge
+        TokenizerMetrics::set_vocab_size(tokenizer, 50000);
+    }
+
    // ============= Port Availability Tests =============

    #[test]