diff --git a/sgl-router/src/tokenizer/factory.rs b/sgl-router/src/tokenizer/factory.rs
index fb6bef510..6c938b26c 100644
--- a/sgl-router/src/tokenizer/factory.rs
+++ b/sgl-router/src/tokenizer/factory.rs
@@ -1,11 +1,9 @@
-use super::traits::{self, Tokenizer as TokenizerTrait};
-use crate::metrics::TokenizerMetrics;
+use super::traits;
 use anyhow::{Error, Result};
 use std::fs::File;
 use std::io::Read;
 use std::path::Path;
 use std::sync::Arc;
-use std::time::Instant;
 
 #[cfg(feature = "huggingface")]
 use super::huggingface::HuggingFaceTokenizer;
@@ -34,8 +32,6 @@ pub fn create_tokenizer_with_chat_template(
     file_path: &str,
     chat_template_path: Option<&str>,
 ) -> Result<Arc<dyn traits::Tokenizer>> {
-    let start_time = Instant::now();
-
     // Special case for testing
     if file_path == "mock" || file_path == "test" {
         return Ok(Arc::new(super::mock::MockTokenizer::new()));
@@ -45,7 +41,6 @@ pub fn create_tokenizer_with_chat_template(
 
     // Check if file exists
     if !path.exists() {
-        TokenizerMetrics::record_factory_error("file_not_found");
         return Err(Error::msg(format!("File not found: {}", file_path)));
     }
@@ -64,14 +59,10 @@ pub fn create_tokenizer_with_chat_template(
                 chat_template_path,
             )?;
 
-            TokenizerMetrics::record_factory_load("json");
-            TokenizerMetrics::set_vocab_size("huggingface", tokenizer.vocab_size());
-
             Ok(Arc::new(tokenizer) as Arc<dyn traits::Tokenizer>)
         }
         #[cfg(not(feature = "huggingface"))]
         {
-            TokenizerMetrics::record_factory_error("huggingface_disabled");
             Err(Error::msg(
                 "HuggingFace support not enabled. Enable the 'huggingface' feature.",
             ))
@@ -79,26 +70,18 @@ pub fn create_tokenizer_with_chat_template(
         }
         Some("model") => {
             // SentencePiece model file
-            TokenizerMetrics::record_factory_error("unsupported_sentencepiece");
             Err(Error::msg("SentencePiece models not yet supported"))
         }
         Some("gguf") => {
             // GGUF format
-            TokenizerMetrics::record_factory_error("unsupported_gguf");
             Err(Error::msg("GGUF format not yet supported"))
         }
         _ => {
             // Try to auto-detect by reading file content
-            auto_detect_tokenizer(file_path).inspect(|tokenizer| {
-                TokenizerMetrics::record_factory_load("auto_detected");
-                TokenizerMetrics::set_vocab_size("auto_detected", tokenizer.vocab_size());
-            })
+            auto_detect_tokenizer(file_path)
         }
     };
 
-    if result.is_ok() {
-        TokenizerMetrics::record_factory_load_duration(start_time.elapsed());
-    }
 
     result
 }
@@ -190,8 +173,6 @@ pub fn create_tokenizer(model_name_or_path: &str) -> Result
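With the metrics plumbing gone, the factory just dispatches on the file extension and returns the tokenizer. A minimal caller sketch, assuming `create_tokenizer` is in scope (its exact module path depends on how the crate re-exports it) and a local `tokenizer.json` exists; `encode`, `decode`, and `token_ids` are taken from this diff and the tests below:

```rust
use anyhow::Result;

fn load_and_roundtrip(path: &str) -> Result<()> {
    // "json" loads a HuggingFace tokenizer; unknown extensions fall
    // through to content-based auto-detection.
    let tokenizer = create_tokenizer(path)?;

    let encoding = tokenizer.encode("Hello, world!")?;
    // `token_ids()` now borrows the ids (see the traits.rs hunk below).
    let decoded = tokenizer.decode(encoding.token_ids(), false)?;
    assert_eq!(decoded, "Hello, world!");
    Ok(())
}
```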
diff --git a/sgl-router/src/tokenizer/huggingface.rs b/sgl-router/src/tokenizer/huggingface.rs
--- a/sgl-router/src/tokenizer/huggingface.rs
+++ b/sgl-router/src/tokenizer/huggingface.rs
@@ ... @@ impl Encoder for HuggingFaceTokenizer
     fn encode(&self, input: &str) -> Result<Encoding> {
-        let start = Instant::now();
-
-        TokenizerMetrics::record_encode_request("huggingface");
-        TokenizerMetrics::record_chars_per_encode(input.len());
-
         self.tokenizer
             .encode(input, false)
-            .map_err(|e| {
-                TokenizerMetrics::record_encode_error("encoding_failed");
-                Error::msg(format!("Encoding failed: {}", e))
-            })
-            .map(|encoding| {
-                TokenizerMetrics::record_tokens_per_encode(encoding.get_ids().len());
-                TokenizerMetrics::record_encode_duration(start.elapsed());
-                Encoding::Hf(Box::new(encoding))
-            })
+            .map_err(|e| Error::msg(format!("Encoding failed: {}", e)))
+            .map(|encoding| Encoding::Hf(Box::new(encoding)))
     }
 
     fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>> {
-        let start = Instant::now();
-
         let encodings = self
             .tokenizer
             .encode_batch(inputs.to_vec(), false)
-            .map_err(|e| {
-                TokenizerMetrics::record_encode_error("batch_encoding_failed");
-                Error::msg(format!("Batch encoding failed: {}", e))
-            })?;
-
-        TokenizerMetrics::record_encode_batch_duration(start.elapsed(), inputs.len());
+            .map_err(|e| Error::msg(format!("Batch encoding failed: {}", e)))?;
 
         Ok(encodings
             .into_iter()
@@ -236,20 +215,9 @@ impl Encoder for HuggingFaceTokenizer {
 
 impl Decoder for HuggingFaceTokenizer {
     fn decode(&self, token_ids: &[TokenIdType], skip_special_tokens: bool) -> Result<String> {
-        let start = Instant::now();
-
-        TokenizerMetrics::record_decode_request("huggingface");
-        TokenizerMetrics::record_tokens_per_decode(token_ids.len());
-
         self.tokenizer
             .decode(token_ids, skip_special_tokens)
-            .map_err(|e| {
-                TokenizerMetrics::record_decode_error("decoding_failed");
-                Error::msg(format!("Decoding failed: {}", e))
-            })
-            .inspect(|_| {
-                TokenizerMetrics::record_decode_duration(start.elapsed());
-            })
+            .map_err(|e| Error::msg(format!("Decoding failed: {}", e)))
     }
 }
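`encode`, `encode_batch`, and `decode` now do nothing beyond translating `tokenizers` errors into `anyhow::Error`. Callers that still want the removed latency numbers can measure around the call instead; a minimal sketch, assuming `traits::Tokenizer` is imported as `Tokenizer` and `tokenizer` came from the factory above:

```rust
use std::time::Instant;

fn timed_encode(tokenizer: &dyn Tokenizer, input: &str) -> anyhow::Result<()> {
    // Timing moves to the call site instead of living inside the hot path.
    let start = Instant::now();
    let encoding = tokenizer.encode(input)?;
    eprintln!(
        "encoded {} chars into {} tokens in {:?}",
        input.len(),
        encoding.token_ids().len(),
        start.elapsed()
    );
    Ok(())
}
```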
diff --git a/sgl-router/src/tokenizer/stop.rs b/sgl-router/src/tokenizer/stop.rs
index 69376e20b..1efda15b6 100644
--- a/sgl-router/src/tokenizer/stop.rs
+++ b/sgl-router/src/tokenizer/stop.rs
@@ -1,9 +1,7 @@
 use super::traits::{self, TokenIdType};
-use crate::metrics::TokenizerMetrics;
 use anyhow::Result;
 use std::collections::HashSet;
 use std::sync::Arc;
-use std::time::Instant;
 
 /// Output from the sequence decoder
 #[derive(Debug, Clone, PartialEq)]
@@ -95,8 +93,6 @@ impl StopSequenceDecoder {
 
     /// Process a single token
     pub fn process_token(&mut self, token_id: TokenIdType) -> Result<SequenceDecoderOutput> {
-        let start = Instant::now();
-
         if self.stopped {
             return Ok(SequenceDecoderOutput::Stopped);
         }
@@ -104,22 +100,18 @@ impl StopSequenceDecoder {
         // Check for token-level stops first
         if self.config.stop_tokens.contains(&token_id) {
             self.stopped = true;
-            TokenizerMetrics::record_stop_sequence_detected("token");
 
             // Flush any jailed text before stopping
             if !self.jail_buffer.is_empty() {
                 let output = self.jail_buffer.clone();
                 self.jail_buffer.clear();
-                TokenizerMetrics::record_stop_detection_duration(start.elapsed());
                 return Ok(SequenceDecoderOutput::StoppedWithText(output));
             }
-            TokenizerMetrics::record_stop_detection_duration(start.elapsed());
             return Ok(SequenceDecoderOutput::Stopped);
         }
 
         if self.config.visible_stop_tokens.contains(&token_id) {
             self.stopped = true;
-            TokenizerMetrics::record_stop_sequence_detected("visible_token");
 
             // Include jailed text plus the stop token
             let stop_text = self
@@ -127,7 +119,6 @@ impl StopSequenceDecoder {
                 .decode(&[token_id], self.skip_special_tokens)?;
             let output = format!("{}{}", self.jail_buffer, stop_text);
             self.jail_buffer.clear();
-            TokenizerMetrics::record_stop_detection_duration(start.elapsed());
             return Ok(SequenceDecoderOutput::StoppedWithText(output));
         }
 
@@ -172,12 +163,10 @@ impl StopSequenceDecoder {
         for stop_seq in &self.config.stop_sequences {
             if let Some(pos) = check_text.find(stop_seq) {
                 self.stopped = true;
-                TokenizerMetrics::record_stop_sequence_detected("string");
 
                 // Output text before the stop sequence
                 let output = check_text[..pos].to_string();
                 self.jail_buffer.clear();
-                TokenizerMetrics::record_stop_detection_duration(start.elapsed());
                 return Ok(if output.is_empty() {
                     SequenceDecoderOutput::Stopped
                 } else {
@@ -190,13 +179,11 @@ impl StopSequenceDecoder {
         for stop_seq in &self.config.visible_stop_sequences {
             if let Some(pos) = check_text.find(stop_seq) {
                 self.stopped = true;
-                TokenizerMetrics::record_stop_sequence_detected("visible_string");
 
                 // Include the stop sequence in output
                 let end_pos = pos + stop_seq.len();
                 let output = check_text[..end_pos].to_string();
                 self.jail_buffer.clear();
-                TokenizerMetrics::record_stop_detection_duration(start.elapsed());
                 return Ok(SequenceDecoderOutput::StoppedWithText(output));
             }
         }
@@ -219,8 +206,6 @@ impl StopSequenceDecoder {
         }
 
         if partial_match_len > 0 {
-            TokenizerMetrics::record_partial_match();
-
             // Split: output safe text, jail the potential match
             let safe_end = check_text.len() - partial_match_len;
             let safe_text = &check_text[..safe_end];
@@ -230,8 +215,6 @@ impl StopSequenceDecoder {
             self.prefix_offset = self.read_offset;
             self.read_offset = self.token_buffer.len();
 
-            TokenizerMetrics::record_stop_detection_duration(start.elapsed());
-
             if safe_text.is_empty() {
                 Ok(SequenceDecoderOutput::Held)
             } else {
@@ -245,8 +228,6 @@ impl StopSequenceDecoder {
             self.prefix_offset = self.read_offset;
             self.read_offset = self.token_buffer.len();
 
-            TokenizerMetrics::record_stop_detection_duration(start.elapsed());
-
             Ok(SequenceDecoderOutput::Text(check_text))
         }
     }
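`process_token` keeps its contract; only the metrics calls are gone. Each token yields `Text`, `Held` (text jailed pending a possible stop match), `Stopped`, or `StoppedWithText`. The consuming loop from the integration tests below illustrates it; a sketch, assuming `decoder` is a configured `StopSequenceDecoder` and `encoding` came from an earlier `encode`:

```rust
let mut output = String::new();
for token_id in encoding.token_ids() {
    // `token_ids()` yields `&TokenIdType`, hence the deref.
    match decoder.process_token(*token_id).unwrap() {
        SequenceDecoderOutput::Text(text) => output.push_str(&text),
        SequenceDecoderOutput::StoppedWithText(text) => {
            output.push_str(&text);
            break;
        }
        SequenceDecoderOutput::Stopped => break,
        SequenceDecoderOutput::Held => {} // nothing safe to emit yet
    }
}
```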
diff --git a/sgl-router/src/tokenizer/stream.rs b/sgl-router/src/tokenizer/stream.rs
index bea7ede8d..848be8a8c 100644
--- a/sgl-router/src/tokenizer/stream.rs
+++ b/sgl-router/src/tokenizer/stream.rs
@@ -1,10 +1,8 @@
 // src/tokenizer/stream.rs
 
 use super::traits::{self, TokenIdType};
-use crate::metrics::TokenizerMetrics;
 use anyhow::Result;
 use std::sync::Arc;
-use std::time::Instant;
 
 const INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET: usize = 5;
@@ -45,12 +43,8 @@ impl DecodeStream {
     /// Step appends a token_id to the internal state and tries to produce a text chunk.
     /// Returning `None` means the given id is not enough to produce a chunk.
     pub fn step(&mut self, id: TokenIdType) -> Result<Option<String>> {
-        let start = Instant::now();
-
         self.all_token_ids.push(id);
 
-        TokenizerMetrics::record_stream_token();
-
         let prefix_text = self.tokenizer.decode(
             &self.all_token_ids[self.prefix_offset..self.read_offset],
             self.skip_special_tokens,
@@ -67,16 +61,8 @@ impl DecodeStream {
             self.prefix_offset = self.read_offset;
             self.read_offset = self.all_token_ids.len();
 
-            TokenizerMetrics::record_stream_step_duration(start.elapsed());
-
             Ok(Some(new_text))
         } else {
-            if new_text.ends_with("�") {
-                TokenizerMetrics::record_incomplete_utf8();
-            }
-
-            TokenizerMetrics::record_stream_step_duration(start.elapsed());
-
             Ok(None)
         }
     }
diff --git a/sgl-router/src/tokenizer/tests.rs b/sgl-router/src/tokenizer/tests.rs
index 93c8f1621..2c4d4b108 100644
--- a/sgl-router/src/tokenizer/tests.rs
+++ b/sgl-router/src/tokenizer/tests.rs
@@ -129,9 +129,7 @@ fn test_thread_safety() {
         thread::spawn(move || {
             let text = "Hello test".to_string();
             let encoding = tokenizer_clone.encode(&text).unwrap();
-            let decoded = tokenizer_clone
-                .decode(&encoding.token_ids(), false)
-                .unwrap();
+            let decoded = tokenizer_clone.decode(encoding.token_ids(), false).unwrap();
             assert!(decoded.contains("Hello") || decoded.contains("test"));
             i
         })
diff --git a/sgl-router/src/tokenizer/tiktoken.rs b/sgl-router/src/tokenizer/tiktoken.rs
index 9ba49ec9a..0af5a9791 100644
--- a/sgl-router/src/tokenizer/tiktoken.rs
+++ b/sgl-router/src/tokenizer/tiktoken.rs
@@ -213,7 +213,7 @@ mod tests {
         let text = "Hello, world!";
 
         let encoding = tokenizer.encode(text).unwrap();
-        let decoded = tokenizer.decode(&encoding.token_ids(), false).unwrap();
+        let decoded = tokenizer.decode(encoding.token_ids(), false).unwrap();
 
         assert_eq!(decoded, text);
     }
@@ -226,7 +226,7 @@ mod tests {
         assert_eq!(encodings.len(), 3);
 
         for (i, encoding) in encodings.iter().enumerate() {
-            let decoded = tokenizer.decode(&encoding.token_ids(), false).unwrap();
+            let decoded = tokenizer.decode(encoding.token_ids(), false).unwrap();
             assert_eq!(decoded, texts[i]);
         }
     }
diff --git a/sgl-router/src/tokenizer/traits.rs b/sgl-router/src/tokenizer/traits.rs
index 5bf68c240..275dd822f 100644
--- a/sgl-router/src/tokenizer/traits.rs
+++ b/sgl-router/src/tokenizer/traits.rs
@@ -36,24 +36,21 @@ pub enum Encoding {
 }
 
 impl Encoding {
-    /// Returns a reference to token IDs when possible, owned Vec for compatibility
-    pub fn token_ids(&self) -> Vec<TokenIdType> {
-        match self {
-            Encoding::Hf(inner) => inner.get_ids().to_vec(),
-            Encoding::Sp(inner) => inner.clone(),
-            Encoding::Tiktoken(inner) => inner.clone(),
-        }
-    }
-
-    /// Returns a reference to token IDs where possible
-    pub fn token_ids_ref(&self) -> &[TokenIdType] {
+    /// Returns a reference to token IDs - zero-copy operation
+    pub fn token_ids(&self) -> &[TokenIdType] {
         match self {
             Encoding::Hf(inner) => inner.get_ids(),
             Encoding::Sp(inner) => inner,
-            Encoding::Tiktoken(inner) => inner, // Now works with tiktoken-rs 0.7.0!
+            Encoding::Tiktoken(inner) => inner,
         }
     }
 
+    /// Deprecated: Use token_ids() instead (kept for compatibility)
+    #[deprecated(since = "0.1.0", note = "Use token_ids() instead")]
+    pub fn token_ids_ref(&self) -> &[TokenIdType] {
+        self.token_ids()
+    }
+
     /// Get a hash of the token IDs for caching purposes
     pub fn get_hash(&self) -> u64 {
         let mut hasher = DefaultHasher::new();
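The allocating `token_ids()` is gone; the borrowing version takes its name, and `token_ids_ref()` survives only as a deprecated alias. Call sites drop the `&`, deref when iterating, and copy explicitly where ownership is genuinely needed. A migration sketch using only methods from this diff:

```rust
// Before: tokenizer.decode(&encoding.token_ids(), false), allocating a Vec per call.
// After: the slice borrows from the encoding, so decoding allocates nothing extra.
let decoded = tokenizer.decode(encoding.token_ids(), false).unwrap();

// Iterating a &[TokenIdType] yields &TokenIdType, hence the new derefs.
for token_id in encoding.token_ids() {
    let _id: TokenIdType = *token_id;
}

// An owned copy, only where a Vec is genuinely required.
let owned: Vec<TokenIdType> = encoding.token_ids().to_vec();
```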
diff --git a/sgl-router/tests/tokenizer_integration.rs b/sgl-router/tests/tokenizer_integration.rs
index f49828bb1..9f0597297 100644
--- a/sgl-router/tests/tokenizer_integration.rs
+++ b/sgl-router/tests/tokenizer_integration.rs
@@ -66,7 +66,7 @@ fn test_tokenizer_encode_decode_lifecycle() {
         let encoding = tokenizer.encode(prompt).expect("Failed to encode prompt");
 
         let decoded = tokenizer
-            .decode(&encoding.token_ids(), false)
+            .decode(encoding.token_ids(), false)
             .expect("Failed to decode token_ids");
 
         assert_eq!(decoded, *prompt, "Encode-decode mismatch for: {}", prompt);
@@ -101,7 +101,7 @@ fn test_sequence_operations() {
 
     for token_id in encoding.token_ids() {
         let text = decoder
-            .append_token(token_id)
+            .append_token(*token_id)
             .expect("Failed to append token");
         output.push_str(&text);
     }
@@ -131,7 +131,7 @@ fn test_decode_stream() {
     let mut output = String::new();
 
     for token_id in encoding.token_ids() {
-        if let Some(text) = decoder.step(token_id).expect("Failed to decode token") {
+        if let Some(text) = decoder.step(*token_id).expect("Failed to decode token") {
             output.push_str(&text);
         }
     }
@@ -157,11 +157,11 @@ fn test_long_sequence_incremental_decode_with_prefill() {
         .encode(output_text)
         .expect("Failed to encode output");
 
-    let mut decoder = DecodeStream::new(tokenizer.clone(), &input_encoding.token_ids(), false);
+    let mut decoder = DecodeStream::new(tokenizer.clone(), input_encoding.token_ids(), false);
 
     let mut output = String::new();
     for token_id in output_encoding.token_ids() {
-        if let Some(text) = decoder.step(token_id).expect("Failed to decode token") {
+        if let Some(text) = decoder.step(*token_id).expect("Failed to decode token") {
             output.push_str(&text);
         }
     }
@@ -199,7 +199,7 @@ fn test_stop_sequence_decoder() {
     let mut stopped = false;
 
     for token_id in encoding.token_ids() {
-        match decoder.process_token(token_id).unwrap() {
+        match decoder.process_token(*token_id).unwrap() {
            SequenceDecoderOutput::Text(text) => output.push_str(&text),
            SequenceDecoderOutput::StoppedWithText(text) => {
                output.push_str(&text);
@@ -245,7 +245,7 @@ fn test_factory_creation() {
     let encoding = tokenizer.encode(TEST_PROMPTS[0]).expect("Failed to encode");
     let decoded = tokenizer
-        .decode(&encoding.token_ids(), false)
+        .decode(encoding.token_ids(), false)
         .expect("Failed to decode");
 
     assert_eq!(decoded, TEST_PROMPTS[0]);
@@ -265,7 +265,7 @@ fn test_batch_encoding() {
     for (i, encoding) in encodings.iter().enumerate() {
         let decoded = tokenizer
-            .decode(&encoding.token_ids(), false)
+            .decode(encoding.token_ids(), false)
             .expect("Failed to decode");
         assert_eq!(decoded, TEST_PROMPTS[i]);
     }
@@ -307,7 +307,7 @@ fn test_thread_safety() {
             .encode(prompt)
             .expect("Failed to encode in thread");
         let decoded = tokenizer_clone
-            .decode(&encoding.token_ids(), false)
+            .decode(encoding.token_ids(), false)
            .expect("Failed to decode in thread");
         assert_eq!(decoded, prompt);
     })
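The updated integration tests double as migration examples for the streaming path. The core loop, extracted as a sketch under the same assumptions as `test_long_sequence_incremental_decode_with_prefill` (a tokenizer behind an `Arc`, plus `input_encoding` and `output_encoding` from earlier `encode` calls):

```rust
// `DecodeStream::new` now takes the prefill ids as a borrowed slice.
let mut decoder = DecodeStream::new(tokenizer.clone(), input_encoding.token_ids(), false);

let mut output = String::new();
for token_id in output_encoding.token_ids() {
    // `step` buffers ids until a complete UTF-8 chunk can be emitted.
    if let Some(text) = decoder.step(*token_id).expect("Failed to decode token") {
        output.push_str(&text);
    }
}
```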