[router] tokenizer factory, hf tokenizer, and stop sequence detector (#9293)

Co-authored-by: Chang Su <chang.s.su@oracle.com>
This commit is contained in:
Simo Lin
2025-08-17 22:38:38 -07:00
committed by GitHub
parent 716e682721
commit d08663eec1
5 changed files with 935 additions and 5 deletions

View File

@@ -2,26 +2,36 @@ use anyhow::Result;
use std::ops::Deref;
use std::sync::Arc;
pub mod factory;
pub mod mock;
pub mod stop;
pub mod stream;
pub mod traits;
// Feature-gated modules
#[cfg(feature = "huggingface")]
pub mod huggingface;
#[cfg(test)]
mod tests;
// Re-exports
pub use factory::{create_tokenizer, create_tokenizer_from_file, TokenizerType};
pub use stop::{SequenceDecoderOutput, StopSequenceConfig, StopSequenceDecoder};
pub use stream::DecodeStream;
pub use traits::{Decoder, Encoder, Encoding, SpecialTokens, Tokenizer as TokenizerTrait};
#[cfg(feature = "huggingface")]
pub use huggingface::{ChatMessage, HuggingFaceTokenizer};
/// Main tokenizer wrapper that provides a unified interface for different tokenizer implementations.
///
/// The wrapped value is a shared `Arc<dyn traits::Tokenizer>` trait object, so the
/// derived `Clone` only copies the `Arc` handle rather than the underlying tokenizer.
#[derive(Clone)]
pub struct Tokenizer(Arc<dyn traits::Tokenizer>);
impl Tokenizer {
/// Create a tokenizer from a file path
/// Will be implemented in Phase 3 with factory pattern
pub fn from_file(_file_path: &str) -> Result<Tokenizer> {
// TODO: Implement factory pattern in Phase 3
unimplemented!("Factory pattern will be implemented in Phase 3")
/// Build a [`Tokenizer`] from the tokenizer file at `file_path`.
///
/// Delegates to `factory::create_tokenizer_from_file` and wraps the
/// implementation it returns.
///
/// # Errors
///
/// Propagates any error returned by `factory::create_tokenizer_from_file`.
pub fn from_file(file_path: &str) -> Result<Tokenizer> {
    let inner = factory::create_tokenizer_from_file(file_path)?;
    Ok(Tokenizer(inner))
}
/// Create a tokenizer from an Arc<dyn Tokenizer>