[router] add tokenizer download support from hf hub (#9882)

This commit is contained in:
Chang Su
2025-09-01 10:40:37 -07:00
committed by GitHub
parent b361750a4a
commit 598c0bc19d
9 changed files with 407 additions and 138 deletions

View File

@@ -5,7 +5,6 @@ use anyhow::{Error, Result};
use std::collections::HashMap;
use tokenizers::tokenizer::Tokenizer as HfTokenizer;
#[cfg(feature = "minijinja")]
use super::chat_template::{ChatMessage, ChatTemplateProcessor};
/// HuggingFace tokenizer wrapper
@@ -14,7 +13,6 @@ pub struct HuggingFaceTokenizer {
special_tokens: SpecialTokens,
vocab: HashMap<String, TokenIdType>,
reverse_vocab: HashMap<TokenIdType, String>,
#[cfg(feature = "minijinja")]
chat_template: Option<String>,
}
@@ -43,7 +41,6 @@ impl HuggingFaceTokenizer {
.collect();
// Load chat template
#[cfg(feature = "minijinja")]
let chat_template = if let Some(template_path) = chat_template_path {
// Load from specified .jinja file
Self::load_chat_template_from_file(template_path)?
@@ -57,7 +54,6 @@ impl HuggingFaceTokenizer {
special_tokens,
vocab,
reverse_vocab,
#[cfg(feature = "minijinja")]
chat_template,
})
}
@@ -76,7 +72,6 @@ impl HuggingFaceTokenizer {
special_tokens,
vocab,
reverse_vocab,
#[cfg(feature = "minijinja")]
chat_template: None,
}
}
@@ -109,7 +104,6 @@ impl HuggingFaceTokenizer {
}
/// Try to load chat template from tokenizer_config.json
#[cfg(feature = "minijinja")]
fn load_chat_template(tokenizer_path: &str) -> Option<String> {
// Try to find tokenizer_config.json in the same directory
let path = std::path::Path::new(tokenizer_path);
@@ -127,7 +121,6 @@ impl HuggingFaceTokenizer {
}
/// Load chat template from a .jinja file
#[cfg(feature = "minijinja")]
fn load_chat_template_from_file(template_path: &str) -> Result<Option<String>> {
use std::fs;
@@ -141,13 +134,11 @@ impl HuggingFaceTokenizer {
}
/// Store `template` as this tokenizer's chat template.
///
/// Any template that was previously held is replaced.
#[cfg(feature = "minijinja")]
pub fn set_chat_template(&mut self, template: String) {
    self.chat_template = template.into();
}
/// Apply chat template if available
#[cfg(feature = "minijinja")]
pub fn apply_chat_template(
&self,
messages: &[ChatMessage],
@@ -172,24 +163,6 @@ impl HuggingFaceTokenizer {
Ok(result)
}
}
/// Render `messages` as plain text when the `minijinja` feature is disabled.
///
/// Every message contributes one `role: content` line (terminated by `\n`).
/// When `add_generation_prompt` is `true`, a trailing `assistant: ` cue is
/// appended after the last line. This fallback path always returns `Ok`.
#[cfg(not(feature = "minijinja"))]
pub fn apply_chat_template(
    &self,
    messages: &[ChatMessage],
    add_generation_prompt: bool,
) -> Result<String> {
    // No template engine in this build: concatenate one formatted line per message.
    let mut rendered: String = messages
        .iter()
        .map(|message| format!("{}: {}\n", message.role, message.content))
        .collect();

    if add_generation_prompt {
        rendered.push_str("assistant: ");
    }

    Ok(rendered)
}
}
impl Encoder for HuggingFaceTokenizer {
@@ -241,10 +214,8 @@ impl TokenizerTrait for HuggingFaceTokenizer {
#[cfg(test)]
mod tests {
#[cfg(feature = "minijinja")]
use super::ChatMessage;
#[cfg(feature = "minijinja")]
#[test]
fn test_chat_message_creation() {
let msg = ChatMessage::system("You are a helpful assistant");