[router] Add rustfmt and set group imports by default (#11732)
This commit is contained in:
@@ -3,12 +3,16 @@
|
||||
//! This module provides functionality to apply chat templates to messages,
|
||||
//! similar to HuggingFace transformers' apply_chat_template method.
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use minijinja::machinery::ast::{Expr, Stmt};
|
||||
use minijinja::{context, Environment, Value};
|
||||
use serde_json;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use minijinja::{
|
||||
context,
|
||||
machinery::ast::{Expr, Stmt},
|
||||
Environment, Value,
|
||||
};
|
||||
use serde_json;
|
||||
|
||||
/// Chat template content format
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum ChatTemplateContentFormat {
|
||||
@@ -319,8 +323,10 @@ impl<'a> Detector<'a> {
|
||||
/// AST-based detection using minijinja's unstable machinery
|
||||
/// Single-pass detector with scope tracking
|
||||
fn detect_format_with_ast(template: &str) -> Option<ChatTemplateContentFormat> {
|
||||
use minijinja::machinery::{parse, WhitespaceConfig};
|
||||
use minijinja::syntax::SyntaxConfig;
|
||||
use minijinja::{
|
||||
machinery::{parse, WhitespaceConfig},
|
||||
syntax::SyntaxConfig,
|
||||
};
|
||||
|
||||
let ast = match parse(
|
||||
template,
|
||||
|
||||
@@ -1,13 +1,9 @@
|
||||
use super::traits;
|
||||
use std::{fs::File, io::Read, path::Path, sync::Arc};
|
||||
|
||||
use anyhow::{Error, Result};
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use super::huggingface::HuggingFaceTokenizer;
|
||||
use super::tiktoken::TiktokenTokenizer;
|
||||
use super::{huggingface::HuggingFaceTokenizer, tiktoken::TiktokenTokenizer, traits};
|
||||
use crate::tokenizer::hub::download_tokenizer_from_hf;
|
||||
|
||||
/// Represents the type of tokenizer being used
|
||||
@@ -379,8 +375,7 @@ pub fn get_tokenizer_info(file_path: &str) -> Result<TokenizerType> {
|
||||
Some("json") => Ok(TokenizerType::HuggingFace(file_path.to_string())),
|
||||
_ => {
|
||||
// Try auto-detection
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::{fs::File, io::Read};
|
||||
|
||||
let mut file = File::open(file_path)?;
|
||||
let mut buffer = vec![0u8; 512];
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
use std::{
|
||||
env,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use hf_hub::api::tokio::ApiBuilder;
|
||||
use std::env;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
const IGNORED: [&str; 5] = [
|
||||
".gitattributes",
|
||||
|
||||
@@ -3,12 +3,12 @@ use std::collections::HashMap;
|
||||
use anyhow::{Error, Result};
|
||||
use tokenizers::tokenizer::Tokenizer as HfTokenizer;
|
||||
|
||||
use super::chat_template::{
|
||||
detect_chat_template_content_format, ChatTemplateContentFormat, ChatTemplateParams,
|
||||
ChatTemplateProcessor,
|
||||
};
|
||||
use super::traits::{
|
||||
Decoder, Encoder, Encoding, SpecialTokens, TokenIdType, Tokenizer as TokenizerTrait,
|
||||
use super::{
|
||||
chat_template::{
|
||||
detect_chat_template_content_format, ChatTemplateContentFormat, ChatTemplateParams,
|
||||
ChatTemplateProcessor,
|
||||
},
|
||||
traits::{Decoder, Encoder, Encoding, SpecialTokens, TokenIdType, Tokenizer as TokenizerTrait},
|
||||
};
|
||||
|
||||
/// HuggingFace tokenizer wrapper
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
//! Mock tokenizer implementation for testing
|
||||
|
||||
use super::traits::{Decoder, Encoder, Encoding, SpecialTokens, Tokenizer as TokenizerTrait};
|
||||
use anyhow::Result;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
use super::traits::{Decoder, Encoder, Encoding, SpecialTokens, Tokenizer as TokenizerTrait};
|
||||
|
||||
/// Mock tokenizer for testing purposes
|
||||
pub struct MockTokenizer {
|
||||
vocab: HashMap<String, u32>,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::{ops::Deref, sync::Arc};
|
||||
|
||||
use anyhow::Result;
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub mod factory;
|
||||
pub mod hub;
|
||||
@@ -27,14 +27,12 @@ pub use factory::{
|
||||
create_tokenizer_from_file, create_tokenizer_with_chat_template,
|
||||
create_tokenizer_with_chat_template_blocking, TokenizerType,
|
||||
};
|
||||
pub use huggingface::HuggingFaceTokenizer;
|
||||
pub use sequence::Sequence;
|
||||
pub use stop::{SequenceDecoderOutput, StopSequenceConfig, StopSequenceDecoder};
|
||||
pub use stream::DecodeStream;
|
||||
pub use traits::{Decoder, Encoder, Encoding, SpecialTokens, Tokenizer as TokenizerTrait};
|
||||
|
||||
pub use huggingface::HuggingFaceTokenizer;
|
||||
|
||||
pub use tiktoken::{TiktokenModel, TiktokenTokenizer};
|
||||
pub use traits::{Decoder, Encoder, Encoding, SpecialTokens, Tokenizer as TokenizerTrait};
|
||||
|
||||
/// Main tokenizer wrapper that provides a unified interface for different tokenizer implementations
|
||||
#[derive(Clone)]
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use super::traits::{TokenIdType, Tokenizer as TokenizerTrait};
|
||||
use anyhow::Result;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
use super::traits::{TokenIdType, Tokenizer as TokenizerTrait};
|
||||
|
||||
/// Maintains state for an ongoing sequence of tokens and their decoded text
|
||||
/// This provides a cleaner abstraction for managing token sequences
|
||||
pub struct Sequence {
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
use super::sequence::Sequence;
|
||||
use super::traits::{self, TokenIdType};
|
||||
use std::{collections::HashSet, sync::Arc};
|
||||
|
||||
use anyhow::Result;
|
||||
use std::collections::HashSet;
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::{
|
||||
sequence::Sequence,
|
||||
traits::{self, TokenIdType},
|
||||
};
|
||||
|
||||
/// Output from the sequence decoder
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
// src/tokenizer/stream.rs
|
||||
|
||||
use super::traits::{self, TokenIdType};
|
||||
use anyhow::Result;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
use super::traits::{self, TokenIdType};
|
||||
|
||||
const INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET: usize = 5;
|
||||
|
||||
/// DecodeStream will keep the state necessary to produce individual chunks of
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
#[cfg(test)]
|
||||
use super::*;
|
||||
#[cfg(test)]
|
||||
use std::sync::Arc;
|
||||
|
||||
#[cfg(test)]
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_mock_tokenizer_encode() {
|
||||
let tokenizer = mock::MockTokenizer::new();
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use anyhow::{Error, Result};
|
||||
use tiktoken_rs::{cl100k_base, p50k_base, p50k_edit, r50k_base, CoreBPE};
|
||||
|
||||
use super::traits::{
|
||||
Decoder, Encoder, Encoding, SpecialTokens, TokenIdType, Tokenizer as TokenizerTrait,
|
||||
};
|
||||
use anyhow::{Error, Result};
|
||||
use tiktoken_rs::{cl100k_base, p50k_base, p50k_edit, r50k_base, CoreBPE};
|
||||
|
||||
/// Tiktoken tokenizer wrapper for OpenAI GPT models
|
||||
pub struct TiktokenTokenizer {
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
use std::{
|
||||
collections::hash_map::DefaultHasher,
|
||||
hash::{Hash, Hasher},
|
||||
};
|
||||
|
||||
use anyhow::Result;
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
/// Type alias for token IDs
|
||||
pub type TokenIdType = u32;
|
||||
|
||||
Reference in New Issue
Block a user