first commit

This commit is contained in:
2026-05-28 10:56:17 +08:00
commit b0b0248cee
5 changed files with 308 additions and 0 deletions

25
detect_tokenizer.py Normal file
View File

@@ -0,0 +1,25 @@
import os
import json
def detect(model_dir):
    """Identify which tokenizer format is stored in a model directory.

    The declared tokenizer class is read from ``tokenizer_config.json``
    (key ``tokenizer_class``) when that file exists. The format itself is
    inferred from which well-known tokenizer files are present directly
    inside ``model_dir``.

    Args:
        model_dir: Path of the directory to inspect.

    Returns:
        A ``(kind, cls)`` tuple. ``kind`` is, in priority order:
        ``"fast"`` when ``tokenizer.json`` is present, ``"sentencepiece"``
        when ``tokenizer.model`` is present, ``"bpe"`` when both
        ``vocab.json`` and ``merges.txt`` are present, otherwise
        ``"unknown"``. ``cls`` is the configured ``tokenizer_class`` value,
        or ``""`` when the config file or the key is missing, or the value
        is null.

    Raises:
        OSError: If ``model_dir`` cannot be listed or the config file
            cannot be opened.
        json.JSONDecodeError: If ``tokenizer_config.json`` is not valid JSON.
    """
    cfg_path = os.path.join(model_dir, "tokenizer_config.json")
    cls = ""
    if os.path.exists(cfg_path):
        # Decode explicitly as UTF-8: relying on the platform locale
        # (e.g. GBK or cp1252 on Windows) breaks on configs that contain
        # non-ASCII characters such as special tokens.
        with open(cfg_path, encoding="utf-8") as f:
            cfg = json.load(f)
        if isinstance(cfg, dict):
            # An explicit JSON null is normalised to "" so that callers
            # always receive a string for the class name.
            cls = cfg.get("tokenizer_class") or ""

    # Build the set once; several membership tests follow.
    files = set(os.listdir(model_dir))
    if "tokenizer.json" in files:
        return "fast", cls
    if "tokenizer.model" in files:
        return "sentencepiece", cls
    if "vocab.json" in files and "merges.txt" in files:
        return "bpe", cls
    return "unknown", cls