25 lines
560 B
Python
25 lines
560 B
Python
import os
|
|
import json
|
|
|
|
def detect(model_dir):
    """Identify which tokenizer format is stored in ``model_dir``.

    The optional ``tokenizer_config.json`` is read for its
    ``tokenizer_class`` entry, and the directory listing is checked for
    well-known tokenizer file names.

    Args:
        model_dir: Path of the directory to inspect.

    Returns:
        A ``(kind, cls)`` tuple. ``kind`` is the first match of:

        * ``"fast"`` -- ``tokenizer.json`` is present;
        * ``"sentencepiece"`` -- ``tokenizer.model`` is present;
        * ``"bpe"`` -- both ``vocab.json`` and ``merges.txt`` are present;
        * ``"unknown"`` -- none of the above.

        ``cls`` is the ``tokenizer_class`` value from the config, or ``""``
        when the config file or that key is missing.

    Raises:
        OSError: If ``model_dir`` cannot be listed or the config file exists
            but cannot be opened.
        ValueError: If the config file is not valid UTF-8 encoded JSON
            (this includes ``json.JSONDecodeError``).
    """
    cfg_path = os.path.join(model_dir, "tokenizer_config.json")

    # Open directly instead of checking existence first: this avoids a
    # check-then-use race. JSON is read as UTF-8 explicitly so the result
    # does not depend on the platform's locale encoding.
    try:
        with open(cfg_path, encoding="utf-8") as f:
            cfg = json.load(f)
    except FileNotFoundError:
        cls = ""
    else:
        cls = cfg.get("tokenizer_class", "")

    files = set(os.listdir(model_dir))

    # Order matters: a directory may ship several formats at once, and the
    # first match wins.
    if "tokenizer.json" in files:
        return "fast", cls

    if "tokenizer.model" in files:
        return "sentencepiece", cls

    if "vocab.json" in files and "merges.txt" in files:
        return "bpe", cls

    return "unknown", cls