import os
import json


def detect(model_dir):
    """Classify the tokenizer files found in a model directory.

    The directory listing decides the tokenizer kind; the optional
    ``tokenizer_config.json`` only supplies the declared class name.

    Args:
        model_dir: Path of the directory to inspect.

    Returns:
        A ``(kind, tokenizer_class)`` tuple. ``kind`` is the first match of:

        * ``"fast"`` -- ``tokenizer.json`` is present;
        * ``"sentencepiece"`` -- ``tokenizer.model`` is present;
        * ``"bpe"`` -- both ``vocab.json`` and ``merges.txt`` are present;
        * ``"unknown"`` -- none of the above.

        ``tokenizer_class`` is the ``"tokenizer_class"`` value from
        ``tokenizer_config.json``, or ``""`` when the file is absent, the
        key is missing or null, or the top-level JSON value is not an object.

    Raises:
        OSError: If ``model_dir`` cannot be listed or the config file
            cannot be read.
        json.JSONDecodeError: If ``tokenizer_config.json`` is not valid JSON.
        UnicodeDecodeError: If ``tokenizer_config.json`` is not UTF-8.
    """
    cfg_path = os.path.join(model_dir, "tokenizer_config.json")

    cls = ""
    # isfile (not exists) so a directory with this name is treated as "no config".
    if os.path.isfile(cfg_path):
        # JSON is UTF-8 by specification; do not rely on the platform locale
        # encoding, which breaks on non-ASCII content (e.g. Windows cp936).
        with open(cfg_path, encoding="utf-8") as f:
            cfg = json.load(f)
        if isinstance(cfg, dict):
            # An explicit `null` value is normalized to the same "" default
            # that a missing key yields, so callers always get a usable value.
            cls = cfg.get("tokenizer_class") or ""

    files = set(os.listdir(model_dir))

    if "tokenizer.json" in files:
        return "fast", cls

    if "tokenizer.model" in files:
        return "sentencepiece", cls

    if "vocab.json" in files and "merges.txt" in files:
        return "bpe", cls

    return "unknown", cls