"""Detect which tokenizer file layout a model directory uses."""

import json
import os


def detect(model_dir):
    """Classify the tokenizer files present in *model_dir*.

    The optional ``tokenizer_config.json`` is read for its
    ``tokenizer_class`` entry, then the directory listing is checked for
    known tokenizer files in priority order.

    Args:
        model_dir: Path of the directory to inspect.

    Returns:
        A ``(kind, tokenizer_class)`` tuple. ``kind`` is one of:

        * ``"fast"`` - ``tokenizer.json`` is present;
        * ``"sentencepiece"`` - ``tokenizer.model`` is present;
        * ``"bpe"`` - both ``vocab.json`` and ``merges.txt`` are present;
        * ``"unknown"`` - none of the above.

        ``tokenizer_class`` is the value stored in the config file, or
        ``""`` when the config file is absent, is not a JSON object, or
        has no (or a null/empty) ``tokenizer_class`` entry.

    Raises:
        OSError: If *model_dir* cannot be listed (missing, not a
            directory, permission denied) or the config cannot be read.
        json.JSONDecodeError: If ``tokenizer_config.json`` is not valid JSON.
    """
    cfg_path = os.path.join(model_dir, "tokenizer_config.json")
    cls = ""

    # isfile (not exists): a directory with that name is treated as
    # "no config" instead of failing inside open().
    if os.path.isfile(cfg_path):
        # JSON text is UTF-8; do not depend on the locale's default
        # encoding, which may be something else (e.g. GBK or cp1252).
        with open(cfg_path, encoding="utf-8") as f:
            cfg = json.load(f)
        # Only a JSON object can carry the key; a null value is
        # normalised to "" so callers always get the same "absent" value.
        if isinstance(cfg, dict):
            cls = cfg.get("tokenizer_class") or ""

    files = set(os.listdir(model_dir))

    # Order matters: a directory may contain several layouts at once.
    if "tokenizer.json" in files:
        return "fast", cls

    if "tokenizer.model" in files:
        return "sentencepiece", cls

    if {"vocab.json", "merges.txt"} <= files:
        return "bpe", cls

    return "unknown", cls