All checks were successful
Docker Build and Push / docker (push) Successful in -1m50s
Signed-off-by: Sun Ruoxi <sunruoxi@4paradigm.com>
70 lines
1.7 KiB
Python
70 lines
1.7 KiB
Python
import os
|
|
import shutil
|
|
import json
|
|
from detect_tokenizer import detect
|
|
|
|
# Input and output locations come from the environment; the second argument
# of each call is the fallback used when the variable is not set.
MODEL_DIR = os.getenv("MODEL_DIR", "/model")
OUT_DIR = os.getenv("FIX_TOKENIZER_DIR", "/tmp/fixed_tokenizer")

# Create the output directory up front; an already existing one is accepted.
os.makedirs(OUT_DIR, exist_ok=True)
|
|
|
|
def copy_if_exists(name):
    """Copy ``name`` from ``MODEL_DIR`` into ``OUT_DIR`` if it is a regular file.

    Args:
        name: File name, relative to ``MODEL_DIR``.

    Returns:
        ``True`` when the file was copied, ``False`` when it was skipped
        because no regular file with that name exists in ``MODEL_DIR``.
    """
    src = os.path.join(MODEL_DIR, name)
    # isfile() rather than exists(): shutil.copy() raises when the source is
    # a directory, so anything that is not a regular file is skipped.
    if not os.path.isfile(src):
        return False
    shutil.copy(src, OUT_DIR)
    return True
|
|
|
|
# Copy every file that may be relevant to the tokenizer; names that are not
# present in the model directory are skipped by copy_if_exists().
TOKENIZER_FILES = (
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "vocab.json",
    "merges.txt",
    "tokenizer.model",
)
for filename in TOKENIZER_FILES:
    copy_if_exists(filename)
|
|
|
|
# detect() comes from detect_tokenizer; the code below compares `typ` against
# tokenizer kinds ("fast", "sentencepiece", "bpe") and `orig_cls` against
# tokenizer class names.
typ, orig_cls = detect(MODEL_DIR)

cfg_path = os.path.join(OUT_DIR, "tokenizer_config.json")

# Start from the copied config when there is one, otherwise from an empty dict.
if os.path.exists(cfg_path):
    # Read as UTF-8 explicitly: without `encoding`, open() uses the locale
    # encoding, which fails on non-ASCII tokens under a C/POSIX locale.
    with open(cfg_path, encoding="utf-8") as f:
        cfg = json.load(f)
else:
    cfg = {}
|
|
|
|
# ===== Auto-fix strategy =====
# Tokenizer class written into the config for each detected type; any type
# not listed here falls back to PreTrainedTokenizerFast.
CLASS_BY_TYPE = {
    "fast": "PreTrainedTokenizerFast",
    "sentencepiece": "LlamaTokenizer",
    "bpe": "GPT2TokenizerFast",
}
cfg["tokenizer_class"] = CLASS_BY_TYPE.get(typ, "PreTrainedTokenizerFast")
|
|
|
|
# Special-case handling: original class names that are treated as bad. The
# replacement class has already been stored in cfg; this only reports it.
bad_classes = ("TokenizersBackend", "TiktokenTokenizer")

if orig_cls in bad_classes:
    new_cls = cfg["tokenizer_class"]
    print(f"[fix] override bad tokenizer_class: {orig_cls} → {new_cls}")
|
|
|
|
# Fix extra_special_tokens: convert the list form into the dict form, mapping
# every token to itself. Any other value (or a missing key) is left alone.
extra_tokens = cfg.get("extra_special_tokens")
if isinstance(extra_tokens, list):
    cfg["extra_special_tokens"] = dict(zip(extra_tokens, extra_tokens))
    print(f"[fix] converted extra_special_tokens from list ({len(extra_tokens)} items) to dict format")
|
|
|
|
# Write the patched config back.
# Serialise before opening the file: open(..., "w") truncates immediately, so
# a failure while encoding would otherwise leave an empty or partial config.
payload = json.dumps(cfg)
with open(cfg_path, "w", encoding="utf-8") as f:
    f.write(payload)

print(f"[fix_tokenizer] done → {OUT_DIR}")
|