# enginex-S2-vllm-fix-tokenizer/fix_tokenizer.py

import os
import shutil
import json
from detect_tokenizer import detect

# Source model directory and destination directory for the patched tokenizer
# files; each can be overridden through its environment variable.
MODEL_DIR = os.getenv("MODEL_DIR", "/model")
OUT_DIR = os.getenv("FIX_TOKENIZER_DIR", "/tmp/fixed_tokenizer")

# Create the output directory up front; an already existing one is accepted.
os.makedirs(OUT_DIR, exist_ok=True)

def copy_if_exists(name, src_dir=None, dst_dir=None):
    """Copy the file ``name`` from the source directory to the output directory.

    Args:
        name: File name, relative to the source directory.
        src_dir: Directory to copy from; defaults to ``MODEL_DIR``.
        dst_dir: Directory to copy into; defaults to ``OUT_DIR``.

    Returns:
        True if a regular file was found and copied, False if nothing was
        copied.
    """
    if src_dir is None:
        src_dir = MODEL_DIR
    if dst_dir is None:
        dst_dir = OUT_DIR

    src = os.path.join(src_dir, name)
    # isfile (not exists): a directory with this name would make shutil.copy
    # raise, so anything that is not a regular file is skipped.
    if not os.path.isfile(src):
        return False
    shutil.copy(src, dst_dir)
    return True

# Copy every file that may be relevant to the tokenizer; names that are not
# present in the model directory are skipped by copy_if_exists.
_CANDIDATE_FILES = (
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "vocab.json",
    "merges.txt",
    "tokenizer.model",
)
for filename in _CANDIDATE_FILES:
    copy_if_exists(filename)

# detect() yields a (type, original class) pair for the model directory.
# NOTE(review): presumably orig_cls is the tokenizer_class recorded in the
# model's own config — confirm against detect_tokenizer.
typ, orig_cls = detect(MODEL_DIR)

cfg_path = os.path.join(OUT_DIR, "tokenizer_config.json")

# Start from the copied config when there is one, otherwise from an empty
# dict. JSON is decoded explicitly as UTF-8 so that non-ASCII special tokens
# load correctly regardless of the process locale.
try:
    with open(cfg_path, encoding="utf-8") as f:
        cfg = json.load(f)
except FileNotFoundError:
    cfg = {}

# ===== Automatic fix strategy =====
# Ordered (detected type, tokenizer class) pairs; the first matching type wins
# and any unrecognised type falls back to PreTrainedTokenizerFast.
_TOKENIZER_CLASS_BY_TYPE = (
    ("fast", "PreTrainedTokenizerFast"),
    ("sentencepiece", "LlamaTokenizer"),
    ("bpe", "GPT2TokenizerFast"),
)

cfg["tokenizer_class"] = next(
    (cls_name for kind, cls_name in _TOKENIZER_CLASS_BY_TYPE if typ == kind),
    "PreTrainedTokenizerFast",
)

# Special-case handling: original tokenizer classes listed here are treated as
# bad. This step only reports the replacement; tokenizer_class itself has
# already been set unconditionally before this point.
bad_classes = ["TokenizersBackend", "TiktokenTokenizer"]

was_bad_class = orig_cls in bad_classes
if was_bad_class:
    print(f"[fix] override bad tokenizer_class: {orig_cls} → {cfg['tokenizer_class']}")

# Fix extra_special_tokens: convert the list form into a dict that maps each
# token to itself. Any other value (or a missing key) is left untouched.
orig_list = cfg.get("extra_special_tokens")
if isinstance(orig_list, list):
    cfg["extra_special_tokens"] = dict(zip(orig_list, orig_list))
    print(f"[fix] converted extra_special_tokens from list ({len(orig_list)} items) to dict format")

# Write the updated config back to the output directory, replacing the copy.
with open(cfg_path, "w") as out_file:
    out_file.write(json.dumps(cfg))

print(f"[fix_tokenizer] done → {OUT_DIR}")