#!/usr/bin/env python3
"""
检测 tokenizer_config.json 中的 tokenizer_class 是否在 transformers 中存在。
若不存在（如 TokenizersBackend），则将 tokenizer 文件复制到 /tmp/fixed_tokenizer/
并修复 tokenizer_class，最后将修复目录路径输出到 stdout。
若无需修复，输出为空。
"""
import os
import sys
import json
import shutil

# Model directory: first CLI argument, else $MODEL_DIR, else "/model".
if len(sys.argv) > 1:
    MODEL_DIR = sys.argv[1]
else:
    MODEL_DIR = os.environ.get("MODEL_DIR", "/model")

# Directory that receives the repaired copy of the tokenizer files.
OUT_DIR = "/tmp/fixed_tokenizer"


def _infer_tokenizer_class(files):
    """Return a tokenizer class name chosen from the file names in *files*.

    ``tokenizer.json`` takes priority, then ``tokenizer.model``, then the
    ``vocab.json`` + ``merges.txt`` pair; ``PreTrainedTokenizerFast`` is the
    fallback when none of them is present.
    """
    if "tokenizer.json" in files:
        return "PreTrainedTokenizerFast"
    if "tokenizer.model" in files:
        return "LlamaTokenizer"
    if "vocab.json" in files and "merges.txt" in files:
        return "GPT2TokenizerFast"
    return "PreTrainedTokenizerFast"


def main(model_dir=None, out_dir=None):
    """Repair an unknown ``tokenizer_class`` in ``tokenizer_config.json``.

    Does nothing when the config file is missing, when it has no (or an empty)
    ``tokenizer_class``, or when transformers exposes an attribute with that
    name. Otherwise the known tokenizer files are copied into *out_dir*, the
    copied config gets a replacement class inferred from the files present in
    *model_dir*, and the *out_dir* path is printed to stdout.

    Args:
        model_dir: Directory holding the tokenizer files. Defaults to the
            module-level ``MODEL_DIR``.
        out_dir: Directory that receives the repaired copy. Defaults to the
            module-level ``OUT_DIR``.

    Returns:
        None. The result is communicated through stdout only.
    """
    model_dir = MODEL_DIR if model_dir is None else model_dir
    out_dir = OUT_DIR if out_dir is None else out_dir

    cfg_path = os.path.join(model_dir, "tokenizer_config.json")
    if not os.path.exists(cfg_path):
        return

    # The config may contain non-ASCII special tokens; read it as UTF-8
    # explicitly instead of relying on the locale's default encoding.
    with open(cfg_path, encoding="utf-8") as f:
        cfg = json.load(f)

    tokenizer_class = cfg.get("tokenizer_class", "")
    if not tokenizer_class:
        return

    # Ask transformers itself whether the class is available rather than
    # hard-coding a list of class names. Imported here so the early-return
    # paths above do not pay for the import.
    import transformers
    if getattr(transformers, tokenizer_class, None) is not None:
        return  # Class exists, nothing to fix.

    # The class is unknown to transformers: infer a replacement from the
    # files that are actually present.
    fixed_class = _infer_tokenizer_class(os.listdir(model_dir))

    print(
        f"[fix_tokenizer] tokenizer_class '{tokenizer_class}' not found in transformers, "
        f"replacing with '{fixed_class}'",
        file=sys.stderr,
    )

    os.makedirs(out_dir, exist_ok=True)
    for fname in (
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "vocab.json",
        "merges.txt",
        "tokenizer.model",
    ):
        src = os.path.join(model_dir, fname)
        dst = os.path.join(out_dir, fname)
        if os.path.exists(src):
            shutil.copy(src, dst)
        elif os.path.exists(dst):
            # Drop a stale copy left by an earlier run so it cannot be mixed
            # with the current model's files.
            os.remove(dst)

    cfg["tokenizer_class"] = fixed_class
    # json.dump keeps its default ASCII escaping, so the output is valid in
    # any encoding; UTF-8 is still named explicitly for symmetry with the read.
    with open(os.path.join(out_dir, "tokenizer_config.json"), "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=2)

    print(out_dir)  # Emit the repaired directory for entrypoint.sh to capture.


# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()