diff --git a/Dockerfile b/Dockerfile index f7c26ae..d544b37 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,6 @@ FROM harbor-contest.4pd.io/sunjichen/xc-llm-kunlun:latest COPY entrypoint.sh /opt/entrypoint.sh COPY fix_tokenizer.py /opt/fix_tokenizer.py -COPY detect_tokenizer.py /opt/detect_tokenizer.py RUN chmod +x /opt/entrypoint.sh diff --git a/detect_tokenizer.py b/detect_tokenizer.py deleted file mode 100644 index c0e7b3e..0000000 --- a/detect_tokenizer.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import json - -def detect(model_dir): - cfg_path = os.path.join(model_dir, "tokenizer_config.json") - - if os.path.exists(cfg_path): - with open(cfg_path) as f: - cfg = json.load(f) - cls = cfg.get("tokenizer_class", "") - else: - cls = "" - - files = os.listdir(model_dir) - - if "tokenizer.json" in files: - return "fast", cls - - if "tokenizer.model" in files: - return "sentencepiece", cls - - if "vocab.json" in files and "merges.txt" in files: - return "bpe", cls - - return "unknown", cls diff --git a/entrypoint.sh b/entrypoint.sh index 07308d3..41c9f9e 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -4,36 +4,11 @@ set -e MODEL_DIR=${1:-/model} shift || true -FIX_TOKENIZER_DIR=/tmp/fixed_tokenizer -AUTO_FIX=${AUTO_FIX_TOKENIZER:-auto} - -echo "[entrypoint] model dir: $MODEL_DIR" - -NEED_FIX=0 - -if [ "$AUTO_FIX" = "1" ] || [ "$AUTO_FIX" = "true" ]; then - NEED_FIX=1 -elif [ "$AUTO_FIX" = "auto" ]; then - if [ -f "$MODEL_DIR/tokenizer_config.json" ]; then - if grep -q "TokenizersBackend\|TiktokenTokenizer" "$MODEL_DIR/tokenizer_config.json"; then - NEED_FIX=1 - fi - # 检测 extra_special_tokens 是否为 list 格式 - if grep -q '"extra_special_tokens":\s*\[' "$MODEL_DIR/tokenizer_config.json"; then - NEED_FIX=1 - fi - fi -fi - -if [ $NEED_FIX -eq 1 ]; then - echo "[entrypoint] fixing tokenizer..." - python3 /opt/fix_tokenizer.py - TOKENIZER_ARG="--tokenizer $FIX_TOKENIZER_DIR" +FIXED_DIR=$(python3 /opt/fix_tokenizer.py "$MODEL_DIR") +if [ -n "$FIXED_DIR" ]; then + TOKENIZER_ARG="--tokenizer $FIXED_DIR" else - echo "[entrypoint] tokenizer OK, skip fix" TOKENIZER_ARG="" fi -echo "[entrypoint] starting vllm..." - exec vllm serve "$MODEL_DIR" $TOKENIZER_ARG "$@" diff --git a/fix_tokenizer.py b/fix_tokenizer.py index 9556ea6..651eb84 100644 --- a/fix_tokenizer.py +++ b/fix_tokenizer.py @@ -1,69 +1,71 @@ +#!/usr/bin/env python3 +""" +检测 tokenizer_config.json 中的 tokenizer_class 是否在 transformers 中存在。 +若不存在(如 TokenizersBackend),则将 tokenizer 文件复制到 /tmp/fixed_tokenizer/ +并修复 tokenizer_class,最后将修复目录路径输出到 stdout。 +若无需修复,输出为空。 +""" import os -import shutil +import sys import json -from detect_tokenizer import detect +import shutil -MODEL_DIR = os.environ.get("MODEL_DIR", "/model") -OUT_DIR = os.environ.get("FIX_TOKENIZER_DIR", "/tmp/fixed_tokenizer") +MODEL_DIR = sys.argv[1] if len(sys.argv) > 1 else os.environ.get("MODEL_DIR", "/model") +OUT_DIR = "/tmp/fixed_tokenizer" -os.makedirs(OUT_DIR, exist_ok=True) -def copy_if_exists(name): - src = os.path.join(MODEL_DIR, name) - if os.path.exists(src): - shutil.copy(src, OUT_DIR) +def main(): + cfg_path = os.path.join(MODEL_DIR, "tokenizer_config.json") + if not os.path.exists(cfg_path): + return -# 复制所有可能相关文件 -for f in [ - "tokenizer.json", - "tokenizer_config.json", - "special_tokens_map.json", - "vocab.json", - "merges.txt", - "tokenizer.model", -]: - copy_if_exists(f) - -typ, orig_cls = detect(MODEL_DIR) - -cfg_path = os.path.join(OUT_DIR, "tokenizer_config.json") - -if os.path.exists(cfg_path): with open(cfg_path) as f: cfg = json.load(f) -else: - cfg = {} -# ===== 自动修复策略 ===== -if typ == "fast": - cfg["tokenizer_class"] = "PreTrainedTokenizerFast" + tokenizer_class = cfg.get("tokenizer_class", "") + if not tokenizer_class: + return -elif typ == "sentencepiece": - cfg["tokenizer_class"] = "LlamaTokenizer" + # 用 transformers 自身判断该类是否可用,不硬编码类名 + import transformers + if getattr(transformers, tokenizer_class, None) is not None: + return # 类存在,无需修复 -elif typ == "bpe": - cfg["tokenizer_class"] = "GPT2TokenizerFast" + # tokenizer_class 在 transformers 中不存在,根据实际文件推断正确的类 + files = os.listdir(MODEL_DIR) + if "tokenizer.json" in files: + fixed_class = "PreTrainedTokenizerFast" + elif "tokenizer.model" in files: + fixed_class = "LlamaTokenizer" + elif "vocab.json" in files and "merges.txt" in files: + fixed_class = "GPT2TokenizerFast" + else: + fixed_class = "PreTrainedTokenizerFast" -else: - cfg["tokenizer_class"] = "PreTrainedTokenizerFast" + print( + f"[fix_tokenizer] tokenizer_class '{tokenizer_class}' not found in transformers, " + f"replacing with '{fixed_class}'", + file=sys.stderr, + ) -# 特殊 case 修复 -bad_classes = [ - "TokenizersBackend", - "TiktokenTokenizer", -] + os.makedirs(OUT_DIR, exist_ok=True) + for fname in [ + "tokenizer.json", + "tokenizer_config.json", + "special_tokens_map.json", + "vocab.json", + "merges.txt", + "tokenizer.model", + ]: + src = os.path.join(MODEL_DIR, fname) + if os.path.exists(src): + shutil.copy(src, OUT_DIR) -if orig_cls in bad_classes: - print(f"[fix] override bad tokenizer_class: {orig_cls} → {cfg['tokenizer_class']}") + cfg["tokenizer_class"] = fixed_class + with open(os.path.join(OUT_DIR, "tokenizer_config.json"), "w") as f: + json.dump(cfg, f, indent=2) -# 修复 extra_special_tokens: list → dict 格式 -if "extra_special_tokens" in cfg and isinstance(cfg["extra_special_tokens"], list): - orig_list = cfg["extra_special_tokens"] - cfg["extra_special_tokens"] = {token: token for token in orig_list} - print(f"[fix] converted extra_special_tokens from list ({len(orig_list)} items) to dict format") + print(OUT_DIR) # 输出修复目录,供 entrypoint.sh 捕获 -# 写回 -with open(cfg_path, "w") as f: - json.dump(cfg, f) -print(f"[fix_tokenizer] done → {OUT_DIR}") +main()