# Image that wraps the vendor vLLM build with a tokenizer auto-fix entrypoint.
FROM registry.maas.sunrise-ai.com/public/vllm:S2-v1.1.1
# Shared-library search path for the vendor runtime (pccl, tangrt), the bundled
# GCC runtimes and torch.
# NOTE(review): several entries below appear twice (/usr/local/pccl/lib, both
# tangrt/targets/... directories, /root/pt200/gcc-11.3.0/install/lib64 and
# /root); the repeats look redundant -- confirm before pruning.
ENV LD_LIBRARY_PATH=/usr/local/pccl/lib:\
/usr/local/tangrt/targets/linux-x86_64/lib:\
/usr/local/tangrt/targets/linux-x86_64/lib/stub:\
/root/pt200/gcc-11.3.0/install/lib64:\
/root:/root/gcc-11.5.0/lib64:\
/usr/local/pccl/lib:\
/usr/local/tangrt/targets/linux-x86_64/lib:\
/usr/local/tangrt/targets/linux-x86_64/lib/stub:\
/usr/local/tangrt/lib/linux-x86_64:\
/root/pt200/gcc-11.3.0/install/lib64:\
/root:\
/usr/lib64:\
/usr/local/lib/python3.10/site-packages/torch/lib
# NOTE(review): presumably turns off PyTorch's automatic loading of
# out-of-tree device backends -- confirm against the PyTorch version in the
# base image.
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
# Put the GCC 11.5.0 toolchain ahead of the standard system directories.
ENV PATH=/root/gcc-11.5.0/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# NOTE(review): the value ends with ':' which leaves an empty path entry --
# confirm this is intended.
ENV PYTHONPATH=/sunrise_code/vllm:/sunrise_code/sunrise_vllm:/usr/local/lib/python3.10/site-packages:
# Helper scripts live side by side in /opt; fix_tokenizer.py imports
# detect_tokenizer as a sibling module, so both must be in the same directory.
COPY fix_tokenizer.py /opt/
COPY detect_tokenizer.py /opt/
COPY entrypoint.sh /opt/
# entrypoint.sh invokes `python3`; point it at the Python 3.10 interpreter.
RUN ln -sf /usr/local/bin/python3.10 /usr/bin/python3
RUN chmod +x /opt/entrypoint.sh

# Exec form: arguments given to `docker run` are passed to the script as "$@".
ENTRYPOINT ["/opt/entrypoint.sh"]
核心思路 + +在容器启动时: + +``` + +entrypoint.sh +↓ +检测 tokenizer 是否异常 +↓ +复制 tokenizer 文件 → /tmp/fixed_tokenizer +↓ +修复 tokenizer_config.json +↓ +vllm serve --tokenizer /tmp/fixed_tokenizer + +```` + +--- + +## 4. 支持的自动修复场景 + +| 原 tokenizer_class | 修复为 | +|-------------------|--------| +| TokenizersBackend | PreTrainedTokenizerFast | +| TiktokenTokenizer | GPT2TokenizerFast | +| 缺失 tokenizer_config | 自动生成 | +| SentencePiece | LlamaTokenizer | + +--- + +## 5. 生成的 tokenizer 目录 + +``` +/tmp/fixed_tokenizer/ +├── tokenizer.json +├── tokenizer_config.json (已修复) +├── special_tokens_map.json (可选) +├── vocab.json / merges.txt (如需要) +``` + +--- + +## 6. 日志说明 + +### 正常情况 + +``` +[entrypoint] tokenizer OK, skip fix +``` + +### 自动修复 + +``` +[entrypoint] fixing tokenizer... +[fix] override bad tokenizer_class: TokenizersBackend → PreTrainedTokenizerFast +``` + +--- + +## 7. 验证方法 + +进入容器执行: + +```python +from transformers import AutoTokenizer + +tok = AutoTokenizer.from_pretrained("/tmp/fixed_tokenizer") + +print(tok.encode("hello world")) +print(tok.decode(tok.encode("hello world"))) +``` + +确保: + +``` +encode → decode 可逆 +``` + +--- + +## 8. 注意事项 + +### ⚠️ 1. tokenizer 文件必须存在 + +至少需要: + +| 类型 | 必需文件 | +| -------------- | ----------------------- | +| Fast tokenizer | tokenizer.json | +| BPE | vocab.json + merges.txt | +| SentencePiece | tokenizer.model | + +--- + +### ⚠️ 2. 不影响模型推理 + +本方案: + +``` +仅影响 tokenizer(文本 ↔ token) +不影响模型计算(attention / KV cache) +``` + +--- + +### ⚠️ 3. 特殊 token 风险 + +需确认: + +``` +bos_token / eos_token / pad_token 一致 +``` + +否则可能影响生成结果 + +--- + +## 9. 
import json
import os


def detect(model_dir):
    """Classify the tokenizer files present in a model directory.

    The kind is decided purely by which well-known files exist, checked in
    this order of precedence:

    1. ``tokenizer.json``              -> ``"fast"``
    2. ``tokenizer.model``             -> ``"sentencepiece"``
    3. ``vocab.json`` + ``merges.txt`` -> ``"bpe"``
    4. anything else                   -> ``"unknown"``

    Args:
        model_dir: Path of the directory to inspect.

    Returns:
        A ``(kind, tokenizer_class)`` tuple. ``tokenizer_class`` is the
        ``tokenizer_class`` value from ``tokenizer_config.json``, or ``""``
        when the file is missing, is not a JSON object, or the field is
        absent, ``null`` or empty. A ``model_dir`` that is not an existing
        directory yields ``("unknown", "")``.

    Raises:
        json.JSONDecodeError: If ``tokenizer_config.json`` exists but is not
            valid JSON.
    """
    if not os.path.isdir(model_dir):
        # Nothing to inspect; report that instead of failing in os.listdir.
        return "unknown", ""

    cls = ""
    cfg_path = os.path.join(model_dir, "tokenizer_config.json")
    if os.path.exists(cfg_path):
        # Read as UTF-8 explicitly so the result does not depend on the
        # container's locale settings.
        with open(cfg_path, encoding="utf-8") as f:
            cfg = json.load(f)
        if isinstance(cfg, dict):
            # `or ""` also normalises an explicit JSON null to "".
            cls = cfg.get("tokenizer_class") or ""

    files = set(os.listdir(model_dir))

    if "tokenizer.json" in files:
        return "fast", cls

    if "tokenizer.model" in files:
        return "sentencepiece", cls

    if {"vocab.json", "merges.txt"} <= files:
        return "bpe", cls

    return "unknown", cls
"""Build a patched copy of a model's tokenizer files.

The known tokenizer files are copied from ``MODEL_DIR`` into ``OUT_DIR`` and
the copied ``tokenizer_config.json`` (created if the model has none) gets its
``tokenizer_class`` replaced by a class chosen from the kind of tokenizer
files found. The model directory itself is never modified.

Environment variables:
    MODEL_DIR: directory holding the model files (default ``/model``).
    FIX_TOKENIZER_DIR: output directory (default ``/tmp/fixed_tokenizer``).
"""
import json
import os
import shutil
import sys

from detect_tokenizer import detect

# NOTE(review): entrypoint.sh from the same commit assigns MODEL_DIR and
# FIX_TOKENIZER_DIR without `export` and runs this script with no arguments,
# so a model path passed as the entrypoint's first argument appears to reach
# this script only when MODEL_DIR is already an environment variable --
# confirm and export both variables there.
MODEL_DIR = os.environ.get("MODEL_DIR", "/model")
OUT_DIR = os.environ.get("FIX_TOKENIZER_DIR", "/tmp/fixed_tokenizer")

# Every file that may belong to the tokenizer; the ones that exist are copied.
TOKENIZER_FILES = (
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "vocab.json",
    "merges.txt",
    "tokenizer.model",
)

# Replacement tokenizer_class for each kind reported by detect().
CLASS_BY_TYPE = {
    "fast": "PreTrainedTokenizerFast",
    "sentencepiece": "LlamaTokenizer",
    "bpe": "GPT2TokenizerFast",
}
# Used when detect() reports a kind that is not in CLASS_BY_TYPE ("unknown").
FALLBACK_CLASS = "PreTrainedTokenizerFast"

# Original class names whose replacement is reported in the log.
BAD_CLASSES = frozenset({"TokenizersBackend", "TiktokenTokenizer"})


def copy_if_exists(name):
    """Copy ``MODEL_DIR/name`` into ``OUT_DIR`` when that file exists."""
    src = os.path.join(MODEL_DIR, name)
    if os.path.isfile(src):
        shutil.copy(src, OUT_DIR)


def main():
    """Copy the tokenizer files and rewrite ``tokenizer_class`` in the copy.

    Exits with a non-zero status and a one-line message when ``MODEL_DIR`` is
    not an existing directory.
    """
    if not os.path.isdir(MODEL_DIR):
        # Fail with a readable message rather than a traceback from a helper.
        sys.exit(f"[fix_tokenizer] model dir not found: {MODEL_DIR}")

    os.makedirs(OUT_DIR, exist_ok=True)

    # Copy every tokenizer-related file that is present.
    for name in TOKENIZER_FILES:
        copy_if_exists(name)

    typ, orig_cls = detect(MODEL_DIR)

    cfg_path = os.path.join(OUT_DIR, "tokenizer_config.json")
    if os.path.exists(cfg_path):
        # Explicit UTF-8: the config may hold non-ASCII special tokens and the
        # container locale is not under this script's control.
        with open(cfg_path, encoding="utf-8") as f:
            cfg = json.load(f)
    else:
        # The model ships no tokenizer_config.json; start from an empty one.
        cfg = {}

    # The class is always replaced, whatever the original value was.
    # NOTE(review): an "auto_map" entry in the copied config is left as is; it
    # may still point at remote tokenizer code -- confirm whether it has to be
    # dropped for the override to take effect with trust_remote_code.
    cfg["tokenizer_class"] = CLASS_BY_TYPE.get(typ, FALLBACK_CLASS)

    if orig_cls in BAD_CLASSES:
        print(f"[fix] override bad tokenizer_class: {orig_cls} → {cfg['tokenizer_class']}")

    # Write back; keep non-ASCII tokens readable instead of \u-escaping them.
    with open(cfg_path, "w", encoding="utf-8") as f:
        json.dump(cfg, f, ensure_ascii=False, indent=2)

    print(f"[fix_tokenizer] done → {OUT_DIR}")


if __name__ == "__main__":
    main()