first commit

2026-05-28 10:56:17 +08:00
commit b0b0248cee
5 changed files with 308 additions and 0 deletions
--- a/24
+++ b/24
@@ -0,0 +1,24 @@
 # Image definition: starts from the vLLM base image below and installs the
 # tokenizer auto-fix scripts plus a custom entrypoint under /opt.
 FROM registry.maas.sunrise-ai.com/public/vllm:S2-v1.1.1
 # Shared-library search path covering the pccl, tangrt, gcc and torch library
 # directories.
 # NOTE(review): several entries appear twice in this list; that looks
 # redundant — confirm before de-duplicating.
 ENV LD_LIBRARY_PATH=/usr/local/pccl/lib:\
 /usr/local/tangrt/targets/linux-x86_64/lib:\
 /usr/local/tangrt/targets/linux-x86_64/lib/stub:\
 /root/pt200/gcc-11.3.0/install/lib64:\
 /root:/root/gcc-11.5.0/lib64:\
 /usr/local/pccl/lib:\
 /usr/local/tangrt/targets/linux-x86_64/lib:\
 /usr/local/tangrt/targets/linux-x86_64/lib/stub:\
 /usr/local/tangrt/lib/linux-x86_64:\
 /root/pt200/gcc-11.3.0/install/lib64:\
 /root:\
 /usr/lib64:\
 /usr/local/lib/python3.10/site-packages/torch/lib
 # Presumably turns off PyTorch's automatic loading of device-backend
 # extensions — TODO confirm against the PyTorch documentation.
 ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
 # /root/gcc-11.5.0/bin is searched before the system directories.
 ENV PATH=/root/gcc-11.5.0/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 # /sunrise_code/vllm and /sunrise_code/sunrise_vllm are listed ahead of
 # site-packages.
 # NOTE(review): the trailing ':' leaves an empty entry at the end — confirm
 # that this is intended.
 ENV PYTHONPATH=/sunrise_code/vllm:/sunrise_code/sunrise_vllm:/usr/local/lib/python3.10/site-packages:
 # The two Python helpers and the entrypoint all live in /opt;
 # fix_tokenizer.py imports detect_tokenizer, so they must share a directory.
 COPY fix_tokenizer.py /opt/
 COPY detect_tokenizer.py /opt/
 COPY entrypoint.sh /opt/
 # entrypoint.sh invokes `python3`; point it at the Python 3.10 interpreter.
 RUN ln -sf /usr/local/bin/python3.10 /usr/bin/python3
 # The entrypoint is executed directly, so it needs the executable bit.
 RUN chmod +x /opt/entrypoint.sh
 ENTRYPOINT ["/opt/entrypoint.sh"]
--- a/README.md
+++ b/README.md
@@ -0,0 +1,161 @@
 # vLLM Tokenizer 自动修复方案
 ## 1. 背景
 在使用 vLLM 部署部分模型时，可能会遇到如下报错：
 ```
 ValueError: Tokenizer class TokenizersBackend does not exist or is not currently imported.
 ```
 该问题通常由 transformers 的 tokenizer 加载机制导致：
 - tokenizer_config.json 中指定了不存在或不兼容的 tokenizer_class
 - 开启 trust_remote_code=True 时，transformers 会强制加载该 class
 - vLLM 无法通过参数 override tokenizer class
 ---
 ## 2. 方案目标
 本方案实现：
 ```
 无需修改模型文件
 无需修改启动命令
 自动修复 tokenizer 并启动 vLLM
 ```
 ---
 ## 3. 核心思路
 在容器启动时：
 ```
 entrypoint.sh
 ↓
 检测 tokenizer 是否异常
 ↓
 复制 tokenizer 文件 → /tmp/fixed_tokenizer
 ↓
 修复 tokenizer_config.json
 ↓
 vllm serve --tokenizer /tmp/fixed_tokenizer
 ```
 ---
 ## 4. 支持的自动修复场景
 | 原 tokenizer_class | 修复为 |
 |-------------------|--------|
 | TokenizersBackend | PreTrainedTokenizerFast |
 | TiktokenTokenizer | GPT2TokenizerFast |
 | 缺失 tokenizer_config | 自动生成 |
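 | SentencePiece | LlamaTokenizer |
 > 注：实际写入的 tokenizer_class 由模型目录中存在的文件决定，而不是由原 tokenizer_class 决定：存在 tokenizer.json → PreTrainedTokenizerFast；存在 tokenizer.model → LlamaTokenizer；存在 vocab.json + merges.txt → GPT2TokenizerFast。另外，缺失 tokenizer_config.json 时，只有设置 AUTO_FIX_TOKENIZER=1（或 true）才会自动生成；默认的 auto 模式会跳过修复。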
 ---
 ## 5. 生成的 tokenizer 目录
 ```
 /tmp/fixed_tokenizer/
 ├── tokenizer.json
 ├── tokenizer_config.json   (已修复)
 ├── special_tokens_map.json (可选)
 ├── vocab.json / merges.txt (如需要)
 └── tokenizer.model         (如需要)
 ```
 ---
 ## 6. 日志说明
 ### 正常情况
 ```
 [entrypoint] tokenizer OK, skip fix
 ```
 ### 自动修复
 ```
 [entrypoint] fixing tokenizer...
 [fix] override bad tokenizer_class: TokenizersBackend → PreTrainedTokenizerFast
 ```
 ---
 ## 7. 验证方法
 进入容器执行：
 ```python
 from transformers import AutoTokenizer
 tok = AutoTokenizer.from_pretrained("/tmp/fixed_tokenizer")
 print(tok.encode("hello world"))
 print(tok.decode(tok.encode("hello world")))
 ```
 确保：
 ```
 encode → decode 可逆
 ```
 ---
 ## 8. 注意事项
 ### ⚠️ 1. tokenizer 文件必须存在
 至少需要：
 | 类型             | 必需文件                    |
 | -------------- | ----------------------- |
 | Fast tokenizer | tokenizer.json          |
 | BPE            | vocab.json + merges.txt |
 | SentencePiece  | tokenizer.model         |
 ---
 ### ⚠️ 2. 不影响模型推理
 本方案：
 ```
 仅影响 tokenizer（文本 ↔ token）
 不影响模型计算（attention / KV cache）
 ```
 ---
 ### ⚠️ 3. 特殊 token 风险
 需确认：
 ```
 bos_token / eos_token / pad_token 一致
 ```
 否则可能影响生成结果
 ---
 ## 9. 总结
 本方案通过在容器启动阶段引入 tokenizer 修复逻辑，实现：
 ```
 “模型不动，运行时自适应兼容”
 ```
--- a/detect_tokenizer.py
+++ b/detect_tokenizer.py
@@ -0,0 +1,25 @@
 import os
 import json
 def detect(model_dir):
    """Classify the tokenizer files found in *model_dir*.

    Args:
        model_dir: Path of the model directory to inspect.

    Returns:
        A ``(kind, tokenizer_class)`` tuple. ``kind`` is ``"fast"`` when
        ``tokenizer.json`` exists, ``"sentencepiece"`` when
        ``tokenizer.model`` exists, ``"bpe"`` when both ``vocab.json`` and
        ``merges.txt`` exist, and ``"unknown"`` otherwise (checked in that
        order). ``tokenizer_class`` is the value declared in
        ``tokenizer_config.json``, or ``""`` when the file is missing or the
        key is missing or null.

    Raises:
        OSError: If *model_dir* cannot be listed or the config cannot be read.
        json.JSONDecodeError: If ``tokenizer_config.json`` is not valid JSON.
    """
    cfg_path = os.path.join(model_dir, "tokenizer_config.json")
    cls = ""
    if os.path.exists(cfg_path):
        # Tokenizer configs are UTF-8 JSON and often contain non-ASCII special
        # tokens; do not depend on the container's locale encoding.
        with open(cfg_path, encoding="utf-8") as f:
            cfg = json.load(f)
        # A JSON null must not leak out as None; callers expect a string.
        cls = cfg.get("tokenizer_class") or ""
    files = set(os.listdir(model_dir))
    if "tokenizer.json" in files:
        return "fast", cls
    if "tokenizer.model" in files:
        return "sentencepiece", cls
    if "vocab.json" in files and "merges.txt" in files:
        return "bpe", cls
    return "unknown", cls
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -0,0 +1,35 @@
 #!/bin/bash
 # Container entrypoint: optionally repair the model's tokenizer config, then
 # exec `vllm serve`.
 #
 # Usage: entrypoint.sh [MODEL_DIR] [extra `vllm serve` arguments...]
 #   MODEL_DIR            first positional argument, defaults to /model
 #   AUTO_FIX_TOKENIZER   1 | true     -> always run the fix
 #                        auto (default) -> fix only when tokenizer_config.json
 #                                          mentions a known-bad tokenizer class
 #                        anything else  -> never run the fix
 set -e
 MODEL_DIR=${1:-/model}
 # `shift` fails when no argument was given; ignore that so `set -e` does not
 # abort the script.
 shift || true
 FIX_TOKENIZER_DIR=/tmp/fixed_tokenizer
 AUTO_FIX=${AUTO_FIX_TOKENIZER:-auto}
 echo "[entrypoint] model dir: $MODEL_DIR"
 NEED_FIX=0
 if [ "$AUTO_FIX" = "1" ] || [ "$AUTO_FIX" = "true" ]; then
    NEED_FIX=1
 elif [ "$AUTO_FIX" = "auto" ]; then
    if [ -f "$MODEL_DIR/tokenizer_config.json" ]; then
        # -E: plain alternation instead of the GNU-only `\|` BRE extension.
        if grep -Eq "TokenizersBackend|TiktokenTokenizer" "$MODEL_DIR/tokenizer_config.json"; then
            NEED_FIX=1
        fi
    fi
 fi
 # An array expands to zero words when empty and keeps each argument intact,
 # unlike an unquoted string.
 TOKENIZER_ARGS=()
 if [ "$NEED_FIX" -eq 1 ]; then
    echo "[entrypoint] fixing tokenizer..."
    # fix_tokenizer.py reads both directories from the environment. They are
    # plain shell variables here, so pass them explicitly; otherwise the script
    # falls back to its own defaults and ignores a MODEL_DIR given as $1.
    MODEL_DIR="$MODEL_DIR" FIX_TOKENIZER_DIR="$FIX_TOKENIZER_DIR" python3 /opt/fix_tokenizer.py
    TOKENIZER_ARGS=(--tokenizer "$FIX_TOKENIZER_DIR")
 else
    echo "[entrypoint] tokenizer OK, skip fix"
 fi
 echo "[entrypoint] starting vllm..."
 exec vllm serve "$MODEL_DIR" "${TOKENIZER_ARGS[@]}" "$@"
--- a/fix_tokenizer.py
+++ b/fix_tokenizer.py
@@ -0,0 +1,63 @@
 import os
 import shutil
 import json
 from detect_tokenizer import detect
 # Source model directory and destination directory for the repaired tokenizer
 # copy; both can be overridden through environment variables.
 MODEL_DIR = os.environ.get("MODEL_DIR", "/model")
 OUT_DIR = os.environ.get("FIX_TOKENIZER_DIR", "/tmp/fixed_tokenizer")
 os.makedirs(OUT_DIR, exist_ok=True)
 def copy_if_exists(name):
    """Copy ``MODEL_DIR/name`` into ``OUT_DIR``; do nothing when it is absent."""
    candidate = os.path.join(MODEL_DIR, name)
    if not os.path.exists(candidate):
        return
    shutil.copy(candidate, OUT_DIR)
 # Copy every tokenizer-related file that exists in the model directory.
 for f in (
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "vocab.json",
    "merges.txt",
    "tokenizer.model",
 ):
    copy_if_exists(f)
 typ, orig_cls = detect(MODEL_DIR)
 cfg_path = os.path.join(OUT_DIR, "tokenizer_config.json")
 if os.path.exists(cfg_path):
    # Tokenizer configs are UTF-8 JSON; do not depend on the locale encoding.
    with open(cfg_path, encoding="utf-8") as f:
        cfg = json.load(f)
 else:
    # The model shipped no tokenizer_config.json: start from an empty config.
    cfg = {}
 # ===== Automatic fix strategy =====
 # The replacement class is chosen from the files present in the model
 # directory (as reported by detect), not from the original class name.
 class_by_type = {
    "fast": "PreTrainedTokenizerFast",
    "sentencepiece": "LlamaTokenizer",
    "bpe": "GPT2TokenizerFast",
 }
 # Any other type falls back to PreTrainedTokenizerFast.
 cfg["tokenizer_class"] = class_by_type.get(typ, "PreTrainedTokenizerFast")
 # Log only when the original class is one of the known-bad ones.
 bad_classes = [
    "TokenizersBackend",
    "TiktokenTokenizer",
 ]
 if orig_cls in bad_classes:
    print(f"[fix] override bad tokenizer_class: {orig_cls} → {cfg['tokenizer_class']}")
 # Write the patched config back as UTF-8, keeping non-ASCII tokens readable.
 with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(cfg, f, ensure_ascii=False, indent=2)
 print(f"[fix_tokenizer] done → {OUT_DIR}")