first commit

2026-05-28 10:56:17 +08:00
commit b0b0248cee
5 changed files with 308 additions and 0 deletions
--- a/24
+++ b/24
@@ -0,0 +1,27 @@
+# Runtime image: the Sunrise vLLM base plus a startup wrapper that repairs broken tokenizer configs.
+FROM registry.maas.sunrise-ai.com/public/vllm:S2-v1.1.1
+# Shared-library search path. Directories are searched left to right, so each one is
+# listed once at the position of its first occurrence.
+ENV LD_LIBRARY_PATH=/usr/local/pccl/lib:\
+/usr/local/tangrt/targets/linux-x86_64/lib:\
+/usr/local/tangrt/targets/linux-x86_64/lib/stub:\
+/root/pt200/gcc-11.3.0/install/lib64:\
+/root:\
+/root/gcc-11.5.0/lib64:\
+/usr/local/tangrt/lib/linux-x86_64:\
+/usr/lib64:\
+/usr/local/lib/python3.10/site-packages/torch/lib
+# Turn off PyTorch's automatic loading of out-of-tree device backend extensions.
+ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
+ENV PATH=/root/gcc-11.5.0/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+# NOTE(review): the trailing ":" adds an empty PYTHONPATH entry; confirm it is intentional.
+ENV PYTHONPATH=/sunrise_code/vllm:/sunrise_code/sunrise_vllm:/usr/local/lib/python3.10/site-packages:
+# Tokenizer repair scripts and the wrapper entrypoint live side by side in /opt so that
+# fix_tokenizer.py can import detect_tokenizer.
+COPY fix_tokenizer.py detect_tokenizer.py entrypoint.sh /opt/
+# entrypoint.sh invokes "python3"; point it at the image's Python 3.10 and make the
+# script executable in the same layer.
+RUN ln -sf /usr/local/bin/python3.10 /usr/bin/python3 \
+    && chmod +x /opt/entrypoint.sh
+
+ENTRYPOINT ["/opt/entrypoint.sh"]
--- a/README.md
+++ b/README.md
@@ -0,0 +1,161 @@
+# vLLM Tokenizer 自动修复方案
+
+## 1. 背景
+
+在使用 vLLM 部署部分模型时，可能会遇到如下报错：
+
+```
+
+ValueError: Tokenizer class TokenizersBackend does not exist or is not currently imported.
+
+```
+
+该问题通常由 transformers 的 tokenizer 加载机制导致：
+
+- tokenizer_config.json 中指定了不存在或不兼容的 tokenizer_class
+- 开启 trust_remote_code=True 时，transformers 会强制加载该 class
+- vLLM 无法通过参数 override tokenizer class
+
+---
+
+## 2. 方案目标
+
+本方案实现：
+
+```
+
+无需修改模型文件
+无需修改启动命令
+自动修复 tokenizer 并启动 vLLM
+
+```
+
+---
+
+## 3. 核心思路
+
+在容器启动时：
+
+```
+
+entrypoint.sh
+↓
+检测 tokenizer 是否异常
+↓
+复制 tokenizer 文件 → /tmp/fixed_tokenizer
+↓
+修复 tokenizer_config.json
+↓
+vllm serve --tokenizer /tmp/fixed_tokenizer
+
+```
+
+---
+
+## 4. 支持的自动修复场景
+
+| 原 tokenizer_class | 修复为 |
+|-------------------|--------|
+| TokenizersBackend | PreTrainedTokenizerFast |
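+| TiktokenTokenizer | 按文件类型选择：tokenizer.json → PreTrainedTokenizerFast；tokenizer.model → LlamaTokenizer；vocab.json + merges.txt → GPT2TokenizerFast |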
+| 缺失 tokenizer_config（需设置 AUTO_FIX_TOKENIZER=1） | 自动生成 |
+| SentencePiece | LlamaTokenizer |
+
+---
+
+## 5. 生成的 tokenizer 目录
+
+```
+/tmp/fixed_tokenizer/
+├── tokenizer.json
+├── tokenizer_config.json   (已修复)
+├── special_tokens_map.json (可选)
+├── vocab.json / merges.txt (如需要)
+```
+
+---
+
+## 6. 日志说明
+
+### 正常情况
+
+```
+[entrypoint] tokenizer OK, skip fix
+```
+
+### 自动修复
+
+```
+[entrypoint] fixing tokenizer...
+[fix] override bad tokenizer_class: TokenizersBackend → PreTrainedTokenizerFast
+```
+
+---
+
+## 7. 验证方法
+
+进入容器执行：
+
+```python
+from transformers import AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("/tmp/fixed_tokenizer")
+
+print(tok.encode("hello world"))
+print(tok.decode(tok.encode("hello world")))
+```
+
+确保：
+
+```
+encode → decode 可逆
+```
+
+---
+
+## 8. 注意事项
+
+### ⚠️ 1. tokenizer 文件必须存在
+
+至少需要：
+
+| 类型             | 必需文件                    |
+| -------------- | ----------------------- |
+| Fast tokenizer | tokenizer.json          |
+| BPE            | vocab.json + merges.txt |
+| SentencePiece  | tokenizer.model         |
+
+---
+
+### ⚠️ 2. 不影响模型推理
+
+本方案：
+
+```
+仅影响 tokenizer（文本 ↔ token）
+不影响模型计算（attention / KV cache）
+```
+
+---
+
+### ⚠️ 3. 特殊 token 风险
+
+需确认：
+
+```
+bos_token / eos_token / pad_token 一致
+```
+
+否则可能影响生成结果
+
+---
+
+## 9. 总结
+
+本方案通过在容器启动阶段引入 tokenizer 修复逻辑，实现：
+
+```
+“模型不动，运行时自适应兼容”
+
+```
+
--- a/detect_tokenizer.py
+++ b/detect_tokenizer.py
@@ -0,0 +1,37 @@
+import os
+import json
+
+def detect(model_dir):
+    """Classify the tokenizer stored in ``model_dir``.
+
+    Returns a ``(kind, tokenizer_class)`` tuple. ``kind`` is decided purely by which
+    files are present, checked in this order: ``tokenizer.json`` -> "fast",
+    ``tokenizer.model`` -> "sentencepiece", ``vocab.json`` plus ``merges.txt`` -> "bpe",
+    otherwise "unknown". ``tokenizer_class`` is the value from
+    ``tokenizer_config.json``, or "" when the file or the key is absent or null.
+
+    Raises whatever ``os.listdir`` / ``json.load`` raise for a missing directory or
+    malformed config.
+    """
+    cfg_path = os.path.join(model_dir, "tokenizer_config.json")
+
+    cls = ""
+    if os.path.exists(cfg_path):
+        # Decode as UTF-8 explicitly so the result does not depend on the container locale.
+        with open(cfg_path, encoding="utf-8") as f:
+            cfg = json.load(f)
+        # A missing key or JSON null both collapse to "" so callers always get a string.
+        cls = cfg.get("tokenizer_class") or ""
+
+    files = set(os.listdir(model_dir))
+
+    if "tokenizer.json" in files:
+        return "fast", cls
+
+    if "tokenizer.model" in files:
+        return "sentencepiece", cls
+
+    if {"vocab.json", "merges.txt"} <= files:
+        return "bpe", cls
+
+    return "unknown", cls
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Container entrypoint: optionally repair the model's tokenizer config, then exec vLLM.
+#
+# Usage: entrypoint.sh [MODEL_DIR] [extra "vllm serve" arguments...]
+#   MODEL_DIR           first argument, defaults to /model
+#   AUTO_FIX_TOKENIZER  "1" or "true": always fix; "auto" (default): fix only when
+#                       tokenizer_config.json mentions a known-bad class; else never fix
+set -e
+
+MODEL_DIR=${1:-/model}
+# Drop MODEL_DIR from "$@"; "|| true" keeps set -e from aborting when no argument was given.
+shift || true
+
+FIX_TOKENIZER_DIR=/tmp/fixed_tokenizer
+AUTO_FIX=${AUTO_FIX_TOKENIZER:-auto}
+
+echo "[entrypoint] model dir: $MODEL_DIR"
+
+NEED_FIX=0
+
+if [ "$AUTO_FIX" = "1" ] || [ "$AUTO_FIX" = "true" ]; then
+    NEED_FIX=1
+elif [ "$AUTO_FIX" = "auto" ]; then
+    if [ -f "$MODEL_DIR/tokenizer_config.json" ] \
+        && grep -qE 'TokenizersBackend|TiktokenTokenizer' "$MODEL_DIR/tokenizer_config.json"; then
+        NEED_FIX=1
+    fi
+fi
+
+# Array (not a string) so the optional flag and its value reach vllm as two exact words.
+TOKENIZER_ARGS=()
+
+if [ "$NEED_FIX" -eq 1 ]; then
+    echo "[entrypoint] fixing tokenizer..."
+    # fix_tokenizer.py takes both directories from the environment, so pass them
+    # explicitly; otherwise a MODEL_DIR given as $1 would be ignored by the fixer.
+    MODEL_DIR="$MODEL_DIR" FIX_TOKENIZER_DIR="$FIX_TOKENIZER_DIR" python3 /opt/fix_tokenizer.py
+    TOKENIZER_ARGS=(--tokenizer "$FIX_TOKENIZER_DIR")
+else
+    echo "[entrypoint] tokenizer OK, skip fix"
+fi
+
+echo "[entrypoint] starting vllm..."
+
+exec vllm serve "$MODEL_DIR" "${TOKENIZER_ARGS[@]}" "$@"
--- a/fix_tokenizer.py
+++ b/fix_tokenizer.py
@@ -0,0 +1,77 @@
+"""Build a repaired copy of a model's tokenizer files.
+
+Copies the tokenizer files found in MODEL_DIR into FIX_TOKENIZER_DIR (both read from
+the environment) and rewrites ``tokenizer_class`` in the copied
+``tokenizer_config.json`` according to the tokenizer type detected on disk. The files
+in MODEL_DIR are never modified.
+"""
+import os
+import shutil
+import json
+from detect_tokenizer import detect
+
+MODEL_DIR = os.environ.get("MODEL_DIR", "/model")
+OUT_DIR = os.environ.get("FIX_TOKENIZER_DIR", "/tmp/fixed_tokenizer")
+
+# Every file that may belong to the tokenizer; each is copied only if it exists.
+TOKENIZER_FILES = (
+    "tokenizer.json",
+    "tokenizer_config.json",
+    "special_tokens_map.json",
+    "vocab.json",
+    "merges.txt",
+    "tokenizer.model",
+)
+
+# Detected tokenizer type -> tokenizer_class written into the repaired config.
+CLASS_BY_TYPE = {
+    "fast": "PreTrainedTokenizerFast",
+    "sentencepiece": "LlamaTokenizer",
+    "bpe": "GPT2TokenizerFast",
+}
+# Used when no known tokenizer file was detected.
+FALLBACK_CLASS = "PreTrainedTokenizerFast"
+
+# Original tokenizer_class values whose replacement is reported in the log.
+bad_classes = [
+    "TokenizersBackend",
+    "TiktokenTokenizer",
+]
+
+os.makedirs(OUT_DIR, exist_ok=True)
+
+def copy_if_exists(name):
+    """Copy ``name`` from MODEL_DIR into OUT_DIR when it is present there."""
+    src = os.path.join(MODEL_DIR, name)
+    if os.path.exists(src):
+        shutil.copy(src, OUT_DIR)
+
+for name in TOKENIZER_FILES:
+    copy_if_exists(name)
+
+typ, orig_cls = detect(MODEL_DIR)
+
+cfg_path = os.path.join(OUT_DIR, "tokenizer_config.json")
+
+# Start from the copied config when there is one, otherwise generate a new config.
+if os.path.exists(cfg_path):
+    # Explicit UTF-8 so decoding does not depend on the container locale.
+    with open(cfg_path, encoding="utf-8") as f:
+        cfg = json.load(f)
+else:
+    cfg = {}
+
+cfg["tokenizer_class"] = CLASS_BY_TYPE.get(typ, FALLBACK_CLASS)
+
+if typ == "unknown":
+    # No complete set of known tokenizer files was detected; report it here instead of
+    # leaving only a later load error.
+    print(f"[fix] warning: no known tokenizer files found in {MODEL_DIR}")
+
+if orig_cls in bad_classes:
+    print(f"[fix] override bad tokenizer_class: {orig_cls} → {cfg['tokenizer_class']}")
+
+# Write the repaired config back into OUT_DIR only.
+with open(cfg_path, "w", encoding="utf-8") as f:
+    json.dump(cfg, f, indent=2)
+
+print(f"[fix_tokenizer] done → {OUT_DIR}")