From 772ad2cd6bbd72a9e611e0eb7ed57829ba238e8f Mon Sep 17 00:00:00 2001
From: Sun Ruoxi
Date: Tue, 16 Jun 2026 18:43:29 +0800
Subject: [PATCH] first commit

---
 .gitea/workflows/docker-build-push.yml | 134 ++++++++++++++++++
 .gitea/workflows/task_info.env         |   3 +
 Dockerfile                             |  30 ++++
 README.md                              | 184 +++++++++++++++++++++++++
 detect_tokenizer.py                    |  25 ++++
 entrypoint.sh                          |  39 ++++++
 fix_tokenizer.py                       |  69 ++++++++++
 7 files changed, 484 insertions(+)
 create mode 100644 .gitea/workflows/docker-build-push.yml
 create mode 100644 .gitea/workflows/task_info.env
 create mode 100644 Dockerfile
 create mode 100644 README.md
 create mode 100644 detect_tokenizer.py
 create mode 100644 entrypoint.sh
 create mode 100644 fix_tokenizer.py

diff --git a/.gitea/workflows/docker-build-push.yml b/.gitea/workflows/docker-build-push.yml
new file mode 100644
index 0000000..ac36813
--- /dev/null
+++ b/.gitea/workflows/docker-build-push.yml
@@ -0,0 +1,134 @@
+name: Docker Build and Push
+
+on:
+  push:
+    tags:
+      - "v*"
+
+jobs:
+  docker:
+    runs-on: amd64-ubuntu-24.04
+
+    steps:
+      - name: Clone repository  # clone the repo and check out the pushed tag
+        run: |
+          git clone "${{ gitea.server_url }}/${{ gitea.repository }}.git" .
+          git checkout "${{ gitea.ref_name }}"
+
+      - name: Set image metadata  # derive IMAGE from registry, user, repo name and tag
+        run: |
+          IMAGE_NAME="$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')"
+          IMAGE="${DOCKER_REGISTRY}/${DOCKER_USERNAME}/${IMAGE_NAME}:${{ gitea.ref_name }}"
+
+          echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITEA_ENV"
+          echo "IMAGE=${IMAGE}" >> "$GITEA_ENV"
+
+      - name: Load and Validate Task Info  # FRAMEWORK must be non-empty; all three go to GITEA_ENV
+        run: |
+          set -a
+          . .gitea/workflows/task_info.env
+          set +a
+
+          for name in FRAMEWORK GPU_TYPE TASK_TYPE; do
+            eval "value=\${${name}:-}"
+            if [ "$name" = "FRAMEWORK" ] && [ -z "$value" ]; then
+              echo "${name} is empty in .gitea/workflows/task_info.env"
+              exit 1
+            fi
+
+            echo "${name}=${value}" >> "$GITEA_ENV"
+          done
+
+      - name: Validate Image Verify Metadata  # passes only on code == 0 and data == true
+        run: |
+          if [ -z "${FIXED_TOKEN:-}" ]; then
+            echo "FIXED_TOKEN is not configured on runner"
+            exit 1
+          fi
+
+          if ! response="$(curl --silent --show-error --location --get 'https://modelhub.org.cn/adminApi/image-verify/validate' \
+            --header "Xc-Token: ${FIXED_TOKEN}" \
+            --data-urlencode "gpuType=${GPU_TYPE:-}" \
+            --data-urlencode "taskType=${TASK_TYPE:-}")"; then
+            echo "failed to call image verify validate API"
+            exit 1
+          fi
+
+          VALIDATE_RESPONSE="$response" python3 - <<'PY'
+          import json
+          import os
+          import sys
+
+          raw = os.environ.get("VALIDATE_RESPONSE", "")
+          try:
+              body = json.loads(raw)
+          except json.JSONDecodeError:
+              print("image verify validate API returned invalid JSON")
+              print(raw)
+              sys.exit(1)
+
+          if body.get("code") == 0 and body.get("data") is True:
+              print("image verify metadata validation passed")
+              sys.exit(0)
+
+          message = body.get("message") or "unknown error"
+          print(f"image verify metadata validation failed: {message}")
+          print(raw)
+          sys.exit(1)
+          PY
+
+      - name: Login to Docker Registry
+        run: |
+          echo "$DOCKER_PASSWORD" | docker login "$DOCKER_REGISTRY" \
+            -u "$DOCKER_USERNAME" \
+            --password-stdin
+
+      - name: Build Docker Image
+        run: |
+          docker build -t "$IMAGE" .
+
+      - name: Push Docker Image  # up to 3 attempts; prints a heartbeat every 60s while pushing
+        run: |
+          for attempt in 1 2 3; do
+            echo "Starting docker push attempt ${attempt}/3 for ${IMAGE}"
+            docker push "$IMAGE" &
+            PUSH_PID=$!
+
+            while kill -0 "$PUSH_PID" 2>/dev/null; do
+              echo "docker push is still running at $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
+              sleep 60
+            done
+
+            if wait "$PUSH_PID"; then
+              echo "docker push completed successfully"
+              exit 0
+            fi
+
+            echo "docker push failed on attempt ${attempt}/3"
+            sleep 30
+          done
+
+          echo "docker push failed after 3 attempts"
+          exit 1
+
+      - name: Notify Image Verify  # POST the pushed image's metadata to the image-verify API
+        run: |
+          if [ -z "${FIXED_TOKEN:-}" ]; then
+            echo "FIXED_TOKEN is not configured on runner"
+            exit 1
+          fi
+
+          curl --silent --show-error --fail-with-body --location --request POST 'https://modelhub.org.cn/adminApi/image-verify' \
+            --header "Xc-Token: ${FIXED_TOKEN}" \
+            --header 'Content-Type: application/json' \
+            --data-raw "{
+              \"framework\": \"${FRAMEWORK}\",
+              \"gpuType\": \"${GPU_TYPE}\",
+              \"imageUrl\": \"${IMAGE}\",
+              \"taskType\": \"${TASK_TYPE}\",
+              \"createBy\": \"${{ gitea.actor }}\",
+              \"repoUrl\": \"${{ gitea.server_url }}/${{ gitea.repository }}\",
+              \"tag\": \"${{ gitea.ref_name }}\"
+            }"
+
+
diff --git a/.gitea/workflows/task_info.env b/.gitea/workflows/task_info.env
new file mode 100644
index 0000000..7d64c86
--- /dev/null
+++ b/.gitea/workflows/task_info.env
@@ -0,0 +1,3 @@
+FRAMEWORK=vllm_fix_tokenizer
+GPU_TYPE=Biren_166m
+TASK_TYPE=text-generation
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..1180cbf
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,30 @@
+FROM harbor.4pd.io/modelhubxc/enginex/xc-llm-biren166m:26.01
+ENV PKG_CONFIG_PATH=/usr/local/birensupa/sdk/1.10.0.0.rc1/brffmpeg/lib/pkgconfig:/usr/local/birensupa/sdk/1.10.0.0.rc1/bevc/lib/pkgconfig
+ENV 
CMAKE_INCLUDE_PATH=/usr/local/birensupa/sdk/1.10.0.0.rc1/suairan/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/deepep/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brSimulator/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/surtc/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/tensor-engine/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sutlass/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/surand/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/supti/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/supa/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sulib/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sudnn-eager/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sufft/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/succl/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sublas/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/libsufile/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brperfworks/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brjpegdec/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sulib/include/sudnn:/usr/local/birensupa/sdk/1.10.0.0.rc1/brffmpeg/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brcc/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brbpp/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/bevc/include +ENV LIBVA_DRIVER_NAME=bevc +ENV PWD=/workspace +ENV HOME=/root +ENV LANG=C.UTF-8 +ENV PYTHONPATH=/usr/local/birensupa/sdk/1.10.0.0.rc1/tensor-engine/python:/usr/local/birensupa/sdk/1.10.0.0.rc1/tensor-engine/python/tvm/python:/usr/local/birensupa/sdk/1.10.0.0.rc1/sulib/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/python +ENV 
CPLUS_INCLUDE_PATH=/usr/local/birensupa/sdk/1.10.0.0.rc1/suairan/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/deepep/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brSimulator/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/surtc/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/tensor-engine/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sutlass/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/surand/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/supti/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/supa/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sulib/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sudnn-eager/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sufft/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/succl/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sublas/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/libsufile/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brperfworks/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brjpegdec/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sulib/include/sudnn:/usr/local/birensupa/sdk/1.10.0.0.rc1/brffmpeg/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brcc/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brbpp/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/bevc/include +ENV 
LIBRARY_PATH=/usr/local/birensupa/sdk/1.10.0.0.rc1/suairan/lib/x86_64-linux-gnu:/usr/local/birensupa/sdk/1.10.0.0.rc1/deepep/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brSimulator/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/surtc/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/tensor-engine/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sutlass/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/surand/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/supti/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/suprofiler/sudx/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/supa/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sulib/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sudnn-eager/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sufft/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sudbg/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/succl/lib/x86_64-linux-gnu:/usr/local/birensupa/sdk/1.10.0.0.rc1/sublas/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/libsufile/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brperfworks/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brjpegdec/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brffmpeg/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brcc/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brbpp/lib/x86_64-linux-gnu:/usr/local/birensupa/sdk/1.10.0.0.rc1/bevc/lib/dri:/usr/local/birensupa/sdk/1.10.0.0.rc1/bevc/lib +ENV SHLVL=0 +ENV 
LD_LIBRARY_PATH=/usr/local/birensupa/sdk/1.10.0.0.rc1/suairan/lib/x86_64-linux-gnu:/usr/local/birensupa/sdk/1.10.0.0.rc1/deepep/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brSimulator/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/surtc/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/tensor-engine/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sutlass/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/surand/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/supti/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/suprofiler/sudx/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/supa/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sulib/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sudnn-eager/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sufft/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sudbg/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/succl/lib/x86_64-linux-gnu:/usr/local/birensupa/sdk/1.10.0.0.rc1/sublas/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/libsufile/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brperfworks/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brjpegdec/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brffmpeg/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brcc/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brbpp/lib/x86_64-linux-gnu:/usr/local/birensupa/sdk/1.10.0.0.rc1/bevc/lib/dri:/usr/local/birensupa/sdk/1.10.0.0.rc1/bevc/lib +ENV SUDNN_ENABLE_ANY_BATCH=1 +ENV BIREN_ENV_SETTED=1 +ENV LIBVA_DRIVERS_PATH=/usr/local/birensupa/sdk/1.10.0.0.rc1/bevc/lib/dri +ENV LC_ALL=C.UTF-8 +ENV 
PATH=/usr/local/birensupa/sdk/1.10.0.0.rc1/suprofiler/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/brSimulator/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/suprof-cli/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/suPerfViz/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/sutlass/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/supti/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/supa-sanitizer/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/sucst/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/sudbg/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/brperfworks/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/brffmpeg/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/brcc/bin:/usr/local/birensupa/sdk/1.10.0.0.rc1/bevc/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV SUFILE_ENV_PATH_JSON=/usr/local/birensupa/sdk/1.10.0.0.rc1/libsufile/sufile.json +ENV C_INCLUDE_PATH=/usr/local/birensupa/sdk/1.10.0.0.rc1/suairan/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/deepep/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brSimulator/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/surtc/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/tensor-engine/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sutlass/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/surand/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/supti/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/supa/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sulib/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sudnn-eager/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sufft/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/succl/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sublas/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/libsufile/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brperfworks/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brjpegdec/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/sulib/include/sudnn:/usr/local/birensupa/sdk/1.10.0.0.rc1/brffmpeg/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brcc/include:/usr/local/birensupa/sdk/1.10.0.0.rc1/brbpp/include:/usr/local/birensupa/sdk/1
.10.0.0.rc1/bevc/include +ENV DEBIAN_FRONTEND=noninteractive +ENV SUPA_PATH=/usr/local/birensupa/sdk/1.10.0.0.rc1/supa +ENV CMAKE_LIBRARY_PATH=/usr/local/birensupa/sdk/1.10.0.0.rc1/suairan/lib/x86_64-linux-gnu:/usr/local/birensupa/sdk/1.10.0.0.rc1/deepep/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brSimulator/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/surtc/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/tensor-engine/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sutlass/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/surand/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/supti/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/suprofiler/sudx/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/supa/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sulib/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sudnn-eager/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sufft/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/sudbg/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/succl/lib/x86_64-linux-gnu:/usr/local/birensupa/sdk/1.10.0.0.rc1/sublas/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/libsufile/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brperfworks/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brjpegdec/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brffmpeg/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brcc/lib:/usr/local/birensupa/sdk/1.10.0.0.rc1/brbpp/lib/x86_64-linux-gnu:/usr/local/birensupa/sdk/1.10.0.0.rc1/bevc/lib/dri:/usr/local/birensupa/sdk/1.10.0.0.rc1/bevc/lib +ENV _=/usr/bin/env + +COPY fix_tokenizer.py /opt/ +COPY detect_tokenizer.py /opt/ +COPY entrypoint.sh /opt/ +RUN chmod +x /opt/entrypoint.sh + +ENTRYPOINT ["/opt/entrypoint.sh"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..f70e799 --- /dev/null +++ b/README.md @@ -0,0 +1,184 @@ +# vLLM Tokenizer 自动修复方案 + +## 1. 背景 + +在使用 vLLM 部署部分模型时,可能会遇到如下报错: + +``` + +ValueError: Tokenizer class TokenizersBackend does not exist or is not currently imported. 
+ +``` + +该问题通常由 transformers 的 tokenizer 加载机制导致: + +- tokenizer_config.json 中指定了不存在或不兼容的 tokenizer_class +- 开启 trust_remote_code=True 时,transformers 会强制加载该 class +- vLLM 无法通过参数 override tokenizer class + +--- + +## 2. 方案目标 + +本方案实现: + +``` + +无需修改模型文件 +无需修改启动命令 +自动修复 tokenizer 并启动 vLLM + +``` + +--- + +## 3. 核心思路 + +在容器启动时: + +``` + +entrypoint.sh +↓ +检测 tokenizer 是否异常 +↓ +复制 tokenizer 文件 → /tmp/fixed_tokenizer +↓ +修复 tokenizer_config.json +↓ +vllm serve --tokenizer /tmp/fixed_tokenizer + +```` + +--- + +## 4. 支持的自动修复场景 + +| 原 tokenizer_class | 修复为 | +|-------------------|--------| +| TokenizersBackend | PreTrainedTokenizerFast | +| TiktokenTokenizer | GPT2TokenizerFast | +| 缺失 tokenizer_config | 自动生成 | +| SentencePiece | LlamaTokenizer | + +### 修复 extra_special_tokens 格式 + +当 `extra_special_tokens` 为 list 格式时,自动转换为 dict 格式: + +```json +// 修复前 +"extra_special_tokens": ["<|im_start|>", "<|im_end|>", "<|box_start|>", "<|box_end|>", ...] + +// 修复后 +"extra_special_tokens": { + "<|im_start|>": "<|im_start|>", + "<|im_end|>": "<|im_end|>", + "<|box_start|>": "<|box_start|>", + "<|box_end|>": "<|box_end|>", + ... +} +``` + +--- + +## 5. 生成的 tokenizer 目录 + +``` +/tmp/fixed_tokenizer/ +├── tokenizer.json +├── tokenizer_config.json (已修复) +├── special_tokens_map.json (可选) +├── vocab.json / merges.txt (如需要) +``` + +--- + +## 6. 日志说明 + +### 正常情况 + +``` +[entrypoint] tokenizer OK, skip fix +``` + +### 自动修复 + +``` +[entrypoint] fixing tokenizer... +[fix] override bad tokenizer_class: TokenizersBackend → PreTrainedTokenizerFast +[fix] converted extra_special_tokens from list (13 items) to dict format +``` + +触发条件(AUTO_FIX=auto 时): +- tokenizer_config.json 包含 `TokenizersBackend` 或 `TiktokenTokenizer` +- tokenizer_config.json 中 `extra_special_tokens` 为 list 格式(`"extra_special_tokens": [`) + +--- + +## 7. 
验证方法
+
+进入容器执行:
+
+```python
+from transformers import AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("/tmp/fixed_tokenizer")
+
+print(tok.encode("hello world"))
+print(tok.decode(tok.encode("hello world")))
+```
+
+确保:
+
+```
+encode → decode 可逆
+```
+
+---
+
+## 8. 注意事项
+
+### ⚠️ 1. tokenizer 文件必须存在
+
+至少需要:
+
+| 类型 | 必需文件 |
+| -------------- | ----------------------- |
+| Fast tokenizer | tokenizer.json |
+| BPE | vocab.json + merges.txt |
+| SentencePiece | tokenizer.model |
+
+---
+
+### ⚠️ 2. 不影响模型推理
+
+本方案:
+
+```
+仅影响 tokenizer(文本 ↔ token)
+不影响模型计算(attention / KV cache)
+```
+
+---
+
+### ⚠️ 3. 特殊 token 风险
+
+需确认:
+
+```
+bos_token / eos_token / pad_token 一致
+```
+
+否则可能影响生成结果
+
+---
+
+## 9. 总结
+
+本方案通过在容器启动阶段引入 tokenizer 修复逻辑,实现:
+
+```
+“模型不动,运行时自适应兼容”
+
+```
+```
diff --git a/detect_tokenizer.py b/detect_tokenizer.py
new file mode 100644
index 0000000..c0e7b3e
--- /dev/null
+++ b/detect_tokenizer.py
@@ -0,0 +1,45 @@
+"""Detect which tokenizer layout a model directory uses."""
+import os
+import json
+
+
+def detect(model_dir):
+    """Classify the tokenizer files found in ``model_dir``.
+
+    Args:
+        model_dir: Directory that holds the model's tokenizer files.
+
+    Returns:
+        A ``(kind, tokenizer_class)`` tuple. ``kind`` is ``"fast"`` when
+        tokenizer.json is present, else ``"sentencepiece"`` when
+        tokenizer.model is present, else ``"bpe"`` when both vocab.json and
+        merges.txt are present, else ``"unknown"``. ``tokenizer_class`` is
+        the value declared in tokenizer_config.json, or ``""`` when the file
+        is missing or the key is absent or null.
+
+    Raises:
+        OSError: If ``model_dir`` cannot be listed.
+        json.JSONDecodeError: If tokenizer_config.json is not valid JSON.
+    """
+    cfg_path = os.path.join(model_dir, "tokenizer_config.json")
+
+    cls = ""
+    if os.path.exists(cfg_path):
+        # Read as UTF-8 explicitly so parsing does not depend on the locale.
+        with open(cfg_path, encoding="utf-8") as f:
+            cfg = json.load(f)
+        # "tokenizer_class": null must behave like a missing key.
+        cls = cfg.get("tokenizer_class") or ""
+
+    files = set(os.listdir(model_dir))
+
+    if "tokenizer.json" in files:
+        return "fast", cls
+
+    if "tokenizer.model" in files:
+        return "sentencepiece", cls
+
+    if {"vocab.json", "merges.txt"} <= files:
+        return "bpe", cls
+
+    return "unknown", cls
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100644
index 0000000..07308d3
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Container entrypoint: optionally build a patched tokenizer copy, then
+# start `vllm serve`.
+#
+# Usage: entrypoint.sh [MODEL_DIR] [extra vllm serve args...]
+#   AUTO_FIX_TOKENIZER=1|true  always run the tokenizer fix
+#   AUTO_FIX_TOKENIZER=auto    (default) fix only when a known problem is found
+#   any other value            never run the fix
+set -e
+
+MODEL_DIR=${1:-/model}
+shift || true
+
+FIX_TOKENIZER_DIR=/tmp/fixed_tokenizer
+AUTO_FIX=${AUTO_FIX_TOKENIZER:-auto}
+
+# fix_tokenizer.py reads both paths from the environment; without the export
+# it would fall back to /model even when another MODEL_DIR was given.
+export MODEL_DIR FIX_TOKENIZER_DIR
+
+echo "[entrypoint] model dir: $MODEL_DIR"
+
+NEED_FIX=0
+
+if [ "$AUTO_FIX" = "1" ] || [ "$AUTO_FIX" = "true" ]; then
+    NEED_FIX=1
+elif [ "$AUTO_FIX" = "auto" ]; then
+    if [ -f "$MODEL_DIR/tokenizer_config.json" ]; then
+        if grep -q "TokenizersBackend\|TiktokenTokenizer" "$MODEL_DIR/tokenizer_config.json"; then
+            NEED_FIX=1
+        fi
+        # Detect extra_special_tokens given as a list
+        if grep -q '"extra_special_tokens":\s*\[' "$MODEL_DIR/tokenizer_config.json"; then
+            NEED_FIX=1
+        fi
+    fi
+fi
+
+if [ $NEED_FIX -eq 1 ]; then
+    echo "[entrypoint] fixing tokenizer..."
+    python3 /opt/fix_tokenizer.py
+    TOKENIZER_ARG="--tokenizer $FIX_TOKENIZER_DIR"
+else
+    echo "[entrypoint] tokenizer OK, skip fix"
+    TOKENIZER_ARG=""
+fi
+
+echo "[entrypoint] starting vllm..."
+
+# TOKENIZER_ARG is left unquoted on purpose: it is either empty or two words.
+exec vllm serve "$MODEL_DIR" $TOKENIZER_ARG "$@"
diff --git a/fix_tokenizer.py b/fix_tokenizer.py
new file mode 100644
index 0000000..9556ea6
--- /dev/null
+++ b/fix_tokenizer.py
@@ -0,0 +1,105 @@
+"""Build a patched copy of a model's tokenizer files.
+
+Copies the tokenizer files from MODEL_DIR into OUT_DIR and rewrites
+tokenizer_config.json there; nothing inside MODEL_DIR is modified.
+"""
+import os
+import shutil
+import json
+from detect_tokenizer import detect
+
+# Both locations come from the environment, with container defaults.
+MODEL_DIR = os.environ.get("MODEL_DIR", "/model")
+OUT_DIR = os.environ.get("FIX_TOKENIZER_DIR", "/tmp/fixed_tokenizer")
+
+# Candidate tokenizer files; each one is copied only when it exists.
+TOKENIZER_FILES = (
+    "tokenizer.json",
+    "tokenizer_config.json",
+    "special_tokens_map.json",
+    "added_tokens.json",
+    "vocab.json",
+    "merges.txt",
+    "tokenizer.model",
+    # The chat template may live in its own file; copy it as well so the
+    # patched directory does not lose it.
+    "chat_template.jinja",
+    "chat_template.json",
+)
+
+# tokenizer_class written for each layout reported by detect().
+CLASS_BY_TYPE = {
+    "fast": "PreTrainedTokenizerFast",
+    "sentencepiece": "LlamaTokenizer",
+    "bpe": "GPT2TokenizerFast",
+}
+FALLBACK_CLASS = "PreTrainedTokenizerFast"
+
+# tokenizer_class values this script reports as bad.
+BAD_CLASSES = ("TokenizersBackend", "TiktokenTokenizer")
+
+
+def copy_if_exists(name):
+    """Copy MODEL_DIR/name into OUT_DIR if that file exists."""
+    src = os.path.join(MODEL_DIR, name)
+    if os.path.exists(src):
+        shutil.copy(src, OUT_DIR)
+
+
+def patch_config(cfg, typ, orig_cls):
+    """Rewrite a tokenizer_config dict in place and return it.
+
+    Args:
+        cfg: Parsed tokenizer_config.json ({} when the file is missing).
+        typ: Layout name returned by detect().
+        orig_cls: tokenizer_class declared by the model ("" if none).
+
+    Returns:
+        The same ``cfg`` object, after patching.
+    """
+    new_cls = CLASS_BY_TYPE.get(typ, FALLBACK_CLASS)
+    cfg["tokenizer_class"] = new_cls
+
+    if orig_cls in BAD_CLASSES:
+        print(f"[fix] override bad tokenizer_class: {orig_cls} → {new_cls}")
+    elif orig_cls != new_cls:
+        # The class is replaced even when it is not a known-bad one; log
+        # that too so the change never happens silently.
+        print(f"[fix] set tokenizer_class: {orig_cls or '<none>'} → {new_cls}")
+
+    # extra_special_tokens: list → dict, mapping every token to itself.
+    extra = cfg.get("extra_special_tokens")
+    if isinstance(extra, list):
+        cfg["extra_special_tokens"] = {token: token for token in extra}
+        print(f"[fix] converted extra_special_tokens from list ({len(extra)} items) to dict format")
+
+    return cfg
+
+
+def main():
+    """Copy the tokenizer files and write the patched config to OUT_DIR."""
+    os.makedirs(OUT_DIR, exist_ok=True)
+
+    for name in TOKENIZER_FILES:
+        copy_if_exists(name)
+
+    typ, orig_cls = detect(MODEL_DIR)
+
+    cfg_path = os.path.join(OUT_DIR, "tokenizer_config.json")
+    if os.path.exists(cfg_path):
+        # Read and write as UTF-8 explicitly, independent of the locale.
+        with open(cfg_path, encoding="utf-8") as f:
+            cfg = json.load(f)
+    else:
+        cfg = {}
+
+    patch_config(cfg, typ, orig_cls)
+
+    with open(cfg_path, "w", encoding="utf-8") as f:
+        json.dump(cfg, f, indent=2)
+
+    print(f"[fix_tokenizer] done → {OUT_DIR}")
+
+
+if __name__ == "__main__":
+    main()