From cfbb59512796debfc0128b824667de1ae6203266 Mon Sep 17 00:00:00 2001
From: 4paradigm <4paradigm@4paradigmdeMacBook-Pro.local>
Date: Wed, 1 Jul 2026 17:59:17 +0800
Subject: [PATCH] Initial commit: bi100 tokenizer patch

Add fix_tokenizer.py, vllm_wrapper.sh, Dockerfile, and README for
automatic tokenizer_config.json repair on Iluvatar BI-100 vLLM images.
---
 .gitea/workflows/docker-build-push.yml | 132 ++++++++++++++++++++++++
 .gitea/workflows/task_info.env         |   3 +
 Dockerfile                             |  10 ++
 README.md                              |  55 ++++++++++
 fix_tokenizer.py                       | 135 +++++++++++++++++++++++++
 vllm_wrapper.sh                        |  20 ++++
 6 files changed, 355 insertions(+)
 create mode 100644 .gitea/workflows/docker-build-push.yml
 create mode 100644 .gitea/workflows/task_info.env
 create mode 100644 Dockerfile
 create mode 100644 README.md
 create mode 100644 fix_tokenizer.py
 create mode 100644 vllm_wrapper.sh

diff --git a/.gitea/workflows/docker-build-push.yml b/.gitea/workflows/docker-build-push.yml
new file mode 100644
index 0000000..e062a95
--- /dev/null
+++ b/.gitea/workflows/docker-build-push.yml
@@ -0,0 +1,132 @@
+name: Docker Build and Push
+
+on:
+  push:
+    tags:
+      - "v*"
+
+jobs:
+  docker:
+    runs-on: amd64-ubuntu-24.04
+
+    steps:
+      - name: Clone repository
+        run: |
+          git clone "${{ gitea.server_url }}/${{ gitea.repository }}.git" .
+          git checkout "${{ gitea.ref_name }}"
+
+      - name: Set image metadata
+        run: |
+          IMAGE_NAME="$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')"
+          IMAGE="${DOCKER_REGISTRY}/${DOCKER_USERNAME}/${IMAGE_NAME}:${{ gitea.ref_name }}"
+
+          echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITEA_ENV"
+          echo "IMAGE=${IMAGE}" >> "$GITEA_ENV"
+
+      - name: Load and Validate Task Info
+        run: |
+          set -a
+          . .gitea/workflows/task_info.env
+          set +a
+
+          for name in FRAMEWORK GPU_TYPE TASK_TYPE; do
+            eval "value=\${${name}:-}"
+            if [ "$name" = "FRAMEWORK" ] && [ -z "$value" ]; then
+              echo "${name} is empty in .gitea/workflows/task_info.env"
+              exit 1
+            fi
+
+            echo "${name}=${value}" >> "$GITEA_ENV"
+          done
+
+      - name: Validate Image Verify Metadata
+        run: |
+          if [ -z "${FIXED_TOKEN:-}" ]; then
+            echo "FIXED_TOKEN is not configured on runner"
+            exit 1
+          fi
+
+          if ! response="$(curl --silent --show-error --location --get 'https://modelhub.org.cn/adminApi/image-verify/validate' \
+            --header "Xc-Token: ${FIXED_TOKEN}" \
+            --data-urlencode "gpuType=${GPU_TYPE:-}" \
+            --data-urlencode "taskType=${TASK_TYPE:-}")"; then
+            echo "failed to call image verify validate API"
+            exit 1
+          fi
+
+          VALIDATE_RESPONSE="$response" python3 - <<'PY'
+          import json
+          import os
+          import sys
+
+          raw = os.environ.get("VALIDATE_RESPONSE", "")
+          try:
+              body = json.loads(raw)
+          except json.JSONDecodeError:
+              print("image verify validate API returned invalid JSON")
+              print(raw)
+              sys.exit(1)
+
+          if body.get("code") == 0 and body.get("data") is True:
+              print("image verify metadata validation passed")
+              sys.exit(0)
+
+          message = body.get("message") or "unknown error"
+          print(f"image verify metadata validation failed: {message}")
+          print(raw)
+          sys.exit(1)
+          PY
+
+      - name: Login to Docker Registry
+        run: |
+          echo "$DOCKER_PASSWORD" | docker login "$DOCKER_REGISTRY" \
+            -u "$DOCKER_USERNAME" \
+            --password-stdin
+
+      - name: Build Docker Image
+        run: |
+          docker build -t "$IMAGE" .
+
+      - name: Push Docker Image
+        run: |
+          for attempt in 1 2 3; do
+            echo "Starting docker push attempt ${attempt}/3 for ${IMAGE}"
+            docker push "$IMAGE" &
+            PUSH_PID=$!
+
+            while kill -0 "$PUSH_PID" 2>/dev/null; do
+              echo "docker push is still running at $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
+              sleep 60
+            done
+
+            if wait "$PUSH_PID"; then
+              echo "docker push completed successfully"
+              exit 0
+            fi
+
+            echo "docker push failed on attempt ${attempt}/3"
+            sleep 30
+          done
+
+          echo "docker push failed after 3 attempts"
+          exit 1
+
+      - name: Notify Image Verify
+        run: |
+          if [ -z "${FIXED_TOKEN:-}" ]; then
+            echo "FIXED_TOKEN is not configured on runner"
+            exit 1
+          fi
+
+          curl --silent --show-error --fail-with-body --location --request POST 'https://modelhub.org.cn/adminApi/image-verify' \
+            --header "Xc-Token: ${FIXED_TOKEN}" \
+            --header 'Content-Type: application/json' \
+            --data-raw "{
+              \"framework\": \"${FRAMEWORK}\",
+              \"gpuType\": \"${GPU_TYPE}\",
+              \"imageUrl\": \"${IMAGE}\",
+              \"taskType\": \"${TASK_TYPE}\",
+              \"createBy\": \"${{ gitea.actor }}\",
+              \"repoUrl\": \"${{ gitea.server_url }}/${{ gitea.repository }}\",
+              \"tag\": \"${{ gitea.ref_name }}\"
+            }"
\ No newline at end of file
diff --git a/.gitea/workflows/task_info.env b/.gitea/workflows/task_info.env
new file mode 100644
index 0000000..9e3f508
--- /dev/null
+++ b/.gitea/workflows/task_info.env
@@ -0,0 +1,3 @@
+FRAMEWORK=vllm-patch-tokenizer
+GPU_TYPE=Iluvatar_bi-100
+TASK_TYPE=text-generation
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..ef87f62
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,10 @@
+FROM harbor-contest.4pd.io/luopingyi/bi100/vllm:0.6.3
+
+COPY fix_tokenizer.py /opt/fix_tokenizer.py
+COPY vllm_wrapper.sh /opt/vllm_wrapper.sh
+
+RUN chmod +x /opt/vllm_wrapper.sh && \
+    mv /usr/local/corex/lib64/python3/dist-packages/bin/vllm \
+       /usr/local/corex/lib64/python3/dist-packages/bin/vllm_real && \
+    ln -s /opt/vllm_wrapper.sh \
+          /usr/local/corex/lib64/python3/dist-packages/bin/vllm
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f48680a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,55 @@
+# bi100-tokenizer-patch
+
+基于 `harbor-contest.4pd.io/luopingyi/bi100/vllm:0.6.3` 的 tokenizer 自动修复镜像。
+
+## 问题背景
+
+部分模型的 `tokenizer_config.json` 存在以下问题,导致 vLLM 服务启动失败:
+
+| 错误 | 原因 |
+|---|---|
+| `ValueError: Tokenizer class TokenizersBackend does not exist` | `tokenizer_class` 不是 transformers 合法类名 |
+| `AttributeError: 'list' object has no attribute 'keys'` | `extra_special_tokens` 为 list 格式,transformers 要求 dict |
+
+## 修复方式
+
+构建时将镜像内的 `vllm` 二进制替换为同名 wrapper 脚本,原二进制重命名为 `vllm_real`。
+
+容器启动时 wrapper 自动检测 `tokenizer_config.json`:
+- 存在问题 → 将 tokenizer 文件复制到 `/tmp/fixed_tokenizer/` 并修复,追加 `--tokenizer /tmp/fixed_tokenizer` 参数后调用 `vllm_real`
+- 无问题 → 直接调用 `vllm_real`,行为与原镜像完全一致
+
+原始模型目录不做任何修改。
+
+## 使用方式
+
+**原始 docker run 命令只需替换镜像名,其他参数不变:**
+
+```bash
+# 原镜像
+harbor-contest.4pd.io/luopingyi/bi100/vllm:0.6.3
+
+# 替换为
+YOUR_REGISTRY/bi100-tokenizer-patch:TAG
+```
+
+示例:
+```bash
+docker run -dit --name CONTAINER_NAME \
+  -p HOST_PORT:8000 \
+  -v /lib/modules:/lib/modules -v /dev:/dev \
+  --device=/dev/iluvatar0:/dev/iluvatar0 \
+  -v /path/to/model:/model \
+  --entrypoint vllm \
+  YOUR_REGISTRY/bi100-tokenizer-patch:TAG serve /model --port 8000 --served-model-name llm \
+  --max-model-len 2048 --gpu-memory-utilization 0.9 \
+  --enforce-eager --trust-remote-code -tp 1
+```
+
+## 构建
+
+```bash
+docker build -t bi100-tokenizer-patch:latest .
+```
+
+CI 通过推送 `v*` tag 自动触发构建并推送镜像。
diff --git a/fix_tokenizer.py b/fix_tokenizer.py
new file mode 100644
index 0000000..42538f5
--- /dev/null
+++ b/fix_tokenizer.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""Detect and repair two known problems in a model's tokenizer_config.json.
+
+1. ``tokenizer_class`` names a class transformers does not provide
+   (for example ``TokenizersBackend``).
+2. ``extra_special_tokens`` is a list, while transformers expects a dict.
+
+When a repair is needed, the tokenizer files are copied to
+/tmp/fixed_tokenizer/, the copied config is patched, and that directory is
+printed on stdout. When nothing needs fixing, stdout stays empty. The
+original model directory is never modified.
+"""
+import contextlib
+import json
+import os
+import shutil
+import sys
+
+MODEL_DIR = sys.argv[1] if len(sys.argv) > 1 else os.environ.get("MODEL_DIR", "/model")
+OUT_DIR = "/tmp/fixed_tokenizer"
+
+# Tokenizer assets copied next to the patched config when they exist.
+TOKENIZER_FILES = (
+    "tokenizer.json",
+    "tokenizer_config.json",
+    "special_tokens_map.json",
+    "added_tokens.json",
+    "vocab.json",
+    "merges.txt",
+    "tokenizer.model",
+)
+
+
+def load_config(cfg_path):
+    """Return the parsed config dict, or None if it is missing or unusable."""
+    try:
+        with open(cfg_path, encoding="utf-8") as f:
+            cfg = json.load(f)
+    except FileNotFoundError:
+        return None
+    except (OSError, ValueError) as err:
+        # Leave the model untouched and let vLLM report the broken file.
+        print(f"[fix_tokenizer] cannot read {cfg_path}: {err}", file=sys.stderr)
+        return None
+    return cfg if isinstance(cfg, dict) else None
+
+
+def tokenizer_class_is_unknown(cfg):
+    """Return True when transformers cannot resolve cfg["tokenizer_class"]."""
+    name = cfg.get("tokenizer_class")
+    if not name or not isinstance(name, str):
+        return False
+    auto_map = cfg.get("auto_map")
+    if isinstance(auto_map, dict) and auto_map.get("AutoTokenizer"):
+        # The class ships with the model as remote code, so it is expected
+        # to be absent from the transformers namespace.
+        return False
+    try:
+        # stdout carries the result path; keep import-time output off it.
+        with contextlib.redirect_stdout(sys.stderr):
+            import transformers
+    except ImportError:
+        return False
+    return getattr(transformers, name, None) is None
+
+
+def pick_tokenizer_class(model_dir):
+    """Choose a replacement tokenizer class from the files in model_dir."""
+    files = set(os.listdir(model_dir))
+    if "tokenizer.json" in files:
+        return "PreTrainedTokenizerFast"
+    if "tokenizer.model" in files:
+        return "LlamaTokenizer"
+    if {"vocab.json", "merges.txt"} <= files:
+        return "GPT2TokenizerFast"
+    return "PreTrainedTokenizerFast"
+
+
+def special_tokens_to_dict(tokens):
+    """Convert a list of special tokens into the dict form transformers expects."""
+    result = {}
+    for token in tokens:
+        if isinstance(token, dict):
+            # Serialized AddedToken entries keep the text under "content".
+            token = token.get("content")
+        if isinstance(token, str) and token:
+            result[token] = token
+    return result
+
+
+def main():
+    """Repair the tokenizer config of MODEL_DIR and print the fixed directory."""
+    cfg = load_config(os.path.join(MODEL_DIR, "tokenizer_config.json"))
+    if cfg is None:
+        return
+
+    bad_tokenizer_class = tokenizer_class_is_unknown(cfg)
+    bad_extra_special_tokens = isinstance(cfg.get("extra_special_tokens"), list)
+    if not (bad_tokenizer_class or bad_extra_special_tokens):
+        return  # nothing to repair
+
+    # Start from an empty directory so files from an earlier run cannot leak in.
+    shutil.rmtree(OUT_DIR, ignore_errors=True)
+    os.makedirs(OUT_DIR, exist_ok=True)
+    for fname in TOKENIZER_FILES:
+        src = os.path.join(MODEL_DIR, fname)
+        if os.path.isfile(src):
+            shutil.copy(src, OUT_DIR)
+
+    if bad_tokenizer_class:
+        tokenizer_class = cfg["tokenizer_class"]
+        fixed_class = pick_tokenizer_class(MODEL_DIR)
+        cfg["tokenizer_class"] = fixed_class
+        print(
+            f"[fix_tokenizer] tokenizer_class: '{tokenizer_class}' → '{fixed_class}'",
+            file=sys.stderr,
+        )
+
+    if bad_extra_special_tokens:
+        orig_list = cfg["extra_special_tokens"]
+        cfg["extra_special_tokens"] = special_tokens_to_dict(orig_list)
+        print(
+            f"[fix_tokenizer] extra_special_tokens: list({len(orig_list)}) → dict",
+            file=sys.stderr,
+        )
+
+    fixed_cfg_path = os.path.join(OUT_DIR, "tokenizer_config.json")
+    with open(fixed_cfg_path, "w", encoding="utf-8") as f:
+        json.dump(cfg, f, indent=2, ensure_ascii=False)
+
+    print(OUT_DIR)  # captured by vllm_wrapper.sh
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vllm_wrapper.sh b/vllm_wrapper.sh
new file mode 100644
index 0000000..c050c47
--- /dev/null
+++ b/vllm_wrapper.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -e
+
+VLLM_REAL=/usr/local/corex/lib64/python3/dist-packages/bin/vllm_real
+
+# Only "serve MODEL" is intercepted; every other invocation is passed through.
+if [ "$1" = "serve" ] && [ -n "$2" ]; then
+    MODEL_DIR="$2"
+    shift 2
+
+    # The repair is best-effort: if the helper fails, start vLLM unchanged
+    # instead of letting `set -e` abort the container.
+    FIXED_DIR=$(python3 /opt/fix_tokenizer.py "$MODEL_DIR") || FIXED_DIR=""
+    if [ -d "$FIXED_DIR" ]; then
+        exec "$VLLM_REAL" serve "$MODEL_DIR" --tokenizer "$FIXED_DIR" "$@"
+    fi
+    exec "$VLLM_REAL" serve "$MODEL_DIR" "$@"
+fi
+
+exec "$VLLM_REAL" "$@"