Initial commit: bi100 tokenizer patch
All checks were successful
Docker Build and Push / docker (push) Successful in 1m14s

Add fix_tokenizer.py, vllm_wrapper.sh, Dockerfile, and README for
automatic tokenizer_config.json repair on Iluvatar BI-100 vLLM images.
This commit is contained in:
4paradigm
2026-07-01 17:59:17 +08:00
commit cfbb595127
6 changed files with 312 additions and 0 deletions

View File

@@ -0,0 +1,132 @@
# Gitea Actions workflow: on every pushed "v*" tag, validate the task
# metadata, build the Docker image from the repository root, push it to the
# registry and register the pushed image with the image-verify service.
#
# Variables read from the job environment but not defined in this file:
#   DOCKER_REGISTRY, DOCKER_USERNAME, DOCKER_PASSWORD, FIXED_TOKEN
# (presumably configured on the runner - confirm with the runner setup).
name: Docker Build and Push

on:
  push:
    tags:
      - "v*"

jobs:
  docker:
    runs-on: amd64-ubuntu-24.04
    steps:
      - name: Clone repository
        run: |
          git clone "${{ gitea.server_url }}/${{ gitea.repository }}.git" .
          git checkout "${{ gitea.ref_name }}"

      - name: Set image metadata
        run: |
          # Image name = repository path, lower-cased, with '_' replaced by '-'.
          IMAGE_NAME="$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')"
          IMAGE="${DOCKER_REGISTRY}/${DOCKER_USERNAME}/${IMAGE_NAME}:${{ gitea.ref_name }}"
          # Export both values to the following steps.
          echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITEA_ENV"
          echo "IMAGE=${IMAGE}" >> "$GITEA_ENV"

      - name: Load and Validate Task Info
        run: |
          # 'set -a' exports every variable assigned while the file is sourced.
          set -a
          . .gitea/workflows/task_info.env
          set +a
          for name in FRAMEWORK GPU_TYPE TASK_TYPE; do
            # Indirect lookup of the variable called $name. The names come
            # from the fixed list above, and the value is only expanded,
            # never executed.
            eval "value=\${${name}:-}"
            # Only FRAMEWORK is required to be non-empty here; GPU_TYPE and
            # TASK_TYPE are sent to the validate API in the next step.
            if [ "$name" = "FRAMEWORK" ] && [ -z "$value" ]; then
              echo "${name} is empty in .gitea/workflows/task_info.env"
              exit 1
            fi
            echo "${name}=${value}" >> "$GITEA_ENV"
          done

      - name: Validate Image Verify Metadata
        run: |
          if [ -z "${FIXED_TOKEN:-}" ]; then
            echo "FIXED_TOKEN is not configured on runner"
            exit 1
          fi
          if ! response="$(curl --silent --show-error --location --get 'https://modelhub.org.cn/adminApi/image-verify/validate' \
            --header "Xc-Token: ${FIXED_TOKEN}" \
            --data-urlencode "gpuType=${GPU_TYPE:-}" \
            --data-urlencode "taskType=${TASK_TYPE:-}")"; then
            echo "failed to call image verify validate API"
            exit 1
          fi
          # The response is passed through the environment and checked by an
          # inline Python script: it passes only when code == 0 and
          # data is true.
          VALIDATE_RESPONSE="$response" python3 - <<'PY'
          import json
          import os
          import sys

          raw = os.environ.get("VALIDATE_RESPONSE", "")
          try:
              body = json.loads(raw)
          except json.JSONDecodeError:
              print("image verify validate API returned invalid JSON")
              print(raw)
              sys.exit(1)

          if body.get("code") == 0 and body.get("data") is True:
              print("image verify metadata validation passed")
              sys.exit(0)

          message = body.get("message") or "unknown error"
          print(f"image verify metadata validation failed: {message}")
          print(raw)
          sys.exit(1)
          PY

      - name: Login to Docker Registry
        run: |
          # printf instead of echo: echo may interpret backslashes or a
          # leading "-n" in the password, depending on the shell.
          printf '%s\n' "$DOCKER_PASSWORD" | docker login "$DOCKER_REGISTRY" \
            -u "$DOCKER_USERNAME" \
            --password-stdin

      - name: Build Docker Image
        run: |
          docker build -t "$IMAGE" .

      - name: Push Docker Image
        run: |
          # Up to three attempts. The push runs in the background so that a
          # heartbeat line can be printed every 60 seconds while it is
          # in progress.
          for attempt in 1 2 3; do
            echo "Starting docker push attempt ${attempt}/3 for ${IMAGE}"
            docker push "$IMAGE" &
            PUSH_PID=$!
            while kill -0 "$PUSH_PID" 2>/dev/null; do
              echo "docker push is still running at $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
              sleep 60
            done
            # 'wait' yields the exit status of the finished push.
            if wait "$PUSH_PID"; then
              echo "docker push completed successfully"
              exit 0
            fi
            echo "docker push failed on attempt ${attempt}/3"
            # Back off before the next attempt; there is nothing to wait
            # for after the last one.
            if [ "$attempt" -lt 3 ]; then
              sleep 30
            fi
          done
          echo "docker push failed after 3 attempts"
          exit 1

      - name: Notify Image Verify
        run: |
          if [ -z "${FIXED_TOKEN:-}" ]; then
            echo "FIXED_TOKEN is not configured on runner"
            exit 1
          fi
          # Same host and path prefix as the validate call above (single
          # slash), so the POST is not exposed to a path-normalising redirect.
          curl --silent --show-error --fail-with-body --location --request POST 'https://modelhub.org.cn/adminApi/image-verify' \
            --header "Xc-Token: ${FIXED_TOKEN}" \
            --header 'Content-Type: application/json' \
            --data-raw "{
              \"framework\": \"${FRAMEWORK}\",
              \"gpuType\": \"${GPU_TYPE}\",
              \"imageUrl\": \"${IMAGE}\",
              \"taskType\": \"${TASK_TYPE}\",
              \"createBy\": \"${{ gitea.actor }}\",
              \"repoUrl\": \"${{ gitea.server_url }}/${{ gitea.repository }}\",
              \"tag\": \"${{ gitea.ref_name }}\"
            }"

View File

@@ -0,0 +1,3 @@
# Task metadata sourced by the "Load and Validate Task Info" workflow step
# and exported to the later steps.
# FRAMEWORK must be non-empty, otherwise that step fails the job.
FRAMEWORK=vllm-patch-tokenizer
# GPU_TYPE and TASK_TYPE are sent to the image-verify validate API as the
# gpuType and taskType query parameters.
GPU_TYPE=Iluvatar_bi-100
TASK_TYPE=text-generation

10
Dockerfile Normal file
View File

@@ -0,0 +1,10 @@
# Base image whose `vllm` executable is replaced by the wrapper below.
FROM harbor-contest.4pd.io/luopingyi/bi100/vllm:0.6.3

# Wrapper script and the tokenizer repair helper it invokes.
COPY vllm_wrapper.sh /opt/vllm_wrapper.sh
COPY fix_tokenizer.py /opt/fix_tokenizer.py

# Rename the original executable to vllm_real and put a symlink to the
# wrapper in its place, so existing `vllm ...` invocations reach the wrapper.
RUN bin_dir=/usr/local/corex/lib64/python3/dist-packages/bin \
    && chmod +x /opt/vllm_wrapper.sh \
    && mv "${bin_dir}/vllm" "${bin_dir}/vllm_real" \
    && ln -s /opt/vllm_wrapper.sh "${bin_dir}/vllm"

55
README.md Normal file
View File

@@ -0,0 +1,55 @@
# bi100-tokenizer-patch
基于 `harbor-contest.4pd.io/luopingyi/bi100/vllm:0.6.3` 的 tokenizer 自动修复镜像。
## 问题背景
部分模型的 `tokenizer_config.json` 存在以下问题,导致 vLLM 服务启动失败:
| 错误 | 原因 |
|---|---|
| `ValueError: Tokenizer class TokenizersBackend does not exist` | `tokenizer_class` 不是 transformers 合法类名 |
| `AttributeError: 'list' object has no attribute 'keys'` | `extra_special_tokens` 为 list 格式，transformers 要求 dict |
## 修复方式
构建时将镜像内的 `vllm` 二进制替换为同名 wrapper 脚本，原二进制重命名为 `vllm_real`。
容器启动时 wrapper 自动检测 `tokenizer_config.json`：
- 存在问题 → 将 tokenizer 文件复制到 `/tmp/fixed_tokenizer/` 并修复,追加 `--tokenizer /tmp/fixed_tokenizer` 参数后调用 `vllm_real`
- 无问题 → 直接调用 `vllm_real`,行为与原镜像完全一致
原始模型目录不做任何修改。
## 使用方式
**原始 docker run 命令只需替换镜像名,其他参数不变:**
```bash
# 原镜像
harbor-contest.4pd.io/luopingyi/bi100/vllm:0.6.3
# 替换为
<this-image>
```
示例:
```bash
docker run -dit --name <container_name> \
-p <port>:8000 \
-v /lib/modules:/lib/modules -v /dev:/dev \
--device=/dev/iluvatar0:/dev/iluvatar0 \
-v /path/to/model:/model \
--entrypoint vllm <this-image> \
serve /model --port 8000 --served-model-name llm \
--max-model-len 2048 --gpu-memory-utilization 0.9 \
--enforce-eager --trust-remote-code -tp 1
```
## 构建
```bash
docker build -t bi100-tokenizer-patch:latest .
```
CI 通过推送 `v*` tag 自动触发构建并推送镜像。

95
fix_tokenizer.py Normal file
View File

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""
Detect and repair two known problems in a model's tokenizer_config.json:

  1. tokenizer_class names a class that does not exist in transformers
     (for example "TokenizersBackend").
  2. extra_special_tokens is a list, while transformers expects a dict.

When a problem is found, the tokenizer files are copied to
/tmp/fixed_tokenizer/ and a repaired tokenizer_config.json is written there;
the model directory itself is never modified.  The path of the repaired
directory is then printed on stdout.  When nothing needs fixing, stdout stays
empty.  All diagnostics go to stderr so that stdout carries only the path.
"""
import os
import sys
import json
import shutil

MODEL_DIR = sys.argv[1] if len(sys.argv) > 1 else os.environ.get("MODEL_DIR", "/model")
OUT_DIR = "/tmp/fixed_tokenizer"

# Tokenizer files copied next to the repaired config when they exist.
TOKENIZER_FILES = (
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "added_tokens.json",
    "vocab.json",
    "merges.txt",
    "tokenizer.model",
)


def _extra_tokens_to_dict(tokens):
    """Convert a list of extra special tokens into a {token: token} dict.

    Plain strings map to themselves.  Dict entries are reduced to their
    "content" string when they have one.  Anything else is skipped, because
    it cannot be used as a token name.
    """
    mapping = {}
    for token in tokens:
        if isinstance(token, dict):
            token = token.get("content")
        if isinstance(token, str):
            mapping[token] = token
    return mapping


def _pick_tokenizer_class(model_dir):
    """Choose a replacement tokenizer class from the files in model_dir."""
    files = os.listdir(model_dir)
    if "tokenizer.json" in files:
        return "PreTrainedTokenizerFast"
    if "tokenizer.model" in files:
        return "LlamaTokenizer"
    if "vocab.json" in files and "merges.txt" in files:
        return "GPT2TokenizerFast"
    return "PreTrainedTokenizerFast"


def main(model_dir=None, out_dir=None):
    """Check model_dir/tokenizer_config.json and write a repaired copy.

    Args:
        model_dir: directory holding the tokenizer files; defaults to
            MODEL_DIR (first command-line argument, then $MODEL_DIR,
            then "/model").
        out_dir: directory that receives the repaired files; defaults to
            OUT_DIR.

    Prints out_dir on stdout only when a repair was made.  Returns None.
    """
    model_dir = MODEL_DIR if model_dir is None else model_dir
    out_dir = OUT_DIR if out_dir is None else out_dir

    cfg_path = os.path.join(model_dir, "tokenizer_config.json")
    if not os.path.exists(cfg_path):
        return

    # JSON is UTF-8; do not depend on the container's locale.
    try:
        with open(cfg_path, encoding="utf-8") as f:
            cfg = json.load(f)
    except (OSError, ValueError) as exc:
        # Unreadable or invalid config: nothing this script can repair.
        print(f"[fix_tokenizer] cannot read {cfg_path}: {exc}", file=sys.stderr)
        return
    if not isinstance(cfg, dict):
        print(f"[fix_tokenizer] {cfg_path} is not a JSON object", file=sys.stderr)
        return

    fixes = []

    # --- Check 1: does tokenizer_class exist in transformers? ---
    tokenizer_class = cfg.get("tokenizer_class", "")
    bad_tokenizer_class = False
    if tokenizer_class and isinstance(tokenizer_class, str):
        try:
            import transformers
        except ImportError as exc:
            # Without transformers the class name cannot be checked.
            print(
                f"[fix_tokenizer] transformers unavailable, skipping tokenizer_class check: {exc}",
                file=sys.stderr,
            )
        else:
            if getattr(transformers, tokenizer_class, None) is None:
                bad_tokenizer_class = True
                fixes.append(f"tokenizer_class '{tokenizer_class}' not found in transformers")

    # --- Check 2: is extra_special_tokens a list? ---
    bad_extra_special_tokens = isinstance(cfg.get("extra_special_tokens"), list)
    if bad_extra_special_tokens:
        fixes.append("extra_special_tokens is a list, expected dict")

    if not fixes:
        return  # nothing to repair

    # Copy the tokenizer files into the output directory.
    os.makedirs(out_dir, exist_ok=True)
    for fname in TOKENIZER_FILES:
        src = os.path.join(model_dir, fname)
        if os.path.exists(src):
            shutil.copy(src, out_dir)

    # --- Fix 1: replace tokenizer_class ---
    if bad_tokenizer_class:
        fixed_class = _pick_tokenizer_class(model_dir)
        cfg["tokenizer_class"] = fixed_class
        print(
            f"[fix_tokenizer] tokenizer_class: '{tokenizer_class}' → '{fixed_class}'",
            file=sys.stderr,
        )

    # --- Fix 2: extra_special_tokens list -> dict ---
    if bad_extra_special_tokens:
        orig_list = cfg["extra_special_tokens"]
        cfg["extra_special_tokens"] = _extra_tokens_to_dict(orig_list)
        print(
            f"[fix_tokenizer] extra_special_tokens: list({len(orig_list)}) → dict",
            file=sys.stderr,
        )

    # Overwrite the copied config with the repaired one.
    with open(os.path.join(out_dir, "tokenizer_config.json"), "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=2)

    print(out_dir)  # the repaired directory path, captured by vllm_wrapper.sh


if __name__ == "__main__":
    main()

17
vllm_wrapper.sh Normal file
View File

@@ -0,0 +1,17 @@
#!/bin/bash
#
# Drop-in replacement for the image's `vllm` executable.
#
# For "vllm serve <model_dir> ..." it first runs /opt/fix_tokenizer.py on the
# model directory. When that script prints a directory path, the real binary
# is started with "--tokenizer <that directory>" inserted before the caller's
# remaining arguments. Every other invocation is forwarded unchanged.
set -e

readonly VLLM_REAL=/usr/local/corex/lib64/python3/dist-packages/bin/vllm_real

# Only the "serve <model_dir>" subcommand is intercepted; all other
# subcommands are passed straight through.
if [ "$1" = "serve" ] && [ -n "$2" ]; then
  MODEL_DIR="$2"
  shift 2

  # The tokenizer check is best effort: if the helper fails, start vLLM
  # without the patch instead of letting `set -e` abort the wrapper.
  if ! FIXED_DIR=$(python3 /opt/fix_tokenizer.py "$MODEL_DIR"); then
    echo "[vllm_wrapper] fix_tokenizer.py failed; starting vllm without tokenizer fix" >&2
    FIXED_DIR=""
  fi

  # Use the helper's output only when it names an existing directory.
  if [ -n "$FIXED_DIR" ] && [ -d "$FIXED_DIR" ]; then
    exec "$VLLM_REAL" serve "$MODEL_DIR" --tokenizer "$FIXED_DIR" "$@"
  else
    exec "$VLLM_REAL" serve "$MODEL_DIR" "$@"
  fi
fi

exec "$VLLM_REAL" "$@"