commit a2a00f102efacb7544ee6c7e13622b16bdd75821
Author: 4paradigm <4paradigm@4paradigmdeMacBook-Pro.local>
Date:   Mon Jun 29 17:04:41 2026 +0800

    first commit

diff --git a/.gitea/workflows/docker-build-push.yml b/.gitea/workflows/docker-build-push.yml
new file mode 100644
index 0000000..9c5c285
--- /dev/null
+++ b/.gitea/workflows/docker-build-push.yml
@@ -0,0 +1,132 @@
+name: Docker Build and Push
+
+on:
+  push:
+    tags:
+      - "v*"
+
+jobs:
+  docker:
+    runs-on: amd64-ubuntu-24.04
+
+    steps:
+      - name: Clone repository
+        run: |
+          git clone "${{ gitea.server_url }}/${{ gitea.repository }}.git" .
+          git checkout "${{ gitea.ref_name }}"
+
+      - name: Set image metadata
+        run: |
+          IMAGE_NAME="$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')"
+          IMAGE="${DOCKER_REGISTRY}/${DOCKER_USERNAME}/${IMAGE_NAME}:${{ gitea.ref_name }}"
+
+          echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITEA_ENV"
+          echo "IMAGE=${IMAGE}" >> "$GITEA_ENV"
+
+      - name: Load and Validate Task Info
+        run: |
+          set -a
+          . .gitea/workflows/task_info.env
+          set +a
+
+          for name in FRAMEWORK GPU_TYPE TASK_TYPE; do
+            eval "value=\${${name}:-}"
+            if [ "$name" = "FRAMEWORK" ] && [ -z "$value" ]; then
+              echo "${name} is empty in .gitea/workflows/task_info.env"
+              exit 1
+            fi
+
+            echo "${name}=${value}" >> "$GITEA_ENV"
+          done
+
+      - name: Validate Image Verify Metadata
+        run: |
+          if [ -z "${FIXED_TOKEN:-}" ]; then
+            echo "FIXED_TOKEN is not configured on runner"
+            exit 1
+          fi
+
+          if ! response="$(curl --silent --show-error --location --get 'https://modelhub.org.cn/adminApi/image-verify/validate' \
+            --header "Xc-Token: ${FIXED_TOKEN}" \
+            --data-urlencode "gpuType=${GPU_TYPE:-}" \
+            --data-urlencode "taskType=${TASK_TYPE:-}")"; then
+            echo "failed to call image verify validate API"
+            exit 1
+          fi
+
+          VALIDATE_RESPONSE="$response" python3 - <<'PY'
+          import json
+          import os
+          import sys
+
+          raw = os.environ.get("VALIDATE_RESPONSE", "")
+          try:
+              body = json.loads(raw)
+          except json.JSONDecodeError:
+              print("image verify validate API returned invalid JSON")
+              print(raw)
+              sys.exit(1)
+
+          if body.get("code") == 0 and body.get("data") is True:
+              print("image verify metadata validation passed")
+              sys.exit(0)
+
+          message = body.get("message") or "unknown error"
+          print(f"image verify metadata validation failed: {message}")
+          print(raw)
+          sys.exit(1)
+          PY
+
+      - name: Login to Docker Registry
+        run: |
+          echo "$DOCKER_PASSWORD" | docker login "$DOCKER_REGISTRY" \
+            -u "$DOCKER_USERNAME" \
+            --password-stdin
+
+      - name: Build Docker Image
+        run: |
+          docker build -t "$IMAGE" .
+
+      - name: Push Docker Image
+        run: |
+          for attempt in 1 2 3; do
+            echo "Starting docker push attempt ${attempt}/3 for ${IMAGE}"
+            docker push "$IMAGE" &
+            PUSH_PID=$!
+
+            while kill -0 "$PUSH_PID" 2>/dev/null; do
+              echo "docker push is still running at $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
+              sleep 60
+            done
+
+            if wait "$PUSH_PID"; then
+              echo "docker push completed successfully"
+              exit 0
+            fi
+
+            echo "docker push failed on attempt ${attempt}/3"
+            sleep 30
+          done
+
+          echo "docker push failed after 3 attempts"
+          exit 1
+
+      - name: Notify Image Verify
+        run: |
+          if [ -z "${FIXED_TOKEN:-}" ]; then
+            echo "FIXED_TOKEN is not configured on runner"
+            exit 1
+          fi
+
+          curl --silent --show-error --fail-with-body --location --request POST 'https://modelhub.org.cn/adminApi/image-verify' \
+            --header "Xc-Token: ${FIXED_TOKEN}" \
+            --header 'Content-Type: application/json' \
+            --data-raw "{
+              \"framework\": \"${FRAMEWORK}\",
+              \"gpuType\": \"${GPU_TYPE}\",
+              \"imageUrl\": \"${IMAGE}\",
+              \"taskType\": \"${TASK_TYPE}\",
+              \"createBy\": \"${{ gitea.actor }}\",
+              \"repoUrl\": \"${{ gitea.server_url }}/${{ gitea.repository }}\",
+              \"tag\": \"${{ gitea.ref_name }}\"
+            }"
diff --git a/.gitea/workflows/task_info.env b/.gitea/workflows/task_info.env
new file mode 100644
index 0000000..b1e8801
--- /dev/null
+++ b/.gitea/workflows/task_info.env
@@ -0,0 +1,3 @@
+FRAMEWORK=vllm_tokenizerz_patch
+GPU_TYPE=Kunlun_P800
+TASK_TYPE=text-generation
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..f7c26ae
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,9 @@
+FROM harbor-contest.4pd.io/sunjichen/xc-llm-kunlun:latest
+
+COPY entrypoint.sh /opt/entrypoint.sh
+COPY fix_tokenizer.py /opt/fix_tokenizer.py
+COPY detect_tokenizer.py /opt/detect_tokenizer.py
+
+RUN chmod +x /opt/entrypoint.sh
+
+ENTRYPOINT ["/opt/entrypoint.sh"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5bcaa04
--- /dev/null
+++ b/README.md
@@ -0,0 +1,50 @@
+# xc-llm-kunlun-fix-tokenizer
+
+基于 `harbor-contest.4pd.io/sunjichen/xc-llm-kunlun:latest` 的 tokenizer 自动修复镜像,解决部分模型 `tokenizer_config.json` 中 `tokenizer_class` 为 `TokenizersBackend` 等非标准类名导致 vLLM 启动失败的问题。
+
+## 问题背景
+
+某些经过训练/合并的模型,其 `tokenizer_config.json` 中存在以下问题:
+- `tokenizer_class` 被设置为 `TokenizersBackend`、`TiktokenTokenizer` 等 transformers 不识别的类名
+- `extra_special_tokens` 字段为 list 格式,而 transformers 期望 dict 格式
+
+这会导致 `AutoTokenizer.from_pretrained` 抛出 `ValueError`,vLLM 服务无法启动。
+
+## 修复方式
+
+容器启动时自动检测 `tokenizer_config.json`,若存在问题则将 tokenizer 文件复制到 `/tmp/fixed_tokenizer/` 并修复配置,再以 `--tokenizer /tmp/fixed_tokenizer` 参数启动 vLLM。原始模型目录不做任何修改。
+
+## 使用方式
+
+将原 docker run 命令中的镜像名替换为本镜像,并去掉 `--entrypoint vllm`,改为直接传参:
+
+```bash
+docker run -dit --name xc-llm-kunlun-fix-tokenizer \
+  -p 44825:8000 \
+  --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+  --tmpfs /dev/shm:rw,nosuid,nodev,exec,size=64g \
+  --ulimit memlock=-1 \
+  --device=/dev/xpu0:/dev/xpu0 \
+  --device=/dev/xpuctrl:/dev/xpuctrl \
+  -v /path/to/model:/model \
+  xc-llm-kunlun-fix-tokenizer:latest \
+  /model --port 8000 --served-model-name llm \
+  --max-model-len 2048 --gpu-memory-utilization 0.9 \
+  --enforce-eager --trust-remote-code -tp 1
+```
+
+## 环境变量
+
+| 变量 | 默认值 | 说明 |
+|---|---|---|
+| `AUTO_FIX_TOKENIZER` | `auto` | `auto`:自动检测;`1`/`true`:强制修复;其他值:跳过修复 |
+| `MODEL_DIR` | `/model` | 模型路径(通常通过命令行第一个参数传入) |
+| `FIX_TOKENIZER_DIR` | `/tmp/fixed_tokenizer` | 修复后 tokenizer 文件的临时目录 |
+
+## 构建
+
+```bash
+docker build -t xc-llm-kunlun-fix-tokenizer:latest .
+```
+
+CI 通过推送 `v*` tag 自动触发构建并推送镜像。
diff --git a/detect_tokenizer.py b/detect_tokenizer.py
new file mode 100644
index 0000000..c0e7b3e
--- /dev/null
+++ b/detect_tokenizer.py
@@ -0,0 +1,35 @@
+import os
+import json
+
+def detect(model_dir):
+    """Classify the tokenizer files present in ``model_dir``.
+
+    Returns a ``(kind, tokenizer_class)`` tuple. ``kind`` is ``"fast"`` when
+    ``tokenizer.json`` exists, ``"sentencepiece"`` when ``tokenizer.model``
+    exists, ``"bpe"`` when both ``vocab.json`` and ``merges.txt`` exist, and
+    ``"unknown"`` otherwise. ``tokenizer_class`` is the value declared in
+    ``tokenizer_config.json``, or ``""`` when the file or the key is missing.
+    """
+    cfg_path = os.path.join(model_dir, "tokenizer_config.json")
+
+    if os.path.exists(cfg_path):
+        # Read as UTF-8 explicitly so parsing does not depend on the locale.
+        with open(cfg_path, encoding="utf-8") as f:
+            cfg = json.load(f)
+        # A JSON null is normalised to "" like a missing key.
+        cls = cfg.get("tokenizer_class") or ""
+    else:
+        cls = ""
+
+    files = os.listdir(model_dir)
+
+    if "tokenizer.json" in files:
+        return "fast", cls
+
+    if "tokenizer.model" in files:
+        return "sentencepiece", cls
+
+    if "vocab.json" in files and "merges.txt" in files:
+        return "bpe", cls
+
+    return "unknown", cls
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100644
index 0000000..07308d3
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+set -e
+
+# Exported so fix_tokenizer.py patches the directory that vllm will serve.
+export MODEL_DIR="${1:-${MODEL_DIR:-/model}}"
+shift || true
+
+# Exported so fix_tokenizer.py writes where --tokenizer points below.
+export FIX_TOKENIZER_DIR="${FIX_TOKENIZER_DIR:-/tmp/fixed_tokenizer}"
+AUTO_FIX=${AUTO_FIX_TOKENIZER:-auto}
+
+echo "[entrypoint] model dir: $MODEL_DIR"
+
+NEED_FIX=0
+
+if [ "$AUTO_FIX" = "1" ] || [ "$AUTO_FIX" = "true" ]; then
+    NEED_FIX=1
+elif [ "$AUTO_FIX" = "auto" ]; then
+    if [ -f "$MODEL_DIR/tokenizer_config.json" ]; then
+        if grep -q "TokenizersBackend\|TiktokenTokenizer" "$MODEL_DIR/tokenizer_config.json"; then
+            NEED_FIX=1
+        fi
+        # Detect whether extra_special_tokens uses the list format.
+        if grep -q '"extra_special_tokens":\s*\[' "$MODEL_DIR/tokenizer_config.json"; then
+            NEED_FIX=1
+        fi
+    fi
+fi
+
+# An array keeps the option and its value as two intact words.
+TOKENIZER_ARGS=()
+if [ $NEED_FIX -eq 1 ]; then
+    echo "[entrypoint] fixing tokenizer..."
+    python3 /opt/fix_tokenizer.py
+    TOKENIZER_ARGS=(--tokenizer "$FIX_TOKENIZER_DIR")
+else
+    echo "[entrypoint] tokenizer OK, skip fix"
+fi
+
+echo "[entrypoint] starting vllm..."
+
+exec vllm serve "$MODEL_DIR" "${TOKENIZER_ARGS[@]}" "$@"
diff --git a/fix_tokenizer.py b/fix_tokenizer.py
new file mode 100644
index 0000000..9556ea6
--- /dev/null
+++ b/fix_tokenizer.py
@@ -0,0 +1,78 @@
+"""Copy a model's tokenizer files to a scratch directory and patch the config.
+
+Reads ``MODEL_DIR`` and ``FIX_TOKENIZER_DIR`` from the environment, copies the
+tokenizer files that exist, rewrites ``tokenizer_class`` according to the
+detected tokenizer kind and converts a list-valued ``extra_special_tokens``
+into a dict. The model directory itself is never written to.
+"""
+import os
+import shutil
+import json
+from detect_tokenizer import detect
+
+MODEL_DIR = os.environ.get("MODEL_DIR", "/model")
+OUT_DIR = os.environ.get("FIX_TOKENIZER_DIR", "/tmp/fixed_tokenizer")
+
+os.makedirs(OUT_DIR, exist_ok=True)
+
+def copy_if_exists(name):
+    """Copy ``name`` from MODEL_DIR into OUT_DIR when it exists."""
+    src = os.path.join(MODEL_DIR, name)
+    if os.path.exists(src):
+        shutil.copy(src, OUT_DIR)
+
+# Copy every file that may be relevant to the tokenizer.
+for f in [
+    "tokenizer.json",
+    "tokenizer_config.json",
+    "special_tokens_map.json",
+    "vocab.json",
+    "merges.txt",
+    "tokenizer.model",
+]:
+    copy_if_exists(f)
+
+typ, orig_cls = detect(MODEL_DIR)
+
+cfg_path = os.path.join(OUT_DIR, "tokenizer_config.json")
+
+if os.path.exists(cfg_path):
+    # Read as UTF-8 explicitly so parsing does not depend on the locale.
+    with open(cfg_path, encoding="utf-8") as f:
+        cfg = json.load(f)
+else:
+    cfg = {}
+
+# ===== Automatic repair strategy =====
+if typ == "fast":
+    cfg["tokenizer_class"] = "PreTrainedTokenizerFast"
+
+elif typ == "sentencepiece":
+    cfg["tokenizer_class"] = "LlamaTokenizer"
+
+elif typ == "bpe":
+    cfg["tokenizer_class"] = "GPT2TokenizerFast"
+
+else:
+    cfg["tokenizer_class"] = "PreTrainedTokenizerFast"
+
+# Report when a known-bad class name was replaced.
+bad_classes = [
+    "TokenizersBackend",
+    "TiktokenTokenizer",
+]
+
+if orig_cls in bad_classes:
+    print(f"[fix] override bad tokenizer_class: {orig_cls} → {cfg['tokenizer_class']}")
+
+# Convert extra_special_tokens from list to dict format.
+if "extra_special_tokens" in cfg and isinstance(cfg["extra_special_tokens"], list):
+    orig_list = cfg["extra_special_tokens"]
+    cfg["extra_special_tokens"] = {token: token for token in orig_list}
+    print(f"[fix] converted extra_special_tokens from list ({len(orig_list)} items) to dict format")
+
+# Write the patched config back to the scratch copy.
+with open(cfg_path, "w", encoding="utf-8") as f:
+    json.dump(cfg, f)
+
+print(f"[fix_tokenizer] done → {OUT_DIR}")