first commit
This commit is contained in:
132
.gitea/docker-build-push.yml
Normal file
132
.gitea/docker-build-push.yml
Normal file
@@ -0,0 +1,132 @@
|
||||
# Builds the repository's Docker image when a `v*` tag is pushed, pushes it to
# the registry, and registers it with the image-verify API.
#
# The steps read DOCKER_REGISTRY, DOCKER_USERNAME, DOCKER_PASSWORD and
# FIXED_TOKEN from the job environment; none of them is defined in this file.
# NOTE(review): presumably they are configured on the runner -- confirm.
name: Docker Build and Push

on:
  push:
    tags:
      - "v*"

jobs:
  docker:
    runs-on: amd64-ubuntu-24.04

    steps:
      - name: Clone repository
        run: |
          git clone "${{ gitea.server_url }}/${{ gitea.repository }}.git" .
          git checkout "${{ gitea.ref_name }}"

      - name: Set image metadata
        run: |
          # Lower-case the repository path and swap '_' for '-' before using
          # it as the image name.
          IMAGE_NAME="$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')"
          IMAGE="${DOCKER_REGISTRY}/${DOCKER_USERNAME}/${IMAGE_NAME}:${{ gitea.ref_name }}"

          echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITEA_ENV"
          echo "IMAGE=${IMAGE}" >> "$GITEA_ENV"

      - name: Load and Validate Task Info
        run: |
          # The env file is accepted next to the workflow or directly under
          # .gitea/, so the step works with either repository layout.
          TASK_INFO_FILE=""
          for candidate in .gitea/workflows/task_info.env .gitea/task_info.env; do
            if [ -f "$candidate" ]; then
              TASK_INFO_FILE="$candidate"
              break
            fi
          done

          if [ -z "$TASK_INFO_FILE" ]; then
            echo "task_info.env not found in .gitea/workflows/ or .gitea/"
            exit 1
          fi

          # set -a exports every variable assigned while the file is sourced.
          set -a
          . "$TASK_INFO_FILE"
          set +a

          for name in FRAMEWORK GPU_TYPE TASK_TYPE; do
            # Indirect lookup of the variable called $name; names come from
            # the fixed list above, never from external input.
            eval "value=\${${name}:-}"
            # Only FRAMEWORK is mandatory; the others may be empty.
            if [ "$name" = "FRAMEWORK" ] && [ -z "$value" ]; then
              echo "${name} is empty in ${TASK_INFO_FILE}"
              exit 1
            fi

            echo "${name}=${value}" >> "$GITEA_ENV"
          done

      - name: Validate Image Verify Metadata
        run: |
          if [ -z "${FIXED_TOKEN:-}" ]; then
            echo "FIXED_TOKEN is not configured on runner"
            exit 1
          fi

          if ! response="$(curl --silent --show-error --location --get 'https://modelhub.org.cn/adminApi/image-verify/validate' \
            --header "Xc-Token: ${FIXED_TOKEN}" \
            --data-urlencode "gpuType=${GPU_TYPE:-}" \
            --data-urlencode "taskType=${TASK_TYPE:-}")"; then
            echo "failed to call image verify validate API"
            exit 1
          fi

          # The response is handed over through the environment so the quoted
          # heredoc needs no shell interpolation.
          VALIDATE_RESPONSE="$response" python3 - <<'PY'
          import json
          import os
          import sys

          raw = os.environ.get("VALIDATE_RESPONSE", "")
          try:
              body = json.loads(raw)
          except json.JSONDecodeError:
              print("image verify validate API returned invalid JSON")
              print(raw)
              sys.exit(1)

          # Success requires both code == 0 and data == true.
          if body.get("code") == 0 and body.get("data") is True:
              print("image verify metadata validation passed")
              sys.exit(0)

          message = body.get("message") or "unknown error"
          print(f"image verify metadata validation failed: {message}")
          print(raw)
          sys.exit(1)
          PY

      - name: Login to Docker Registry
        run: |
          echo "$DOCKER_PASSWORD" | docker login "$DOCKER_REGISTRY" \
            -u "$DOCKER_USERNAME" \
            --password-stdin

      - name: Build Docker Image
        run: |
          docker build -t "$IMAGE" .

      - name: Push Docker Image
        run: |
          # Up to three attempts; the push runs in the background so a
          # progress line can be printed every minute while it is alive.
          for attempt in 1 2 3; do
            echo "Starting docker push attempt ${attempt}/3 for ${IMAGE}"
            docker push "$IMAGE" &
            PUSH_PID=$!

            while kill -0 "$PUSH_PID" 2>/dev/null; do
              echo "docker push is still running at $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
              sleep 60
            done

            # wait returns the exit status of the finished push.
            if wait "$PUSH_PID"; then
              echo "docker push completed successfully"
              exit 0
            fi

            echo "docker push failed on attempt ${attempt}/3"
            sleep 30
          done

          echo "docker push failed after 3 attempts"
          exit 1

      - name: Notify Image Verify
        run: |
          if [ -z "${FIXED_TOKEN:-}" ]; then
            echo "FIXED_TOKEN is not configured on runner"
            exit 1
          fi

          # Single-slash path, matching the validate endpoint above. With
          # --location, a redirect answered to a POST may be retried as a GET
          # without the JSON body, so the URL must not rely on normalization.
          curl --silent --show-error --fail-with-body --location --request POST 'https://modelhub.org.cn/adminApi/image-verify' \
            --header "Xc-Token: ${FIXED_TOKEN}" \
            --header 'Content-Type: application/json' \
            --data-raw "{
              \"framework\": \"${FRAMEWORK}\",
              \"gpuType\": \"${GPU_TYPE}\",
              \"imageUrl\": \"${IMAGE}\",
              \"taskType\": \"${TASK_TYPE}\",
              \"createBy\": \"${{ gitea.actor }}\",
              \"repoUrl\": \"${{ gitea.server_url }}/${{ gitea.repository }}\",
              \"tag\": \"${{ gitea.ref_name }}\"
            }"
|
||||
3
.gitea/task_info.env
Normal file
3
.gitea/task_info.env
Normal file
@@ -0,0 +1,3 @@
|
||||
FRAMEWORK=vllm_tokenizerz_patch
|
||||
GPU_TYPE=Kunlun_P800
|
||||
TASK_TYPE=text-generation
|
||||
9
Dockerfile
Normal file
9
Dockerfile
Normal file
@@ -0,0 +1,9 @@
|
||||
# Adds the tokenizer auto-fix entrypoint on top of the Kunlun vLLM base image.
FROM harbor-contest.4pd.io/sunjichen/xc-llm-kunlun:latest

# All three scripts go to /opt: the entrypoint runs /opt/fix_tokenizer.py,
# which imports detect_tokenizer from its own directory.
COPY entrypoint.sh fix_tokenizer.py detect_tokenizer.py /opt/

RUN chmod +x /opt/entrypoint.sh

ENTRYPOINT ["/opt/entrypoint.sh"]
|
||||
50
README.md
Normal file
50
README.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# xc-llm-kunlun-fix-tokenizer
|
||||
|
||||
基于 `harbor-contest.4pd.io/sunjichen/xc-llm-kunlun:latest` 的 tokenizer 自动修复镜像,解决部分模型 `tokenizer_config.json` 中 `tokenizer_class` 为 `TokenizersBackend` 等非标准类名导致 vLLM 启动失败的问题。
|
||||
|
||||
## 问题背景
|
||||
|
||||
某些经过训练/合并的模型,其 `tokenizer_config.json` 中存在以下问题:
|
||||
- `tokenizer_class` 被设置为 `TokenizersBackend`、`TiktokenTokenizer` 等 transformers 不识别的类名
|
||||
- `extra_special_tokens` 字段为 list 格式,而 transformers 期望 dict 格式
|
||||
|
||||
这会导致 `AutoTokenizer.from_pretrained` 抛出 `ValueError`,vLLM 服务无法启动。
|
||||
|
||||
## 修复方式
|
||||
|
||||
容器启动时自动检测 `tokenizer_config.json`,若存在问题则将 tokenizer 文件复制到 `/tmp/fixed_tokenizer/` 并修复配置,再以 `--tokenizer /tmp/fixed_tokenizer` 参数启动 vLLM。原始模型目录不做任何修改。
|
||||
|
||||
## 使用方式
|
||||
|
||||
将原 docker run 命令中的镜像名替换为本镜像,并去掉 `--entrypoint vllm`,改为直接传参:
|
||||
|
||||
```bash
|
||||
docker run -dit --name <container_name> \
|
||||
-p 44825:8000 \
|
||||
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
|
||||
--tmpfs /dev/shm:rw,nosuid,nodev,exec,size=64g \
|
||||
--ulimit memlock=-1 \
|
||||
--device=/dev/xpu0:/dev/xpu0 \
|
||||
--device=/dev/xpuctrl:/dev/xpuctrl \
|
||||
-v /path/to/model:/model \
|
||||
<this-image> \
|
||||
/model --port 8000 --served-model-name llm \
|
||||
--max-model-len 2048 --gpu-memory-utilization 0.9 \
|
||||
--enforce-eager --trust-remote-code -tp 1
|
||||
```
|
||||
|
||||
## 环境变量
|
||||
|
||||
| 变量 | 默认值 | 说明 |
|
||||
|---|---|---|
|
||||
| `AUTO_FIX_TOKENIZER` | `auto` | `auto`:自动检测;`1`/`true`:强制修复;其他值:跳过修复 |
|
||||
| `MODEL_DIR` | `/model` | 模型路径(通常通过命令行第一个参数传入) |
|
||||
| `FIX_TOKENIZER_DIR` | `/tmp/fixed_tokenizer` | 修复后 tokenizer 文件的临时目录 |
|
||||
|
||||
## 构建
|
||||
|
||||
```bash
|
||||
docker build -t xc-llm-kunlun-fix-tokenizer:latest .
|
||||
```
|
||||
|
||||
CI 通过推送 `v*` tag 自动触发构建并推送镜像。
|
||||
25
detect_tokenizer.py
Normal file
25
detect_tokenizer.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import os
|
||||
import json
|
||||
|
||||
def detect(model_dir):
    """Classify the tokenizer files present in *model_dir*.

    Args:
        model_dir: Directory that holds the model's tokenizer files.

    Returns:
        A ``(kind, tokenizer_class)`` tuple. ``kind`` is ``"fast"`` when
        ``tokenizer.json`` exists, ``"sentencepiece"`` when ``tokenizer.model``
        exists, ``"bpe"`` when both ``vocab.json`` and ``merges.txt`` exist,
        and ``"unknown"`` otherwise (checked in that order).
        ``tokenizer_class`` is the string declared in
        ``tokenizer_config.json``, or ``""`` when the file is missing or the
        field is absent, ``null`` or not a string.

    Raises:
        OSError: If *model_dir* cannot be listed.
        json.JSONDecodeError: If ``tokenizer_config.json`` is not valid JSON.
    """
    cfg_path = os.path.join(model_dir, "tokenizer_config.json")

    cls = ""
    if os.path.exists(cfg_path):
        # Read as UTF-8 explicitly so the result does not depend on the
        # container's locale.
        with open(cfg_path, encoding="utf-8") as f:
            cfg = json.load(f)
        declared = cfg.get("tokenizer_class") if isinstance(cfg, dict) else None
        # A JSON null (or any non-string value) is reported as "no class".
        if isinstance(declared, str):
            cls = declared

    files = set(os.listdir(model_dir))

    if "tokenizer.json" in files:
        return "fast", cls

    if "tokenizer.model" in files:
        return "sentencepiece", cls

    if {"vocab.json", "merges.txt"} <= files:
        return "bpe", cls

    return "unknown", cls
|
||||
39
entrypoint.sh
Normal file
39
entrypoint.sh
Normal file
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# Container entrypoint: optionally repairs the model's tokenizer config into a
# scratch directory, then replaces this shell with `vllm serve`.
#
# Usage: entrypoint.sh [MODEL_DIR] [extra vllm serve arguments...]
# Environment:
#   AUTO_FIX_TOKENIZER  auto (default) = detect; 1/true = always fix;
#                       anything else = never fix
#   MODEL_DIR           used when no positional model path is given
#   FIX_TOKENIZER_DIR   where the repaired tokenizer files are written
set -e

# The first positional argument wins, then the MODEL_DIR variable, then /model.
MODEL_DIR=${1:-${MODEL_DIR:-/model}}
# shift fails when there are no arguments; that must not trip `set -e`.
shift || true

FIX_TOKENIZER_DIR=${FIX_TOKENIZER_DIR:-/tmp/fixed_tokenizer}
AUTO_FIX=${AUTO_FIX_TOKENIZER:-auto}

# fix_tokenizer.py reads both paths from the environment, so they must be
# exported or it would fall back to its own defaults.
export MODEL_DIR FIX_TOKENIZER_DIR

echo "[entrypoint] model dir: $MODEL_DIR"

NEED_FIX=0

if [ "$AUTO_FIX" = "1" ] || [ "$AUTO_FIX" = "true" ]; then
    NEED_FIX=1
elif [ "$AUTO_FIX" = "auto" ]; then
    if [ -f "$MODEL_DIR/tokenizer_config.json" ]; then
        # Known non-standard tokenizer class names.
        if grep -q "TokenizersBackend\|TiktokenTokenizer" "$MODEL_DIR/tokenizer_config.json"; then
            NEED_FIX=1
        fi
        # extra_special_tokens stored as a list instead of a dict.
        if grep -q '"extra_special_tokens":\s*\[' "$MODEL_DIR/tokenizer_config.json"; then
            NEED_FIX=1
        fi
    fi
fi

# An array keeps the directory as a single argument even if it has spaces.
TOKENIZER_ARGS=()

if [ "$NEED_FIX" -eq 1 ]; then
    echo "[entrypoint] fixing tokenizer..."
    python3 /opt/fix_tokenizer.py
    TOKENIZER_ARGS=(--tokenizer "$FIX_TOKENIZER_DIR")
else
    echo "[entrypoint] tokenizer OK, skip fix"
fi

echo "[entrypoint] starting vllm..."

exec vllm serve "$MODEL_DIR" "${TOKENIZER_ARGS[@]}" "$@"
|
||||
69
fix_tokenizer.py
Normal file
69
fix_tokenizer.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import os
|
||||
import shutil
|
||||
import json
|
||||
from detect_tokenizer import detect
|
||||
|
||||
# Script body: copies the model's tokenizer files into OUT_DIR and rewrites the
# copied tokenizer_config.json. Files under MODEL_DIR are only read.
MODEL_DIR = os.environ.get("MODEL_DIR", "/model")
OUT_DIR = os.environ.get("FIX_TOKENIZER_DIR", "/tmp/fixed_tokenizer")

os.makedirs(OUT_DIR, exist_ok=True)


def copy_if_exists(name):
    """Copy ``MODEL_DIR/name`` into ``OUT_DIR``; do nothing if it is absent."""
    src = os.path.join(MODEL_DIR, name)
    if os.path.exists(src):
        shutil.copy(src, OUT_DIR)


# Copy every file the tokenizer may be loaded from. added_tokens.json and
# chat_template.jinja are included so that added tokens and a chat template
# stored outside tokenizer_config.json are still present in OUT_DIR.
for f in [
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "added_tokens.json",
    "vocab.json",
    "merges.txt",
    "tokenizer.model",
    "chat_template.jinja",
]:
    copy_if_exists(f)

typ, orig_cls = detect(MODEL_DIR)

cfg_path = os.path.join(OUT_DIR, "tokenizer_config.json")

if os.path.exists(cfg_path):
    # Explicit UTF-8 keeps the read independent of the container's locale.
    with open(cfg_path, encoding="utf-8") as f:
        cfg = json.load(f)
else:
    cfg = {}

# ===== Automatic repair strategy =====
# The class is chosen from the files detected on disk; anything that is not
# recognised falls back to PreTrainedTokenizerFast.
cfg["tokenizer_class"] = {
    "fast": "PreTrainedTokenizerFast",
    "sentencepiece": "LlamaTokenizer",
    "bpe": "GPT2TokenizerFast",
}.get(typ, "PreTrainedTokenizerFast")

# Special case: report when a known-bad class name was replaced.
bad_classes = [
    "TokenizersBackend",
    "TiktokenTokenizer",
]

if orig_cls in bad_classes:
    print(f"[fix] override bad tokenizer_class: {orig_cls} → {cfg['tokenizer_class']}")

# Repair extra_special_tokens: list -> dict, each token mapped to itself.
if "extra_special_tokens" in cfg and isinstance(cfg["extra_special_tokens"], list):
    orig_list = cfg["extra_special_tokens"]
    cfg["extra_special_tokens"] = {token: token for token in orig_list}
    print(f"[fix] converted extra_special_tokens from list ({len(orig_list)} items) to dict format")

# Write the repaired config back. json.dump keeps its default ASCII escaping,
# so the output is valid regardless of which characters the tokens contain.
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(cfg, f)

print(f"[fix_tokenizer] done → {OUT_DIR}")
|
||||
Reference in New Issue
Block a user