fix tokenizer

This commit is contained in:
4paradigm
2026-06-29 17:23:40 +08:00
parent ef6173824e
commit 42420f61ea
4 changed files with 57 additions and 106 deletions

View File

@@ -4,36 +4,11 @@ set -e
# Resolve the model directory from the first positional argument
# (default: /model). Any remaining arguments stay in "$@".
MODEL_DIR=${1:-/model}
# Only shift when there is something to shift: in some shells (e.g. dash) a
# failing `shift` is a special-builtin error that terminates the script, so
# `shift || true` is not a reliable guard.
if [ "$#" -gt 0 ]; then
  shift
fi
FIX_TOKENIZER_DIR=/tmp/fixed_tokenizer
# AUTO_FIX_TOKENIZER: "1"/"true" force the fix, "auto" (default) inspects
# tokenizer_config.json, any other value disables the fix.
AUTO_FIX=${AUTO_FIX_TOKENIZER:-auto}
echo "[entrypoint] model dir: $MODEL_DIR"

# tokenizer_needs_fix MODE CONFIG_FILE
#
# Decide whether the tokenizer fix should run.
#   MODE        - "1" or "true": always fix; "auto": inspect CONFIG_FILE;
#                 anything else: never fix.
#   CONFIG_FILE - path to the model's tokenizer_config.json.
# Returns 0 when a fix is needed, non-zero otherwise. In "auto" mode a fix is
# needed when CONFIG_FILE exists and either mentions TokenizersBackend /
# TiktokenTokenizer, or has "extra_special_tokens" written as a list.
tokenizer_needs_fix() {
  case "$1" in
    1 | true)
      return 0
      ;;
    auto)
      # One ERE with POSIX character classes instead of the GNU-only
      # `\|` / `\s` basic-regex extensions.
      [ -f "$2" ] &&
        grep -Eq 'TokenizersBackend|TiktokenTokenizer|"extra_special_tokens":[[:space:]]*\[' -- "$2"
      ;;
    *)
      return 1
      ;;
  esac
}

if tokenizer_needs_fix "$AUTO_FIX" "$MODEL_DIR/tokenizer_config.json"; then
  NEED_FIX=1
else
  NEED_FIX=0
fi
# Launch section: optionally run the tokenizer fixer, then replace this shell
# with the vLLM server.
#
# The fixer is called once, with the model directory, and its stdout is used
# as the value of `--tokenizer`. Empty output means no fixed tokenizer is
# passed to vLLM.
# NOTE(review): assumes /opt/fix_tokenizer.py prints only the fixed tokenizer
# path on stdout (diagnostics on stderr) — confirm against the script.
FIXED_DIR=""
if [ "${NEED_FIX:-0}" -eq 1 ]; then
  echo "[entrypoint] fixing tokenizer..."
  # Fail loudly, keeping the fixer's exit status, instead of starting the
  # server with a half-fixed tokenizer.
  FIXED_DIR=$(python3 /opt/fix_tokenizer.py "$MODEL_DIR") || {
    status=$?
    echo "[entrypoint] fix_tokenizer.py failed (exit $status)" >&2
    exit "$status"
  }
fi

if [ -n "$FIXED_DIR" ]; then
  # Prepend the option to the positional parameters rather than building a
  # string, so a path containing whitespace is passed as a single argument.
  set -- --tokenizer "$FIXED_DIR" "$@"
else
  echo "[entrypoint] tokenizer OK, skip fix"
fi

echo "[entrypoint] starting vllm..."
# exec so vLLM takes over this process; extra caller arguments follow the
# optional --tokenizer override.
exec vllm serve "$MODEL_DIR" "$@"