diff --git a/README.md b/README.md index c349adb..f70e799 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,24 @@ vllm serve --tokenizer /tmp/fixed_tokenizer | 缺失 tokenizer_config | 自动生成 | | SentencePiece | LlamaTokenizer | +### 修复 extra_special_tokens 格式 + +当 `extra_special_tokens` 为 list 格式时,自动转换为 dict 格式: + +```json +// 修复前 +"extra_special_tokens": ["<|im_start|>", "<|im_end|>", "<|box_start|>", "<|box_end|>", ...] + +// 修复后 +"extra_special_tokens": { + "<|im_start|>": "<|im_start|>", + "<|im_end|>": "<|im_end|>", + "<|box_start|>": "<|box_start|>", + "<|box_end|>": "<|box_end|>", + ... +} +``` + --- ## 5. 生成的 tokenizer 目录 @@ -88,8 +106,13 @@ vllm serve --tokenizer /tmp/fixed_tokenizer ``` [entrypoint] fixing tokenizer... [fix] override bad tokenizer_class: TokenizersBackend → PreTrainedTokenizerFast +[fix] converted extra_special_tokens from list (13 items) to dict format ``` +触发条件(AUTO_FIX=auto 时): +- tokenizer_config.json 包含 `TokenizersBackend` 或 `TiktokenTokenizer` +- tokenizer_config.json 中 `extra_special_tokens` 为 list 格式(`"extra_special_tokens": [`) + --- ## 7. 验证方法 diff --git a/entrypoint.sh b/entrypoint.sh index 0f8f92d..07308d3 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -18,6 +18,10 @@ elif [ "$AUTO_FIX" = "auto" ]; then if grep -q "TokenizersBackend\|TiktokenTokenizer" "$MODEL_DIR/tokenizer_config.json"; then NEED_FIX=1 fi + # 检测 extra_special_tokens 是否为 list 格式 + if grep -q '"extra_special_tokens":\s*\[' "$MODEL_DIR/tokenizer_config.json"; then + NEED_FIX=1 + fi fi fi diff --git a/fix_tokenizer.py b/fix_tokenizer.py index 8635486..9556ea6 100644 --- a/fix_tokenizer.py +++ b/fix_tokenizer.py @@ -56,6 +56,12 @@ bad_classes = [ if orig_cls in bad_classes: print(f"[fix] override bad tokenizer_class: {orig_cls} → {cfg['tokenizer_class']}") +# 修复 extra_special_tokens: list → dict 格式 +if "extra_special_tokens" in cfg and isinstance(cfg["extra_special_tokens"], list): + orig_list = cfg["extra_special_tokens"] + cfg["extra_special_tokens"] = {token: token for token in orig_list} + print(f"[fix] converted extra_special_tokens from list ({len(orig_list)} items) to dict format") + # 写回 with open(cfg_path, "w") as f: json.dump(cfg, f)