From 3725866cc559fcfc5664aeb1576328476d09eef5 Mon Sep 17 00:00:00 2001 From: Sun Ruoxi Date: Thu, 28 May 2026 17:56:05 +0800 Subject: [PATCH] =?UTF-8?q?add=20fix=20extra=5Fspecial=5Ftokens:=20list=20?= =?UTF-8?q?=E2=86=92=20dict?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sun Ruoxi --- README.md | 23 +++++++++++++++++++++++ entrypoint.sh | 4 ++++ fix_tokenizer.py | 6 ++++++ 3 files changed, 33 insertions(+) diff --git a/README.md b/README.md index c349adb..f70e799 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,24 @@ vllm serve --tokenizer /tmp/fixed_tokenizer | 缺失 tokenizer_config | 自动生成 | | SentencePiece | LlamaTokenizer | +### 修复 extra_special_tokens 格式 + +当 `extra_special_tokens` 为 list 格式时,自动转换为 dict 格式: + +```json +// 修复前 +"extra_special_tokens": ["<|im_start|>", "<|im_end|>", "<|box_start|>", "<|box_end|>", ...] + +// 修复后 +"extra_special_tokens": { + "<|im_start|>": "<|im_start|>", + "<|im_end|>": "<|im_end|>", + "<|box_start|>": "<|box_start|>", + "<|box_end|>": "<|box_end|>", + ... +} +``` + --- ## 5. 生成的 tokenizer 目录 @@ -88,8 +106,13 @@ vllm serve --tokenizer /tmp/fixed_tokenizer ``` [entrypoint] fixing tokenizer... [fix] override bad tokenizer_class: TokenizersBackend → PreTrainedTokenizerFast +[fix] converted extra_special_tokens from list (13 items) to dict format ``` +触发条件(AUTO_FIX=auto 时): +- tokenizer_config.json 包含 `TokenizersBackend` 或 `TiktokenTokenizer` +- tokenizer_config.json 中 `extra_special_tokens` 为 list 格式(`"extra_special_tokens": [`) + --- ## 7. 验证方法 diff --git a/entrypoint.sh b/entrypoint.sh index 0f8f92d..07308d3 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -18,6 +18,10 @@ elif [ "$AUTO_FIX" = "auto" ]; then if grep -q "TokenizersBackend\|TiktokenTokenizer" "$MODEL_DIR/tokenizer_config.json"; then NEED_FIX=1 fi + # 检测 extra_special_tokens 是否为 list 格式 + if grep -q '"extra_special_tokens":\s*\[' "$MODEL_DIR/tokenizer_config.json"; then + NEED_FIX=1 + fi fi fi diff --git a/fix_tokenizer.py b/fix_tokenizer.py index 8635486..9556ea6 100644 --- a/fix_tokenizer.py +++ b/fix_tokenizer.py @@ -56,6 +56,12 @@ bad_classes = [ if orig_cls in bad_classes: print(f"[fix] override bad tokenizer_class: {orig_cls} → {cfg['tokenizer_class']}") +# 修复 extra_special_tokens: list → dict 格式 +if "extra_special_tokens" in cfg and isinstance(cfg["extra_special_tokens"], list): + orig_list = cfg["extra_special_tokens"] + cfg["extra_special_tokens"] = {token: token for token in orig_list} + print(f"[fix] converted extra_special_tokens from list ({len(orig_list)} items) to dict format") + # 写回 with open(cfg_path, "w") as f: json.dump(cfg, f)