更新 K100-vLLM-Patched-v2.0/fix_tokenizer.py

This commit is contained in:
2026-05-19 18:50:09 +08:00
parent bbb1cd4304
commit 761ff1866c

View File

@@ -2,6 +2,8 @@ import os
import shutil import shutil
import json import json
import sys import sys
import transformers
import inspect
sys.path.insert(0, '/opt') sys.path.insert(0, '/opt')
from detect_tokenizer import detect from detect_tokenizer import detect
@@ -35,22 +37,28 @@ if os.path.exists(cfg_path):
else: else:
cfg = {} cfg = {}
if typ == "fast": VALID_CLASSES = {
cfg["tokenizer_class"] = "PreTrainedTokenizerFast" name for name, obj in inspect.getmembers(transformers)
elif typ == "sentencepiece": if inspect.isclass(obj) and "Tokenizer" in name
cfg["tokenizer_class"] = "LlamaTokenizer" }
elif typ == "bpe":
cfg["tokenizer_class"] = "GPT2TokenizerFast" BAD_CLASSES = {"TokenizersBackend", "TiktokenTokenizer"}
FALLBACK = {
"fast": "PreTrainedTokenizerFast",
"sentencepiece": "LlamaTokenizer",
"bpe": "GPT2TokenizerFast",
}
if orig_cls and orig_cls in VALID_CLASSES and orig_cls not in BAD_CLASSES:
print(f"[fix_tokenizer] tokenizer_class '{orig_cls}' is valid, skip override")
else: else:
cfg["tokenizer_class"] = "PreTrainedTokenizerFast" fallback = FALLBACK.get(typ, "PreTrainedTokenizerFast")
if orig_cls:
bad_classes = [ print(f"[fix] override bad tokenizer_class: {orig_cls}{fallback}")
"TokenizersBackend", else:
"TiktokenTokenizer", print(f"[fix] tokenizer_class missing, set to: {fallback}")
] cfg["tokenizer_class"] = fallback
if orig_cls in bad_classes:
print(f"[fix] override bad tokenizer_class: {orig_cls}{cfg['tokenizer_class']}")
with open(cfg_path, "w") as f: with open(cfg_path, "w") as f:
json.dump(cfg, f) json.dump(cfg, f)