# File listing header: Python, 19 lines, 517 B
from transformers import AutoTokenizer

# Script purpose: load the tokenizer stored under
# /workspace/dolphin-2.6-mistral-7b-hf, swap "</s>" for "<|im_end|>" in its
# vocabulary mapping, make "<|im_end|>" the EOS and padding token, and save
# the result to /workspace/dolphin-new-tokenizer/.

# Load the original tokenizer so its vocabulary can be read.
tokenizer = AutoTokenizer.from_pretrained("/workspace/dolphin-2.6-mistral-7b-hf")

# 1. Remove the "</s>" token from the vocabulary
# `del` raises KeyError if "</s>" is not present in the vocabulary mapping.
vocab = tokenizer.get_vocab()
del vocab['</s>']

# 2. Map "<|im_end|>" to id 2 in the edited vocabulary
# NOTE(review): presumably 2 is the id "</s>" used to hold -- confirm against
# the original tokenizer before relying on it.
vocab['<|im_end|>'] = 2

# 3. Reload the tokenizer, passing the edited vocabulary
# NOTE(review): whether `from_pretrained` actually applies a `vocab=` keyword
# depends on the tokenizer class and transformers version -- verify after
# reloading that "<|im_end|>" really resolves to id 2 and "</s>" is gone.
tokenizer = AutoTokenizer.from_pretrained(
    "/workspace/dolphin-2.6-mistral-7b-hf",
    vocab=vocab
)

# 4. Use "<|im_end|>" as both the end-of-sequence and the padding token
tokenizer.eos_token = "<|im_end|>"
tokenizer.pad_token = "<|im_end|>"

# 5. Save the modified tokenizer
tokenizer.save_pretrained('/workspace/dolphin-new-tokenizer/')