19 lines
517 B
Python
19 lines
517 B
Python
|
|
from transformers import AutoTokenizer
|
||
|
|
|
||
|
|
# Source checkpoint, destination directory for the edited tokenizer, and the
# ChatML end-of-turn marker that replaces "</s>" below.
MODEL_DIR = "/workspace/dolphin-2.6-mistral-7b-hf"
OUTPUT_DIR = "/workspace/dolphin-new-tokenizer/"
IM_END = "<|im_end|>"

# Step 1: load the stock tokenizer to obtain its token -> id mapping.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
vocab = tokenizer.get_vocab()

# Step 2: drop the "</s>" entry and register "<|im_end|>" under id 2.
# pop() without a default raises KeyError when "</s>" is missing.
vocab.pop("</s>")
vocab.update({IM_END: 2})

# Step 3: load the tokenizer again, handing it the edited mapping.
# NOTE(review): the `vocab` keyword is forwarded to the tokenizer constructor;
# whether it actually replaces the vocabulary depends on the tokenizer class
# and transformers version -- confirm by checking the id of "<|im_end|>" on
# the reloaded tokenizer before relying on the saved files.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, vocab=vocab)

# Step 4: use the ChatML marker as both the end-of-sequence and padding token
# (eos_token is assigned first, then pad_token).
tokenizer.eos_token = tokenizer.pad_token = IM_END

# Step 5: write the modified tokenizer to its own directory.
tokenizer.save_pretrained(OUTPUT_DIR)
|