初始化项目,由ModelHub XC社区提供模型
Model: Finnish-NLP/Ahma-7B Source: Original Platform
This commit is contained in:
10
train_sentencepiece.py
Normal file
10
train_sentencepiece.py
Normal file
@@ -0,0 +1,10 @@
|
||||
import sentencepiece as spm

# Train a SentencePiece BPE tokenizer from a plain-text corpus.
#
# The call runs at module top level, so executing (or importing) this file
# starts training immediately. Output files are named after `model_prefix`
# (per the SentencePiece docs: `<prefix>.model` and `<prefix>.vocab`) and are
# written relative to the current working directory.
spm.SentencePieceTrainer.train(
    # Training corpus; NOTE(review): absolute, machine-specific path.
    input="/researchdisk/training_dataset_sentences/train.txt",
    model_prefix="tokenizer",
    model_type="bpe",
    # Split every digit into its own piece.
    split_digits=True,
    vocab_size=64256,
    # Fall back to UTF-8 byte pieces for characters missing from the vocab.
    byte_fallback=True,
    normalization_rule_name="nfkc",
    # Reserved as single, never-split pieces. NOTE(review): these look like
    # Llama-2 style chat-template markers; confirm against the model's
    # prompt format.
    user_defined_symbols=["[INST]", "[/INST]", "<<SYS>>", "<</SYS>>"],
    # Always keep the basic Latin alphabet plus å/ä/ö (both cases) in the
    # character set, regardless of how often they occur in the corpus.
    required_chars="abcdefghijklmnopqrstuvwxyzåäöABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖ",
    train_extremely_large_corpus=True,
    # Upper bound on the number of sentences loaded for training; combined
    # with shuffling, the trainer samples from the corpus when it is larger.
    input_sentence_size=500000000,
    shuffle_input_sentence=True,
    # NOTE(review): tuned for the original training host; adjust to the
    # number of cores actually available.
    num_threads=96,
)
|
||||
Reference in New Issue
Block a user