llama : add support for SmolLM pre-tokenizer (#8609)

* Adding SmolLM Pre Tokenizer * Update convert_hf_to_gguf_update.py Co-authored-by: compilade <git@compilade.net> * Update src/llama.cpp Co-authored-by: compilade <git@compilade.net> * handle regex * removed .inp and .out ggufs --------- Co-authored-by: compilade <git@compilade.net>
2024-07-22 10:43:01 -04:00
parent 566daa5a5b
commit d94c6e0ccb
4 changed files with 10 additions and 0 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5521,6 +5521,10 @@ static void llm_load_vocab(
                vocab.tokenizer_clean_spaces = false;
                vocab.tokenizer_ignore_merges = true;
                vocab.tokenizer_add_bos = true;
+            } else if (
+                tokenizer_pre == "smollm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                vocab.tokenizer_clean_spaces = false;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
@@ -15543,6 +15547,7 @@ struct llm_tokenizer_bpe {
            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
            case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
                regex_exprs = {
                    "\\p{N}",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",