From 6190a6c9bea27f0fb8ee8e86a8ad479fd1382643 Mon Sep 17 00:00:00 2001
From: ModelHub XC
Date: Wed, 29 Apr 2026 21:05:16 +0800
Subject: [PATCH] Initialize project; model provided by the ModelHub XC
 community
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Model: Madras1/Jade-20B
Source: Original Platform
---
 .gitattributes                   |  36 +++
 README.md                        | 156 ++++++++++++
 chat_template.jinja              |   8 +
 config.json                      |  69 +++++
 generation_config.json           |  10 +
 model-00001-of-00013.safetensors |   3 +
 model-00002-of-00013.safetensors |   3 +
 model-00003-of-00013.safetensors |   3 +
 model-00004-of-00013.safetensors |   3 +
 model-00005-of-00013.safetensors |   3 +
 model-00006-of-00013.safetensors |   3 +
 model-00007-of-00013.safetensors |   3 +
 model-00008-of-00013.safetensors |   3 +
 model-00009-of-00013.safetensors |   3 +
 model-00010-of-00013.safetensors |   3 +
 model-00011-of-00013.safetensors |   3 +
 model-00012-of-00013.safetensors |   3 +
 model-00013-of-00013.safetensors |   3 +
 model.safetensors.index.json     | 419 +++++++++++++++++++++++++++++++
 special_tokens_map.json          |  23 ++
 tokenizer.json                   |   3 +
 tokenizer_config.json            | 180 +++++++++++++
 22 files changed, 943 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 README.md
 create mode 100644 chat_template.jinja
 create mode 100644 config.json
 create mode 100644 generation_config.json
 create mode 100644 model-00001-of-00013.safetensors
 create mode 100644 model-00002-of-00013.safetensors
 create mode 100644 model-00003-of-00013.safetensors
 create mode 100644 model-00004-of-00013.safetensors
 create mode 100644 model-00005-of-00013.safetensors
 create mode 100644 model-00006-of-00013.safetensors
 create mode 100644 model-00007-of-00013.safetensors
 create mode 100644 model-00008-of-00013.safetensors
 create mode 100644 model-00009-of-00013.safetensors
 create mode 100644 model-00010-of-00013.safetensors
 create mode 100644 model-00011-of-00013.safetensors
 create mode 100644 model-00012-of-00013.safetensors
 create mode 100644 model-00013-of-00013.safetensors
 create mode 100644 model.safetensors.index.json
 create mode 100644 special_tokens_map.json
 create mode 100644 tokenizer.json
 create mode 100644 tokenizer_config.json

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..52373fe
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..504785c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,156 @@
+---
+language:
+- pt
+- en
+license: apache-2.0
+base_model:
+- unsloth/gpt-oss-20b
+- openai/gpt-oss-20b
+base_model_relation: finetune
+library_name: transformers
+pipeline_tag: text-generation
+tags:
+- pt-br
+- portuguese
+- brazilian-portuguese
+- conversational
+- chatbot
+- persona
+- unsloth
+- 4-bit
+- bitsandbytes
+- gpt-oss
+---
+
+# Jade-20b
+
+Jade-20b is a Brazilian Portuguese conversational finetune of gpt-oss-20b built to express a strong, persistent persona. This model is designed for PT-BR chat, chatbot use cases, and character-style interaction, with colloquial language, abbreviations, slang, and a WhatsApp-like tone.
+
+## Model Summary
+
+Jade-20b is a persona-first model. It was intentionally finetuned so that it speaks like **Jade** even without a strong `system prompt`. Because of that, the model often answers in PT-BR with informal phrasing such as `vc`, light slang, and a friendly conversational tone from the very first turn.
+
+## Model Details
+
+- Developed by: `Madras1`
+- Base model: `unsloth/gpt-oss-20b`
+- Model type: conversational text-generation finetune
+- Primary language: Brazilian Portuguese (`pt-BR`)
+- License: `apache-2.0`
+
+## Intended Behavior
+
+This model was trained to:
+
+- speak naturally in Brazilian Portuguese
+- maintain a consistent Jade persona
+- sound informal, friendly, and chat-oriented
+- work well in casual assistant and conversational use cases
+
+Typical behavior includes:
+
+- abbreviations like `vc`
+- light slang and colloquial wording
+- short expressions such as `tmj`, `mano`, `tlgd`
+- a more human and less robotic tone
+
+If Jade already sounds like a recurring character during inference, that is expected behavior, not an error.
+
+## Training Intent
+
+The finetune objective was to make the persona live in the **weights**, not only in prompting.
+
+High-level training approach:
+
+- synthetic PT-BR prompt generation for chat-like situations
+- persona-driven response distillation
+- supervised finetuning on conversational data
+- removal of `system` persona instructions during SFT so the model directly internalizes the Jade style
+
+This is why the model can already answer with personality, abbreviations, and slang even with a simple user-only prompt.
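+
+As a rough illustration of the record format this implies, here is a hypothetical, system-free SFT sample (the dialogue is invented, not taken from the real training data) together with how the repository's chat template renders it:
+
+```python
+from transformers import AutoTokenizer
+
+# Hypothetical SFT record: user and assistant turns only, with no
+# "system" persona instruction, as described above.
+sft_example = {
+    "messages": [
+        {"role": "user", "content": "oi jade, vc curte programar?"},
+        {"role": "assistant", "content": "curto dms! python eh minha praia, tlgd? tmj"},
+    ]
+}
+
+# With no system turn, the chat template renders only <|im_start|>user /
+# <|im_start|>assistant blocks, so the persona has to come from the
+# weights rather than from a prompt.
+tokenizer = AutoTokenizer.from_pretrained("Madras1/Jade-20b")
+print(tokenizer.apply_chat_template(sft_example["messages"], tokenize=False))
+```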
+
+## Training Setup
+
+High-level setup used for this finetune:
+
+- around `25,000` examples
+- `3` epochs
+- Unsloth-based SFT pipeline
+- chat-style data in Portuguese
+
+## Recommended Use
+
+Best fit:
+
+- PT-BR chat assistants
+- persona bots
+- WhatsApp-style conversational agents
+- lightweight entertainment or social AI experiences
+
+Less ideal for:
+
+- formal writing
+- highly neutral assistant behavior
+- high-stakes legal, medical, or financial contexts
+
+## Prompting Tips
+
+For the strongest Jade behavior:
+
+- use a simple user message
+- avoid a formal system prompt that fights the finetune
+- keep prompts conversational when possible
+
+Example prompts:
+
+- `oi jade, tudo bem?`
+- `jade, me explica isso de um jeito simples`
+- `vc acha que vale a pena estudar python hoje?`
+
+## Example Inference
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+model_id = "Madras1/Jade-20b"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+
+messages = [
+    {"role": "user", "content": "oi jade, tudo bem?"}
+]
+
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+)
+
+inputs = tokenizer(text, return_tensors="pt").to(model.device)
+outputs = model.generate(
+    **inputs,
+    max_new_tokens=256,
+    temperature=0.7,
+    top_p=0.9,
+)
+
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+## Limitations
+
+Because this is a persona-oriented finetune:
+
+- it may sound informal in contexts where a neutral tone would be better
+- it may over-index on chat style depending on the prompt
+- it is optimized more for persona consistency than strict formality
+
+## Links
+
+https://github.com/MadrasLe/JadeLLMV-1
\ No newline at end of file
diff --git a/chat_template.jinja b/chat_template.jinja
new file mode 100644
index 0000000..ecb8be2
--- /dev/null
+++ b/chat_template.jinja
@@ -0,0 +1,8 @@
+{% for message in messages %}{% if message['role'] == 'user' %}{{'<|im_start|>user
+' + message['content'] + '<|im_end|>
+'}}{% elif message['role'] == 'assistant' %}{{'<|im_start|>assistant
+' + message['content'] + '<|im_end|>
+' }}{% else %}{{ '<|im_start|>system
+' + message['content'] + '<|im_end|>
+' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..12eba1e
--- /dev/null
+++ b/config.json
@@ -0,0 +1,69 @@
+{
+  "architectures": [
+    "GptOssForCausalLM"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 200002,
+  "experts_per_token": 4,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2880,
+  "initial_context_length": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 2880,
+  "layer_types": [
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gpt_oss",
"num_attention_heads": 64, + "num_experts_per_tok": 4, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_local_experts": 32, + "output_router_logits": false, + "pad_token_id": 200017, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "beta_fast": 32.0, + "beta_slow": 1.0, + "factor": 32.0, + "original_max_position_embeddings": 4096, + "rope_type": "yarn", + "truncate": false + }, + "rope_theta": 150000, + "router_aux_loss_coef": 0.9, + "sliding_window": 128, + "swiglu_limit": 7.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.3", + "unsloth_fixed": true, + "use_cache": true, + "vocab_size": 201088 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..ed7982f --- /dev/null +++ b/generation_config.json @@ -0,0 +1,10 @@ +{ + "bos_token_id": 199998, + "do_sample": true, + "eos_token_id": [ + 200002, + 199999 + ], + "pad_token_id": 199999, + "transformers_version": "4.57.3" +} diff --git a/model-00001-of-00013.safetensors b/model-00001-of-00013.safetensors new file mode 100644 index 0000000..f2e8bb3 --- /dev/null +++ b/model-00001-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c7cada63572fc060f391176054b0dc766a04ebb3f57ab98e5396a6225d5451 +size 3919980872 diff --git a/model-00002-of-00013.safetensors b/model-00002-of-00013.safetensors new file mode 100644 index 0000000..1b0bac5 --- /dev/null +++ b/model-00002-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb36cfbc872e192ee6bda4063d221e5fc830666a5936da8beb435427f6feac78 +size 3877075568 diff --git a/model-00003-of-00013.safetensors b/model-00003-of-00013.safetensors new file mode 100644 index 0000000..c2174e7 --- /dev/null +++ b/model-00003-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f26c41a57aaf4dc94ba452d80429cc596517987692b5a2ba12d5f35f81ebfe6 +size 3292751776 diff --git a/model-00004-of-00013.safetensors b/model-00004-of-00013.safetensors new file mode 100644 index 0000000..217caa3 --- /dev/null +++ b/model-00004-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9bcd6eec164a34454f8f0678363deca815e5ff87646f0b53e308a1bc13cdc88 +size 3292751776 diff --git a/model-00005-of-00013.safetensors b/model-00005-of-00013.safetensors new file mode 100644 index 0000000..46d3fea --- /dev/null +++ b/model-00005-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09919c91a66562d1d58f4a1bc9d0b417833cf0f91cb42de9bca49938ca8e6120 +size 3292751728 diff --git a/model-00006-of-00013.safetensors b/model-00006-of-00013.safetensors new file mode 100644 index 0000000..7cdf5f8 --- /dev/null +++ b/model-00006-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d5c0ecfbe26a5fd5c8bd41c23f053da59de649567b62efaf6b24a911ba065a +size 3292751808 diff --git a/model-00007-of-00013.safetensors b/model-00007-of-00013.safetensors new file mode 100644 index 0000000..76a4465 --- /dev/null +++ b/model-00007-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f3edcb91d6e102d8514d409567bb9e3aedced13ebe6ed12b163c56f465a1bab +size 3292751808 diff --git a/model-00008-of-00013.safetensors b/model-00008-of-00013.safetensors new file mode 100644 index 0000000..3806792 --- /dev/null +++ b/model-00008-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0e2eacf6c4d47af03b4c93457be6eea5db2da4c14296476b32842ca605e26212 +size 3292751808 diff --git a/model-00009-of-00013.safetensors b/model-00009-of-00013.safetensors new file mode 100644 index 0000000..3bc7261 --- /dev/null +++ b/model-00009-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6b28df6acf038a173b1e6e0ae1d8d4148b199b5a466504975c24136ea49a79a +size 3292751808 diff --git a/model-00010-of-00013.safetensors b/model-00010-of-00013.safetensors new file mode 100644 index 0000000..18a756b --- /dev/null +++ b/model-00010-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61d8f76d2ae5aeb2663d7e12a840236da72add97466ff406e817789b8d6143b5 +size 3292751808 diff --git a/model-00011-of-00013.safetensors b/model-00011-of-00013.safetensors new file mode 100644 index 0000000..a5276be --- /dev/null +++ b/model-00011-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2be770913250fadd737c1302ee5662021d9f2b795ff1069926dfb420978ca23 +size 3292751808 diff --git a/model-00012-of-00013.safetensors b/model-00012-of-00013.safetensors new file mode 100644 index 0000000..448fe02 --- /dev/null +++ b/model-00012-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cebe3f985abe90e592af36fdff5c7ecf3b62c2203ce6a2b112ec8fe74190573 +size 3239471760 diff --git a/model-00013-of-00013.safetensors b/model-00013-of-00013.safetensors new file mode 100644 index 0000000..2e26950 --- /dev/null +++ b/model-00013-of-00013.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08e257de1fa16cf80c8a36ca29c5974fc999f0dc7ada1468bfd0be18151af482 +size 1158267008 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..401e361 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,419 @@ +{ + "metadata": { + "total_parameters": 20914757184, + "total_size": 41829514368 + }, + "weight_map": { + "lm_head.weight": "model-00013-of-00013.safetensors", + "model.embed_tokens.weight": "model-00001-of-00013.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00013.safetensors", + "model.layers.0.mlp.experts.down_proj": "model-00001-of-00013.safetensors", + "model.layers.0.mlp.experts.down_proj_bias": "model-00001-of-00013.safetensors", + "model.layers.0.mlp.experts.gate_up_proj": "model-00001-of-00013.safetensors", + "model.layers.0.mlp.experts.gate_up_proj_bias": "model-00001-of-00013.safetensors", + "model.layers.0.mlp.router.bias": "model-00001-of-00013.safetensors", + "model.layers.0.mlp.router.weight": "model-00001-of-00013.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00013.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00013.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00013.safetensors", + "model.layers.0.self_attn.o_proj.bias": "model-00001-of-00013.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00013.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00013.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00013.safetensors", + "model.layers.0.self_attn.sinks": "model-00001-of-00013.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00013.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00013.safetensors", + "model.layers.1.input_layernorm.weight": 
"model-00002-of-00013.safetensors", + "model.layers.1.mlp.experts.down_proj": "model-00002-of-00013.safetensors", + "model.layers.1.mlp.experts.down_proj_bias": "model-00002-of-00013.safetensors", + "model.layers.1.mlp.experts.gate_up_proj": "model-00001-of-00013.safetensors", + "model.layers.1.mlp.experts.gate_up_proj_bias": "model-00001-of-00013.safetensors", + "model.layers.1.mlp.router.bias": "model-00001-of-00013.safetensors", + "model.layers.1.mlp.router.weight": "model-00001-of-00013.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00013.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00013.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00013.safetensors", + "model.layers.1.self_attn.o_proj.bias": "model-00001-of-00013.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00013.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00013.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00013.safetensors", + "model.layers.1.self_attn.sinks": "model-00001-of-00013.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00013.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00013.safetensors", + "model.layers.10.input_layernorm.weight": "model-00006-of-00013.safetensors", + "model.layers.10.mlp.experts.down_proj": "model-00006-of-00013.safetensors", + "model.layers.10.mlp.experts.down_proj_bias": "model-00006-of-00013.safetensors", + "model.layers.10.mlp.experts.gate_up_proj": "model-00006-of-00013.safetensors", + "model.layers.10.mlp.experts.gate_up_proj_bias": "model-00006-of-00013.safetensors", + "model.layers.10.mlp.router.bias": "model-00005-of-00013.safetensors", + "model.layers.10.mlp.router.weight": "model-00005-of-00013.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00006-of-00013.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00005-of-00013.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00005-of-00013.safetensors", + "model.layers.10.self_attn.o_proj.bias": "model-00005-of-00013.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00005-of-00013.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00005-of-00013.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00005-of-00013.safetensors", + "model.layers.10.self_attn.sinks": "model-00005-of-00013.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00005-of-00013.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00005-of-00013.safetensors", + "model.layers.11.input_layernorm.weight": "model-00006-of-00013.safetensors", + "model.layers.11.mlp.experts.down_proj": "model-00006-of-00013.safetensors", + "model.layers.11.mlp.experts.down_proj_bias": "model-00006-of-00013.safetensors", + "model.layers.11.mlp.experts.gate_up_proj": "model-00006-of-00013.safetensors", + "model.layers.11.mlp.experts.gate_up_proj_bias": "model-00006-of-00013.safetensors", + "model.layers.11.mlp.router.bias": "model-00006-of-00013.safetensors", + "model.layers.11.mlp.router.weight": "model-00006-of-00013.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00006-of-00013.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00006-of-00013.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00006-of-00013.safetensors", + "model.layers.11.self_attn.o_proj.bias": "model-00006-of-00013.safetensors", + 
"model.layers.11.self_attn.o_proj.weight": "model-00006-of-00013.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00006-of-00013.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00006-of-00013.safetensors", + "model.layers.11.self_attn.sinks": "model-00006-of-00013.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00006-of-00013.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00006-of-00013.safetensors", + "model.layers.12.input_layernorm.weight": "model-00007-of-00013.safetensors", + "model.layers.12.mlp.experts.down_proj": "model-00007-of-00013.safetensors", + "model.layers.12.mlp.experts.down_proj_bias": "model-00007-of-00013.safetensors", + "model.layers.12.mlp.experts.gate_up_proj": "model-00007-of-00013.safetensors", + "model.layers.12.mlp.experts.gate_up_proj_bias": "model-00007-of-00013.safetensors", + "model.layers.12.mlp.router.bias": "model-00006-of-00013.safetensors", + "model.layers.12.mlp.router.weight": "model-00006-of-00013.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00007-of-00013.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00006-of-00013.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00006-of-00013.safetensors", + "model.layers.12.self_attn.o_proj.bias": "model-00006-of-00013.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00006-of-00013.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00006-of-00013.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00006-of-00013.safetensors", + "model.layers.12.self_attn.sinks": "model-00006-of-00013.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00006-of-00013.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00006-of-00013.safetensors", + "model.layers.13.input_layernorm.weight": "model-00007-of-00013.safetensors", + "model.layers.13.mlp.experts.down_proj": "model-00007-of-00013.safetensors", + "model.layers.13.mlp.experts.down_proj_bias": "model-00007-of-00013.safetensors", + "model.layers.13.mlp.experts.gate_up_proj": "model-00007-of-00013.safetensors", + "model.layers.13.mlp.experts.gate_up_proj_bias": "model-00007-of-00013.safetensors", + "model.layers.13.mlp.router.bias": "model-00007-of-00013.safetensors", + "model.layers.13.mlp.router.weight": "model-00007-of-00013.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00007-of-00013.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00007-of-00013.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00007-of-00013.safetensors", + "model.layers.13.self_attn.o_proj.bias": "model-00007-of-00013.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00007-of-00013.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00007-of-00013.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00007-of-00013.safetensors", + "model.layers.13.self_attn.sinks": "model-00007-of-00013.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00007-of-00013.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00007-of-00013.safetensors", + "model.layers.14.input_layernorm.weight": "model-00008-of-00013.safetensors", + "model.layers.14.mlp.experts.down_proj": "model-00008-of-00013.safetensors", + "model.layers.14.mlp.experts.down_proj_bias": "model-00008-of-00013.safetensors", + "model.layers.14.mlp.experts.gate_up_proj": "model-00008-of-00013.safetensors", + 
"model.layers.14.mlp.experts.gate_up_proj_bias": "model-00008-of-00013.safetensors", + "model.layers.14.mlp.router.bias": "model-00007-of-00013.safetensors", + "model.layers.14.mlp.router.weight": "model-00007-of-00013.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00008-of-00013.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00007-of-00013.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00007-of-00013.safetensors", + "model.layers.14.self_attn.o_proj.bias": "model-00007-of-00013.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00007-of-00013.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00007-of-00013.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00007-of-00013.safetensors", + "model.layers.14.self_attn.sinks": "model-00007-of-00013.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00007-of-00013.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00007-of-00013.safetensors", + "model.layers.15.input_layernorm.weight": "model-00008-of-00013.safetensors", + "model.layers.15.mlp.experts.down_proj": "model-00008-of-00013.safetensors", + "model.layers.15.mlp.experts.down_proj_bias": "model-00008-of-00013.safetensors", + "model.layers.15.mlp.experts.gate_up_proj": "model-00008-of-00013.safetensors", + "model.layers.15.mlp.experts.gate_up_proj_bias": "model-00008-of-00013.safetensors", + "model.layers.15.mlp.router.bias": "model-00008-of-00013.safetensors", + "model.layers.15.mlp.router.weight": "model-00008-of-00013.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00008-of-00013.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00008-of-00013.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00008-of-00013.safetensors", + "model.layers.15.self_attn.o_proj.bias": "model-00008-of-00013.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00008-of-00013.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00008-of-00013.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00008-of-00013.safetensors", + "model.layers.15.self_attn.sinks": "model-00008-of-00013.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00008-of-00013.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00008-of-00013.safetensors", + "model.layers.16.input_layernorm.weight": "model-00009-of-00013.safetensors", + "model.layers.16.mlp.experts.down_proj": "model-00009-of-00013.safetensors", + "model.layers.16.mlp.experts.down_proj_bias": "model-00009-of-00013.safetensors", + "model.layers.16.mlp.experts.gate_up_proj": "model-00009-of-00013.safetensors", + "model.layers.16.mlp.experts.gate_up_proj_bias": "model-00009-of-00013.safetensors", + "model.layers.16.mlp.router.bias": "model-00008-of-00013.safetensors", + "model.layers.16.mlp.router.weight": "model-00008-of-00013.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00009-of-00013.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00008-of-00013.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00008-of-00013.safetensors", + "model.layers.16.self_attn.o_proj.bias": "model-00008-of-00013.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00008-of-00013.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00008-of-00013.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00008-of-00013.safetensors", + 
"model.layers.16.self_attn.sinks": "model-00008-of-00013.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00008-of-00013.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00008-of-00013.safetensors", + "model.layers.17.input_layernorm.weight": "model-00009-of-00013.safetensors", + "model.layers.17.mlp.experts.down_proj": "model-00009-of-00013.safetensors", + "model.layers.17.mlp.experts.down_proj_bias": "model-00009-of-00013.safetensors", + "model.layers.17.mlp.experts.gate_up_proj": "model-00009-of-00013.safetensors", + "model.layers.17.mlp.experts.gate_up_proj_bias": "model-00009-of-00013.safetensors", + "model.layers.17.mlp.router.bias": "model-00009-of-00013.safetensors", + "model.layers.17.mlp.router.weight": "model-00009-of-00013.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00009-of-00013.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00009-of-00013.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00009-of-00013.safetensors", + "model.layers.17.self_attn.o_proj.bias": "model-00009-of-00013.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00009-of-00013.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00009-of-00013.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00009-of-00013.safetensors", + "model.layers.17.self_attn.sinks": "model-00009-of-00013.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00009-of-00013.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00009-of-00013.safetensors", + "model.layers.18.input_layernorm.weight": "model-00010-of-00013.safetensors", + "model.layers.18.mlp.experts.down_proj": "model-00010-of-00013.safetensors", + "model.layers.18.mlp.experts.down_proj_bias": "model-00010-of-00013.safetensors", + "model.layers.18.mlp.experts.gate_up_proj": "model-00010-of-00013.safetensors", + "model.layers.18.mlp.experts.gate_up_proj_bias": "model-00010-of-00013.safetensors", + "model.layers.18.mlp.router.bias": "model-00009-of-00013.safetensors", + "model.layers.18.mlp.router.weight": "model-00009-of-00013.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00010-of-00013.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00009-of-00013.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00009-of-00013.safetensors", + "model.layers.18.self_attn.o_proj.bias": "model-00009-of-00013.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00009-of-00013.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00009-of-00013.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00009-of-00013.safetensors", + "model.layers.18.self_attn.sinks": "model-00009-of-00013.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00009-of-00013.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00009-of-00013.safetensors", + "model.layers.19.input_layernorm.weight": "model-00010-of-00013.safetensors", + "model.layers.19.mlp.experts.down_proj": "model-00010-of-00013.safetensors", + "model.layers.19.mlp.experts.down_proj_bias": "model-00010-of-00013.safetensors", + "model.layers.19.mlp.experts.gate_up_proj": "model-00010-of-00013.safetensors", + "model.layers.19.mlp.experts.gate_up_proj_bias": "model-00010-of-00013.safetensors", + "model.layers.19.mlp.router.bias": "model-00010-of-00013.safetensors", + "model.layers.19.mlp.router.weight": "model-00010-of-00013.safetensors", + 
"model.layers.19.post_attention_layernorm.weight": "model-00010-of-00013.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00010-of-00013.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00010-of-00013.safetensors", + "model.layers.19.self_attn.o_proj.bias": "model-00010-of-00013.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00010-of-00013.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00010-of-00013.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00010-of-00013.safetensors", + "model.layers.19.self_attn.sinks": "model-00010-of-00013.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00010-of-00013.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00010-of-00013.safetensors", + "model.layers.2.input_layernorm.weight": "model-00002-of-00013.safetensors", + "model.layers.2.mlp.experts.down_proj": "model-00002-of-00013.safetensors", + "model.layers.2.mlp.experts.down_proj_bias": "model-00002-of-00013.safetensors", + "model.layers.2.mlp.experts.gate_up_proj": "model-00002-of-00013.safetensors", + "model.layers.2.mlp.experts.gate_up_proj_bias": "model-00002-of-00013.safetensors", + "model.layers.2.mlp.router.bias": "model-00002-of-00013.safetensors", + "model.layers.2.mlp.router.weight": "model-00002-of-00013.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00013.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.2.self_attn.o_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.2.self_attn.sinks": "model-00002-of-00013.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.20.input_layernorm.weight": "model-00011-of-00013.safetensors", + "model.layers.20.mlp.experts.down_proj": "model-00011-of-00013.safetensors", + "model.layers.20.mlp.experts.down_proj_bias": "model-00011-of-00013.safetensors", + "model.layers.20.mlp.experts.gate_up_proj": "model-00011-of-00013.safetensors", + "model.layers.20.mlp.experts.gate_up_proj_bias": "model-00011-of-00013.safetensors", + "model.layers.20.mlp.router.bias": "model-00010-of-00013.safetensors", + "model.layers.20.mlp.router.weight": "model-00010-of-00013.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00011-of-00013.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00010-of-00013.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00010-of-00013.safetensors", + "model.layers.20.self_attn.o_proj.bias": "model-00010-of-00013.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00010-of-00013.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00010-of-00013.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00010-of-00013.safetensors", + "model.layers.20.self_attn.sinks": "model-00010-of-00013.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00010-of-00013.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00010-of-00013.safetensors", + "model.layers.21.input_layernorm.weight": 
"model-00011-of-00013.safetensors", + "model.layers.21.mlp.experts.down_proj": "model-00011-of-00013.safetensors", + "model.layers.21.mlp.experts.down_proj_bias": "model-00011-of-00013.safetensors", + "model.layers.21.mlp.experts.gate_up_proj": "model-00011-of-00013.safetensors", + "model.layers.21.mlp.experts.gate_up_proj_bias": "model-00011-of-00013.safetensors", + "model.layers.21.mlp.router.bias": "model-00011-of-00013.safetensors", + "model.layers.21.mlp.router.weight": "model-00011-of-00013.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00011-of-00013.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00011-of-00013.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00011-of-00013.safetensors", + "model.layers.21.self_attn.o_proj.bias": "model-00011-of-00013.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00011-of-00013.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00011-of-00013.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00011-of-00013.safetensors", + "model.layers.21.self_attn.sinks": "model-00011-of-00013.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00011-of-00013.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00011-of-00013.safetensors", + "model.layers.22.input_layernorm.weight": "model-00012-of-00013.safetensors", + "model.layers.22.mlp.experts.down_proj": "model-00012-of-00013.safetensors", + "model.layers.22.mlp.experts.down_proj_bias": "model-00012-of-00013.safetensors", + "model.layers.22.mlp.experts.gate_up_proj": "model-00012-of-00013.safetensors", + "model.layers.22.mlp.experts.gate_up_proj_bias": "model-00012-of-00013.safetensors", + "model.layers.22.mlp.router.bias": "model-00011-of-00013.safetensors", + "model.layers.22.mlp.router.weight": "model-00011-of-00013.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00012-of-00013.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00011-of-00013.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00011-of-00013.safetensors", + "model.layers.22.self_attn.o_proj.bias": "model-00011-of-00013.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00011-of-00013.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00011-of-00013.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00011-of-00013.safetensors", + "model.layers.22.self_attn.sinks": "model-00011-of-00013.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00011-of-00013.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00011-of-00013.safetensors", + "model.layers.23.input_layernorm.weight": "model-00012-of-00013.safetensors", + "model.layers.23.mlp.experts.down_proj": "model-00012-of-00013.safetensors", + "model.layers.23.mlp.experts.down_proj_bias": "model-00012-of-00013.safetensors", + "model.layers.23.mlp.experts.gate_up_proj": "model-00012-of-00013.safetensors", + "model.layers.23.mlp.experts.gate_up_proj_bias": "model-00012-of-00013.safetensors", + "model.layers.23.mlp.router.bias": "model-00012-of-00013.safetensors", + "model.layers.23.mlp.router.weight": "model-00012-of-00013.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00012-of-00013.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00012-of-00013.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00012-of-00013.safetensors", + "model.layers.23.self_attn.o_proj.bias": 
"model-00012-of-00013.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00012-of-00013.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00012-of-00013.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00012-of-00013.safetensors", + "model.layers.23.self_attn.sinks": "model-00012-of-00013.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00012-of-00013.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00012-of-00013.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00013.safetensors", + "model.layers.3.mlp.experts.down_proj": "model-00002-of-00013.safetensors", + "model.layers.3.mlp.experts.down_proj_bias": "model-00002-of-00013.safetensors", + "model.layers.3.mlp.experts.gate_up_proj": "model-00002-of-00013.safetensors", + "model.layers.3.mlp.experts.gate_up_proj_bias": "model-00002-of-00013.safetensors", + "model.layers.3.mlp.router.bias": "model-00002-of-00013.safetensors", + "model.layers.3.mlp.router.weight": "model-00002-of-00013.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00013.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.3.self_attn.o_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.3.self_attn.sinks": "model-00002-of-00013.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.4.input_layernorm.weight": "model-00003-of-00013.safetensors", + "model.layers.4.mlp.experts.down_proj": "model-00003-of-00013.safetensors", + "model.layers.4.mlp.experts.down_proj_bias": "model-00003-of-00013.safetensors", + "model.layers.4.mlp.experts.gate_up_proj": "model-00003-of-00013.safetensors", + "model.layers.4.mlp.experts.gate_up_proj_bias": "model-00003-of-00013.safetensors", + "model.layers.4.mlp.router.bias": "model-00002-of-00013.safetensors", + "model.layers.4.mlp.router.weight": "model-00002-of-00013.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00013.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.4.self_attn.o_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.4.self_attn.sinks": "model-00002-of-00013.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00013.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00013.safetensors", + "model.layers.5.input_layernorm.weight": "model-00003-of-00013.safetensors", + "model.layers.5.mlp.experts.down_proj": "model-00003-of-00013.safetensors", + "model.layers.5.mlp.experts.down_proj_bias": "model-00003-of-00013.safetensors", + "model.layers.5.mlp.experts.gate_up_proj": "model-00003-of-00013.safetensors", + 
"model.layers.5.mlp.experts.gate_up_proj_bias": "model-00003-of-00013.safetensors", + "model.layers.5.mlp.router.bias": "model-00003-of-00013.safetensors", + "model.layers.5.mlp.router.weight": "model-00003-of-00013.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00013.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00013.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00013.safetensors", + "model.layers.5.self_attn.o_proj.bias": "model-00003-of-00013.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00013.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00013.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00013.safetensors", + "model.layers.5.self_attn.sinks": "model-00003-of-00013.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00013.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00013.safetensors", + "model.layers.6.input_layernorm.weight": "model-00004-of-00013.safetensors", + "model.layers.6.mlp.experts.down_proj": "model-00004-of-00013.safetensors", + "model.layers.6.mlp.experts.down_proj_bias": "model-00004-of-00013.safetensors", + "model.layers.6.mlp.experts.gate_up_proj": "model-00004-of-00013.safetensors", + "model.layers.6.mlp.experts.gate_up_proj_bias": "model-00004-of-00013.safetensors", + "model.layers.6.mlp.router.bias": "model-00003-of-00013.safetensors", + "model.layers.6.mlp.router.weight": "model-00003-of-00013.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00004-of-00013.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00013.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00013.safetensors", + "model.layers.6.self_attn.o_proj.bias": "model-00003-of-00013.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00013.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00013.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00013.safetensors", + "model.layers.6.self_attn.sinks": "model-00003-of-00013.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00013.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00013.safetensors", + "model.layers.7.input_layernorm.weight": "model-00004-of-00013.safetensors", + "model.layers.7.mlp.experts.down_proj": "model-00004-of-00013.safetensors", + "model.layers.7.mlp.experts.down_proj_bias": "model-00004-of-00013.safetensors", + "model.layers.7.mlp.experts.gate_up_proj": "model-00004-of-00013.safetensors", + "model.layers.7.mlp.experts.gate_up_proj_bias": "model-00004-of-00013.safetensors", + "model.layers.7.mlp.router.bias": "model-00004-of-00013.safetensors", + "model.layers.7.mlp.router.weight": "model-00004-of-00013.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00013.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00004-of-00013.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00004-of-00013.safetensors", + "model.layers.7.self_attn.o_proj.bias": "model-00004-of-00013.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00004-of-00013.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00004-of-00013.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00004-of-00013.safetensors", + "model.layers.7.self_attn.sinks": "model-00004-of-00013.safetensors", 
+ "model.layers.7.self_attn.v_proj.bias": "model-00004-of-00013.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00004-of-00013.safetensors", + "model.layers.8.input_layernorm.weight": "model-00005-of-00013.safetensors", + "model.layers.8.mlp.experts.down_proj": "model-00005-of-00013.safetensors", + "model.layers.8.mlp.experts.down_proj_bias": "model-00005-of-00013.safetensors", + "model.layers.8.mlp.experts.gate_up_proj": "model-00005-of-00013.safetensors", + "model.layers.8.mlp.experts.gate_up_proj_bias": "model-00005-of-00013.safetensors", + "model.layers.8.mlp.router.bias": "model-00004-of-00013.safetensors", + "model.layers.8.mlp.router.weight": "model-00004-of-00013.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00005-of-00013.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00004-of-00013.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00004-of-00013.safetensors", + "model.layers.8.self_attn.o_proj.bias": "model-00004-of-00013.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00004-of-00013.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00004-of-00013.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00004-of-00013.safetensors", + "model.layers.8.self_attn.sinks": "model-00004-of-00013.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00004-of-00013.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00004-of-00013.safetensors", + "model.layers.9.input_layernorm.weight": "model-00005-of-00013.safetensors", + "model.layers.9.mlp.experts.down_proj": "model-00005-of-00013.safetensors", + "model.layers.9.mlp.experts.down_proj_bias": "model-00005-of-00013.safetensors", + "model.layers.9.mlp.experts.gate_up_proj": "model-00005-of-00013.safetensors", + "model.layers.9.mlp.experts.gate_up_proj_bias": "model-00005-of-00013.safetensors", + "model.layers.9.mlp.router.bias": "model-00005-of-00013.safetensors", + "model.layers.9.mlp.router.weight": "model-00005-of-00013.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00005-of-00013.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00005-of-00013.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00005-of-00013.safetensors", + "model.layers.9.self_attn.o_proj.bias": "model-00005-of-00013.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00005-of-00013.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00005-of-00013.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00005-of-00013.safetensors", + "model.layers.9.self_attn.sinks": "model-00005-of-00013.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00005-of-00013.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00005-of-00013.safetensors", + "model.norm.weight": "model-00012-of-00013.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..f93aae3 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file 
mode 100644 index 0000000..494a3ff --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1af7aff4e39ad377c689737ee7be1e8e77333d28c5ed52a0a137dc68e27557fb +size 27868174 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..402a35c --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,180 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|reserved_200017|>", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +}