commit eaa7a47bbcf066b6b15d12bbbcd62a7006456e5c Author: ModelHub XC Date: Sun Apr 12 10:53:00 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..9ce5540 --- 
/dev/null +++ b/README.md @@ -0,0 +1,92 @@ +--- +license: cc-by-nc-4.0 +--- + + +![image/png](https://cdn-uploads.huggingface.co/production/uploads/630dfb008df86f1e5becadc3/vwcJfOnL-2QDJ0ShfxRJ5.png) + + + +--- + +# Disclaimer: +## This model is experimental, do not expect everything to work. + +This model uses the Alpaca **prompting format** (or just directly download the SillyTavern instruct preset [here](https://files.catbox.moe/0ohmco.json)) + +--- + + +Beeg noromaid on ***steroids***. Suitable for RP, ERP. + +This time based on Mixtral Instruct, seems to do wonders! + +This model was trained for 8h(v1) + 8h(v2) + 12h(v3) on customized modified datasets, focusing on RP, uncensoring, and a modified version of the Alpaca prompting (that was already used in LimaRP), which should be at the same conversational level as ChatML or Llama2-Chat without adding any additional special tokens. + +If you wanna have more info about this model (and v1 + v2), you can check out [my blog post](https://ikaridevgit.github.io/index.html?p=7&blog=blogid-6&bo=true) + +[Recommended settings - Settings 1](https://huggingface.co/NeverSleep/Noromaid-v0.1-mixtral-8x7b-v3/discussions/1) + +[Recommended settings - Settings 2 (idk if they are any good)](https://files.catbox.moe/fv4xhu.json) + +## Credits: +- Undi +- IkariDev + + +## Description + + + +This repo contains FP16 files of Noromaid-v0.1-mixtral-8x7b-Instruct-v3. + +[FP16 - by IkariDev and Undi](https://huggingface.co/NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3) + + + + + + + + + + + +[GGUF - by IkariDev and Undi](https://huggingface.co/NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3-GGUF) + + +## Ratings: + +Note: We have permission from all users to upload their ratings; we DON'T screenshot random reviews without asking if we can put them here! + +No ratings yet! + +If you want your rating to be here, send us a message over on DC and we'll put up a screenshot of it here. DC names are "ikaridev" and "undi". 
+ + + +### Custom format: +``` +### Instruction: +{system prompt} + +### Input: +{input} + +### Response: +{reply} +``` + +## Datasets used: + +- Aesir 1 and 2 ([MinervaAI](https://huggingface.co/MinervaAI) / [Gryphe](https://huggingface.co/Gryphe)) +- [LimaRP-20231109](https://huggingface.co/datasets/lemonilia/LimaRP) ([Lemonilia](https://huggingface.co/lemonilia)) +- [ToxicDPO-NoWarning](https://huggingface.co/datasets/Undi95/toxic-dpo-v0.1-sharegpt) ([unalignment org repo](https://huggingface.co/unalignment) + [Undi](https://huggingface.co/Undi95)) +- [No-robots-ShareGPT](https://huggingface.co/datasets/Doctor-Shotgun/no-robots-sharegpt) ([Doctor-Shotgun](https://huggingface.co/Doctor-Shotgun)) + + +## Others + +Undi: If you want to support me, you can [here](https://ko-fi.com/undiai). + +IkariDev: Visit my [retro/neocities style website](https://ikaridevgit.github.io/) please kek \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..75368d0 --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "architectures": [ + "MixtralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "mixtral", + "num_attention_heads": 32, + "num_experts_per_tok": 2, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "num_local_experts": 8, + "output_router_logits": true, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "router_aux_loss_coef": 0.02, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..c533f93 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + 
"_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "transformers_version": "4.36.2" +} diff --git a/pytorch_model-00001-of-00019.bin b/pytorch_model-00001-of-00019.bin new file mode 100644 index 0000000..a9c39c2 --- /dev/null +++ b/pytorch_model-00001-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:944dc4d54308f6d651934949757fa9c347928065fbe598d456e54b6dca9f39c2 +size 4892820801 diff --git a/pytorch_model-00002-of-00019.bin b/pytorch_model-00002-of-00019.bin new file mode 100644 index 0000000..fdf7158 --- /dev/null +++ b/pytorch_model-00002-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a24a090d27be77b32fa11656168039bda06bc32ac7a901d0a2df6be15f6e8b2a +size 4983016125 diff --git a/pytorch_model-00003-of-00019.bin b/pytorch_model-00003-of-00019.bin new file mode 100644 index 0000000..40cd5b7 --- /dev/null +++ b/pytorch_model-00003-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c2e83d42856bdc94bf5952875019441a28dfdd69ce8989d67997f5aae517c3f +size 4983016209 diff --git a/pytorch_model-00004-of-00019.bin b/pytorch_model-00004-of-00019.bin new file mode 100644 index 0000000..85287ee --- /dev/null +++ b/pytorch_model-00004-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9197be8d0694eb9e02fe5bb14f855590401867a850c0beb7254aed9e0036b683 +size 4899045759 diff --git a/pytorch_model-00005-of-00019.bin b/pytorch_model-00005-of-00019.bin new file mode 100644 index 0000000..097c48a --- /dev/null +++ b/pytorch_model-00005-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff3f302fee1e6e5549c4eb8b0c9c964fe97c9e0ccf558fb8926aa1f6797d14a1 +size 4983016161 diff --git a/pytorch_model-00006-of-00019.bin b/pytorch_model-00006-of-00019.bin new file mode 100644 index 0000000..0476f72 --- /dev/null +++ b/pytorch_model-00006-of-00019.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9882c03b747df93a2911628c24bd344aea3f01cee04f790062d7f7f3eb75409f +size 4983016125 diff --git a/pytorch_model-00007-of-00019.bin b/pytorch_model-00007-of-00019.bin new file mode 100644 index 0000000..748fe7e --- /dev/null +++ b/pytorch_model-00007-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1f357b5eb896df9fcfce9f1db4ec04548b571f2305e7e7e8de454440ef3096a +size 4899045759 diff --git a/pytorch_model-00008-of-00019.bin b/pytorch_model-00008-of-00019.bin new file mode 100644 index 0000000..7baaa37 --- /dev/null +++ b/pytorch_model-00008-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e7bc9b97aa9fe1854c0097b3e719ae9fc4897d2d1bc2dd273f0bc34fcc99cf9 +size 4983016185 diff --git a/pytorch_model-00009-of-00019.bin b/pytorch_model-00009-of-00019.bin new file mode 100644 index 0000000..c9d0658 --- /dev/null +++ b/pytorch_model-00009-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1136dc942f87b92979648f476464476c657f30d5a0b457c20e97461bed828bb0 +size 4983016125 diff --git a/pytorch_model-00010-of-00019.bin b/pytorch_model-00010-of-00019.bin new file mode 100644 index 0000000..b5b8787 --- /dev/null +++ b/pytorch_model-00010-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5dad23664d9945c85c9284ca4f8ba298edfdd0dbde4b16271f26e9ec52327d4 +size 4899045759 diff --git a/pytorch_model-00011-of-00019.bin b/pytorch_model-00011-of-00019.bin new file mode 100644 index 0000000..f38e976 --- /dev/null +++ b/pytorch_model-00011-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:885af9d95b351304dbc9804cee2155a4809107c419e1794fc071e18171ea31b2 +size 4983016149 diff --git a/pytorch_model-00012-of-00019.bin b/pytorch_model-00012-of-00019.bin new file mode 100644 index 0000000..854da90 --- /dev/null +++ b/pytorch_model-00012-of-00019.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:69aeedb1a62a6875d214f596dc56123f4ad268397913479f3720164690d03001 +size 4983016149 diff --git a/pytorch_model-00013-of-00019.bin b/pytorch_model-00013-of-00019.bin new file mode 100644 index 0000000..c63ee61 --- /dev/null +++ b/pytorch_model-00013-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b21a536a109405ef3d5a9e9008927dc8a8e4fd6d566c7152cde6d5f661023b81 +size 4983016125 diff --git a/pytorch_model-00014-of-00019.bin b/pytorch_model-00014-of-00019.bin new file mode 100644 index 0000000..6c22fea --- /dev/null +++ b/pytorch_model-00014-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4117f0c4adaf62ae4179b80d60f8b4cf095317c8cdee3b707a21a8e1cd951431 +size 4899045759 diff --git a/pytorch_model-00015-of-00019.bin b/pytorch_model-00015-of-00019.bin new file mode 100644 index 0000000..8ed490f --- /dev/null +++ b/pytorch_model-00015-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9305bc0f6cda73bc60f1679430faceb9cf284e29971858fecc8ef1b24783b329 +size 4983016185 diff --git a/pytorch_model-00016-of-00019.bin b/pytorch_model-00016-of-00019.bin new file mode 100644 index 0000000..712fe4d --- /dev/null +++ b/pytorch_model-00016-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ca0bbef7e63f9565ec06d1f349fe25860b12831d62496a548821e0631d3e63 +size 4983016125 diff --git a/pytorch_model-00017-of-00019.bin b/pytorch_model-00017-of-00019.bin new file mode 100644 index 0000000..750edd8 --- /dev/null +++ b/pytorch_model-00017-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdcdb2420de7552ba0fa300185a9aedde7c6d4e5f0c710eff29bb3c3b6c1edeb +size 4899045759 diff --git a/pytorch_model-00018-of-00019.bin b/pytorch_model-00018-of-00019.bin new file mode 100644 index 0000000..c6b88a6 --- /dev/null +++ b/pytorch_model-00018-of-00019.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9f264f38ca462b776f0d441e0382ce7bdf9848aa2fe992ab111671ad556da152 +size 4983016161 diff --git a/pytorch_model-00019-of-00019.bin b/pytorch_model-00019-of-00019.bin new file mode 100644 index 0000000..3bbb24e --- /dev/null +++ b/pytorch_model-00019-of-00019.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac74a18ea2e0d9e59b8f09ea1076b21ab73442e1846e8d11116a27a1209bf3ed +size 4221688679 diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json new file mode 100644 index 0000000..c105000 --- /dev/null +++ b/pytorch_model.bin.index.json @@ -0,0 +1,1002 @@ +{ + "metadata": { + "total_size": 93405585408 + }, + "weight_map": { + "lm_head.weight": "pytorch_model-00019-of-00019.bin", + "model.embed_tokens.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00001-of-00019.bin", + 
"model.layers.0.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.block_sparse_moe.gate.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00001-of-00019.bin", + 
"model.layers.1.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.block_sparse_moe.gate.weight": 
"pytorch_model-00001-of-00019.bin", + "model.layers.1.input_layernorm.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.5.w1.weight": 
"pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.block_sparse_moe.gate.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.10.input_layernorm.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.2.w1.weight": 
"pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.11.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.11.block_sparse_moe.gate.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.input_layernorm.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00007-of-00019.bin", + 
"model.layers.11.self_attn.o_proj.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00007-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.6.w1.weight": 
"pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.block_sparse_moe.gate.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.input_layernorm.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.3.w1.weight": 
"pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.block_sparse_moe.gate.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.input_layernorm.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00008-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.0.w1.weight": 
"pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.7.w1.weight": 
"pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.block_sparse_moe.gate.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.input_layernorm.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.4.w1.weight": 
"pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.block_sparse_moe.gate.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.15.input_layernorm.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00009-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.1.w1.weight": 
"pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.16.block_sparse_moe.gate.weight": 
"pytorch_model-00010-of-00019.bin", + "model.layers.16.input_layernorm.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00010-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.5.w1.weight": 
"pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.block_sparse_moe.gate.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.input_layernorm.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.2.w1.weight": 
"pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.18.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.18.block_sparse_moe.gate.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.input_layernorm.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00011-of-00019.bin", + 
"model.layers.18.self_attn.o_proj.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00011-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.6.w1.weight": 
"pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.block_sparse_moe.gate.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.input_layernorm.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.3.w1.weight": 
"pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.block_sparse_moe.gate.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.input_layernorm.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.0.w1.weight": 
"pytorch_model-00012-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.7.w1.weight": 
"pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.block_sparse_moe.gate.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.input_layernorm.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00012-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.4.w1.weight": 
"pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.block_sparse_moe.gate.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.input_layernorm.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.1.w1.weight": 
"pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.block_sparse_moe.gate.weight": 
"pytorch_model-00013-of-00019.bin", + "model.layers.22.input_layernorm.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00013-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.5.w1.weight": 
"pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.23.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.23.block_sparse_moe.gate.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.input_layernorm.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00014-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.2.w1.weight": 
"pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.block_sparse_moe.gate.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.input_layernorm.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00015-of-00019.bin", + 
"model.layers.24.self_attn.o_proj.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.6.w1.weight": 
"pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.block_sparse_moe.gate.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.input_layernorm.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00015-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.3.w1.weight": 
"pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.block_sparse_moe.gate.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.input_layernorm.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.0.w1.weight": 
"pytorch_model-00016-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.7.w1.weight": 
"pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.block_sparse_moe.gate.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.27.input_layernorm.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00016-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.4.w1.weight": 
"pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.28.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.28.block_sparse_moe.gate.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.input_layernorm.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00017-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.1.w1.weight": 
"pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.block_sparse_moe.gate.weight": 
"pytorch_model-00018-of-00019.bin", + "model.layers.29.input_layernorm.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.5.w1.weight": 
"pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.block_sparse_moe.gate.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.3.input_layernorm.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00002-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.2.w1.weight": 
"pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.30.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.30.block_sparse_moe.gate.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.input_layernorm.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00018-of-00019.bin", + 
"model.layers.30.self_attn.o_proj.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00018-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.6.w1.weight": 
"pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.block_sparse_moe.gate.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.input_layernorm.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00019-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.3.w1.weight": 
"pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.block_sparse_moe.gate.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.input_layernorm.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.0.w1.weight": 
"pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00004-of-00019.bin", + 
"model.layers.5.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.block_sparse_moe.gate.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.5.input_layernorm.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00003-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00004-of-00019.bin", + 
"model.layers.6.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.6.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.6.block_sparse_moe.gate.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.input_layernorm.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00004-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00005-of-00019.bin", + 
"model.layers.7.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.block_sparse_moe.gate.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.input_layernorm.weight": 
"pytorch_model-00005-of-00019.bin", + "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.5.w2.weight": 
"pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.block_sparse_moe.gate.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.input_layernorm.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00005-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.2.w2.weight": 
"pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.block_sparse_moe.gate.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.input_layernorm.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00006-of-00019.bin", + "model.layers.9.self_attn.q_proj.weight": 
"pytorch_model-00006-of-00019.bin", + "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00006-of-00019.bin", + "model.norm.weight": "pytorch_model-00019-of-00019.bin" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..72ecfee --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..8b443ef --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055 +size 493443 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..bc00187 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,45 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [], + "bos_token": "", + "chat_template": "{% set ns = namespace(found=false) %}{% for message in messages %}{% if message['role'] == 'system' %}{% set ns.found = true %}{% endif %}{% endfor %}{% if not ns.found %}{{ '### Instruction:\nYou are a chatbot.\n\n' }}{% endif %}{% for message in messages 
%}{% if message['role'] == 'system' %}{{ '### Instruction:\n' + message['content'] + '\n\n' }}{% else %}{% if message['role'] == 'user' %}{{ '### Input:\n' + message['content'] + '\n\n' }}{% else %}{{ '### Response:\n' + message['content'] + '\n\n' }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '### Response:\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "trust_remote_code": false, + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +}