commit 3036e84fc650c3c6882e0684c77a5298af38fe7e Author: ModelHub XC Date: Sun May 3 10:21:36 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: llm-jp/optimal-sparsity-math-d2048-E16-k4-7.1B-A2.3B Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..02f2f00 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,52 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +model-00002-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +model-00003-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text +model-00001-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..2b002e9 --- /dev/null +++ b/README.md @@ -0,0 +1,31 @@ +--- +pipeline_tag: text-generation +library_name: transformers +license: apache-2.0 +tags: + - mixtral + - moe + - reasoning +--- + +# Optimal Sparsity of Mixture-of-Experts Language Models for Reasoning Tasks + +This repository contains model checkpoints from the paper [Optimal Sparsity of Mixture-of-Experts Language Models for Reasoning Tasks](https://huggingface.co/papers/2508.18672). + +For more details, including code and evaluation procedures, please refer to the official GitHub repository: [https://github.com/rioyokotalab/optimal-sparsity](https://github.com/rioyokotalab/optimal-sparsity) + +## How to cite + +If you find our work helpful, please feel free to cite the paper. + +```bibtex +@inproceedings{ + nakamura2026optimal, + title={Optimal Sparsity of Mixture-of-Experts Language Models for Reasoning Tasks}, + author={Taishi Nakamura and Satoki Ishikawa and Masaki Kawamura and Takumi Okamoto and Daisuke Nohara and Jun Suzuki and Rio Yokota}, + booktitle={The Fourteenth International Conference on Learning Representations}, + year={2026}, + url={https://openreview.net/forum?id=XFw2EPRUUR} +} +``` + diff --git a/config.json b/config.json new file mode 100644 index 0000000..6e21646 --- /dev/null +++ b/config.json @@ -0,0 +1,32 @@ +{ + "architectures": [ + "MixtralForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 4096, + "mlp_bias": false, + "model_type": "mixtral", + "num_attention_heads": 16, + "num_experts_per_tok": 4, + "num_hidden_layers": 16, + "num_key_value_heads": 16, + "num_local_experts": 16, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_theta": 10000, + "router_aux_loss_coef": 0.01, + "router_jitter_noise": 0.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.0", + "use_cache": true, + "vocab_size": 99584 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000..e4b65be --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05a13fbfae192687b12395f10f594b241267b84749239e57d79796c886abaa0f +size 4989300312 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 0000000..ac4dd3e --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f32b01ce1ec25d0c5b965cc274951f07cff644e0e4e3e77770687505be569857 +size 4999649184 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new file mode 100644 index 0000000..a258282 --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e0977707db32ddee9e34916708d6513516779822a0cdb598303dffea4641aa0 +size 4249912864 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..4a23faf --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,890 @@ +{ + "metadata": { + "total_size": 14238748672 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.block_sparse_moe.gate.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.7.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.8.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.8.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.9.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.9.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.10.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.10.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.11.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.11.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.12.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.12.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.13.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.13.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.14.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.14.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.15.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.15.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.7.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.8.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.9.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.10.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.11.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.12.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.13.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.14.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.block_sparse_moe.experts.15.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.7.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.8.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.8.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.9.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.9.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.10.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.10.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.11.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.11.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.12.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.12.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.13.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.13.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.14.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.14.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.15.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.15.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.7.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.8.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.9.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.10.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.11.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.12.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.13.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.14.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.block_sparse_moe.experts.15.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.7.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.8.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.8.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.9.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.9.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.10.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.10.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.11.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.11.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.12.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.12.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.13.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.13.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.14.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.14.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.15.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.15.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.7.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.8.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.9.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.10.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.11.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.12.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.13.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.14.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.block_sparse_moe.experts.15.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.7.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.8.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.8.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.9.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.9.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.10.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.10.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.11.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.11.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.12.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.12.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.13.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.13.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.14.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.14.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.15.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.15.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.7.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.8.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.9.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.10.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.11.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.12.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.13.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.14.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.block_sparse_moe.experts.15.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.7.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.8.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.8.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.9.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.9.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.10.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.10.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.11.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.11.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.12.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.12.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.13.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.13.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.14.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.14.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.15.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.15.w3.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.7.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.8.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.9.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.10.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.11.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.12.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.13.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.14.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.block_sparse_moe.experts.15.w2.weight": "model-00001-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.8.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.8.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.9.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.9.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.10.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.10.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.11.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.11.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.12.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.12.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.13.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.13.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.14.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.14.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.15.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.15.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.8.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.9.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.10.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.11.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.12.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.13.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.14.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.5.block_sparse_moe.experts.15.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.8.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.8.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.9.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.9.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.10.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.10.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.11.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.11.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.12.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.12.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.13.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.13.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.14.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.14.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.15.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.15.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.8.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.9.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.10.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.11.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.12.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.13.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.14.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.6.block_sparse_moe.experts.15.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.8.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.8.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.9.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.9.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.10.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.10.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.11.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.11.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.12.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.12.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.13.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.13.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.14.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.14.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.15.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.15.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.8.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.9.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.10.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.11.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.12.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.13.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.14.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.7.block_sparse_moe.experts.15.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.8.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.8.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.9.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.9.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.10.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.10.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.11.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.11.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.12.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.12.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.13.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.13.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.14.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.14.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.15.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.15.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.8.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.9.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.10.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.11.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.12.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.13.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.14.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.8.block_sparse_moe.experts.15.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.8.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.8.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.9.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.9.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.10.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.10.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.11.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.11.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.12.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.12.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.13.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.13.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.14.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.14.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.15.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.15.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.8.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.9.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.10.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.11.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.12.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.13.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.14.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.9.block_sparse_moe.experts.15.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.8.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.8.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.9.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.9.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.10.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.10.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.11.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.11.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.12.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.12.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.13.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.13.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.14.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.14.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.15.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.15.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.8.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.9.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.10.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.11.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.12.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.13.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.14.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.10.block_sparse_moe.experts.15.w2.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.8.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.8.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.9.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.9.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.10.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.10.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.11.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.11.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.12.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.12.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.13.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.13.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.14.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.14.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.15.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.15.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.8.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.9.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.10.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.11.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.12.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.13.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.14.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.11.block_sparse_moe.experts.15.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.8.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.8.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.9.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.9.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.10.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.10.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.11.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.11.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.12.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.12.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.13.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.13.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.14.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.14.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.15.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.15.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.8.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.9.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.10.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.11.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.12.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.13.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.14.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.12.block_sparse_moe.experts.15.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.8.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.8.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.9.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.9.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.10.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.10.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.11.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.11.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.12.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.12.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.13.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.13.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.14.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.14.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.15.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.15.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.8.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.9.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.10.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.11.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.12.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.13.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.14.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.13.block_sparse_moe.experts.15.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.8.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.8.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.9.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.9.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.10.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.10.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.11.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.11.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.12.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.12.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.13.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.13.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.14.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.14.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.15.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.15.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.8.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.9.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.10.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.11.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.12.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.13.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.14.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.14.block_sparse_moe.experts.15.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.8.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.8.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.9.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.9.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.10.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.10.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.11.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.11.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.12.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.12.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.13.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.13.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.14.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.14.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.15.w1.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.15.w3.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.8.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.9.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.10.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.11.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.12.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.13.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.14.w2.weight": "model-00003-of-00003.safetensors", + "model.layers.15.block_sparse_moe.experts.15.w2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors", + "lm_head.weight": "model-00003-of-00003.safetensors" + } +} \ No newline at end of file diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..8644c8f --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": "", + "cls_token": "", + "eod_token": "", + "eos_token": "", + "mask_token": "", + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..fc80107 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955dc1fa623fab38cc92a3f4ee172423ae6d73201c4207569bfdf5626bc733f0 +size 6416433 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..09aa857 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,18 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "unk_token": "", + "bos_token": "", + "eos_token": "", + "pad_token": "", + "cls_token": "", + "sep_token": "", + "eod_token": "", + "mask_token": "", + "extra_ids": 0, + "sp_model_kwargs": {}, + "model_max_length": 1000000000000000019884624838656, + "clean_up_tokenization_spaces": false, + "special_tokens_map_file": null, + "tokenizer_class": "PreTrainedTokenizerFast" +}