commit 1105c974d047e4fd7614be0ebcc576acf2dcddb1 Author: ModelHub XC Date: Sat Apr 11 12:14:58 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: inclusionAI/Ring-lite-2506 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..21b3632 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,49 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs 
merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..fc637dc --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 inclusionAI + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..12ea3e9 --- /dev/null +++ b/README.md @@ -0,0 +1,106 @@ +--- +license: mit +language: +- zh +- en +base_model: +- inclusionAI/Ling-lite-base-1.5 +--- +# Ring-lite-2506 + +

+ +

+ +

+ 🤗 Hugging Face +

+ +## Introduction + +Ring-lite-2506 is a lightweight, fully open-sourced MoE (Mixture of Experts) LLM designed for complex reasoning tasks. It is built upon the publicly available [Ling-lite-1.5](https://huggingface.co/inclusionAI/Ling-lite-1.5) model, which has 16.8B parameters with 2.75B activated parameters. We use a joint training pipeline combining knowledge distillation with reinforcement learning, achieving performance comparable to state-of-the-art (SOTA) small-size reasoning models on challenging benchmarks (AIME, LiveCodeBench, and GPQA-Diamond) while activating only one-third of their parameters. + + + +## Model Downloads + +

+ +| **Model** | **#Total Params** | **#Activated Params** | **Context Length** | **Download** | +| :----------------: | :---------------: | :-------------------: | :----------------: | :----------: | +| Ring-lite-2506 | 16.8B | 2.75B | 128K | [🤗 HuggingFace](https://huggingface.co/inclusionAI/Ring-lite-2506) | + +
+ +## Evaluation +For a comprehensive evaluation of the quality of our reasoning models, we implemented automatic benchmarks to assess their performance across math, code, and science. + +

+ +

+ + + +More details are reported in our [technical report](https://arxiv.org/abs/2506.14731). + +## Quickstart + +### 🤗 Hugging Face Transformers +Here is a code snippet to show you how to use the chat model with `transformers`: + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_name = "inclusionAI/Ring-lite-2506" + +model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype="auto", + device_map="auto" +) +tokenizer = AutoTokenizer.from_pretrained(model_name) + +prompt = "Give me a short introduction to large language models." +messages = [ + {"role": "system", "content": "You are Ring, an assistant created by inclusionAI"}, + {"role": "user", "content": prompt} +] +text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True +) +model_inputs = tokenizer([text], return_tensors="pt").to(model.device) + +generated_ids = model.generate( + **model_inputs, + max_new_tokens=8192 +) +generated_ids = [ + output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) +] + +response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +``` + +## Dataset +The training data of Ring-lite-2506 is released at [Ring-lite-sft-data](https://huggingface.co/datasets/inclusionAI/Ring-lite-sft-data) and [Ring-lite-rl-data](https://huggingface.co/datasets/inclusionAI/Ring-lite-rl-data). + +## Deployment +Please refer to [GitHub](https://github.com/inclusionAI/Ring/blob/main/README.md) + +## License +This code repository is licensed under [the MIT License](https://huggingface.co/inclusionAI/Ring-lite-2506/blob/main/LICENSE). 
+ +## Citation +``` +@misc{ringteam2025ringlitescalablereasoningc3postabilized, + title={Ring-lite: Scalable Reasoning via C3PO-Stabilized Reinforcement Learning for LLMs}, + author={Ling Team}, + year={2025}, + eprint={2506.14731}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2506.14731}, +} +``` diff --git a/config.json b/config.json new file mode 100644 index 0000000..ffe4e25 --- /dev/null +++ b/config.json @@ -0,0 +1,44 @@ +{ + "architectures": [ + "BailingMoeForCausalLM" + ], + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bailing_moe.BailingMoeConfig", + "AutoModel": "modeling_bailing_moe.BailingMoeModel", + "AutoModelForCausalLM": "modeling_bailing_moe.BailingMoeForCausalLM" + }, + "eos_token_id": 126081, + "pad_token_id": 126081, + "first_k_dense_replace": 0, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.006, + "intermediate_size": 1408, + "max_position_embeddings": 32768, + "model_type": "bailing_moe", + "moe_intermediate_size": 1408, + "num_experts": 64, + "num_shared_experts": 2, + "norm_topk_prob": true, + "num_attention_heads": 16, + "num_experts_per_tok": 6, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 600000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0", + "use_cache": true, + "use_bias": false, + "use_qkv_bias": false, + "vocab_size": 126464, + "output_router_logits": false, + "embedding_dropout": 0.0, + "norm_head": false, + "norm_softmax": false, + "output_dropout": 0.0 +} \ No newline at end of file diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..f9291c3 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework":"Pytorch","task":"text-generation"} \ No newline at end of file diff --git a/configuration_bailing_moe.py b/configuration_bailing_moe.py new file mode 100644 
index 0000000..3f028c8 --- /dev/null +++ b/configuration_bailing_moe.py @@ -0,0 +1,78 @@ +""" Bailing MoE model configuration """ + +from transformers.configuration_utils import PretrainedConfig + + +class BailingMoeConfig(PretrainedConfig): + model_type = "bailing_moe" + + def __init__( + self, + vocab_size=30592, + hidden_size=1024, + intermediate_size=None, + num_hidden_layers=24, + num_attention_heads=16, + num_key_value_heads=0, + hidden_act="silu", + use_qkv_bias=False, # bailing only + use_bias=True, # bailing only + rms_norm_eps=1e-05, + norm_head=False, # bailing only + tie_word_embeddings=False, # PretrainedConfig key, here change default value. + embedding_dropout=0.1, + attention_dropout=0.1, + output_dropout=0.1, + initializer_range=0.02, + max_position_embeddings=16384, + rope_theta=10000.0, + use_cache=True, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + rope_scaling=None, + pad_token_id=126081, + num_experts=16, + num_shared_experts=0, + num_experts_per_tok=2, + norm_topk_prob=True, + moe_intermediate_size=None, + first_k_dense_replace=0, + head_dim=None, + output_router_logits=False, + **kwargs, + ): + self.num_hidden_layers = num_hidden_layers + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.use_qkv_bias = use_qkv_bias + self.use_bias = use_bias + self.norm_head = norm_head + self.rms_norm_eps = rms_norm_eps + self.embedding_dropout = embedding_dropout + self.attention_dropout = attention_dropout + self.output_dropout = output_dropout + self.initializer_range = initializer_range + self.max_position_embeddings = max_position_embeddings + self.rope_theta = rope_theta + self.use_cache = use_cache + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + 
self.head_dim = head_dim or self.hidden_size // self.num_attention_heads + self.rope_scaling = rope_scaling + + # MoE configs + self.num_experts = num_experts + self.num_shared_experts = num_shared_experts + self.num_experts_per_tok = num_experts_per_tok + self.norm_topk_prob = norm_topk_prob + self.moe_intermediate_size = moe_intermediate_size + self.first_k_dense_replace = first_k_dense_replace + self.output_router_logits = output_router_logits + + super().__init__(pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..c66e3e6 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 126081, + "pad_token_id": 126081, + "transformers_version": "4.40.0" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..7c086d1 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b83afd0fe022e3552a74da807869c865b734c88523da27a4510a32f085d9ec33 +size 10000012352 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..d03e93d --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ccecf15d89900ba376bff70aac7266a90320205b549337883e0fbd0e7386a6 +size 9997403496 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..c86a6e2 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b72c9cb181ce3ce4ba02afa6d78b8ef82f85c8b7da85b7cd472aa58517ef937c +size 9995576736 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..2f62281 --- /dev/null +++ 
b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b3384dc88a60dbc5a20a7fe6617163374b1adbdd5a07c6669192e185a34abc +size 3611653272 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..6702a62 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,5610 @@ +{ + "metadata": { + "total_size": 33603948544 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.layers.0.attention.dense.weight": "model-00001-of-00004.safetensors", + "model.layers.0.attention.query_key_value.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.0.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.0.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.0.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.1.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.1.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.1.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.10.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.10.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.10.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.11.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.11.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.11.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.12.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.12.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.0.mlp.experts.12.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.13.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.13.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.13.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.14.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.14.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.14.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.15.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.15.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.15.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.16.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.16.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.16.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.17.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.17.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.17.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.18.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.18.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.18.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.19.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.19.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.19.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.2.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.2.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.2.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.20.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.20.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.20.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.21.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.21.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.21.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.22.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.22.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.22.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.23.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.23.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.23.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.24.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.24.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.24.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.25.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.25.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.25.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.26.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.26.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.0.mlp.experts.26.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.27.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.27.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.27.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.28.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.28.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.28.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.29.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.29.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.29.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.3.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.3.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.3.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.30.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.30.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.30.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.31.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.31.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.31.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.32.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.32.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.32.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.33.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.33.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.33.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.34.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.34.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.34.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.35.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.35.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.35.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.36.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.36.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.36.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.37.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.37.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.37.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.38.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.38.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.38.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.39.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.39.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.39.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.4.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.4.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.0.mlp.experts.4.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.40.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.40.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.40.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.41.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.41.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.41.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.42.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.42.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.42.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.43.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.43.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.43.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.44.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.44.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.44.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.45.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.45.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.45.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.46.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.46.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.46.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.47.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.47.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.47.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.48.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.48.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.48.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.49.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.49.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.49.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.5.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.5.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.5.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.50.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.50.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.50.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.51.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.51.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.51.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.52.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.52.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.52.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.53.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.53.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.0.mlp.experts.53.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.54.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.54.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.54.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.55.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.55.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.55.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.56.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.56.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.56.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.57.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.57.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.57.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.58.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.58.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.58.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.59.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.59.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.59.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.6.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.6.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.6.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.60.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.60.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.60.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.61.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.61.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.61.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.62.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.62.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.62.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.63.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.63.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.63.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.7.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.7.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.7.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.8.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.8.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.8.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.9.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.9.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.experts.9.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.shared_experts.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.shared_experts.gate_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.mlp.shared_experts.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.attention.dense.weight": "model-00001-of-00004.safetensors", + "model.layers.1.attention.query_key_value.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.0.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.0.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.0.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.1.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.1.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.1.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.10.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.10.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.10.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.11.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.11.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.11.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.12.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.12.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.12.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.13.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.13.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.13.up_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.14.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.14.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.14.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.15.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.15.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.15.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.16.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.16.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.16.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.17.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.17.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.17.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.18.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.18.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.18.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.19.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.19.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.19.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.2.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.2.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.2.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.20.down_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.1.mlp.experts.20.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.20.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.21.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.21.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.21.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.22.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.22.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.22.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.23.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.23.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.23.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.24.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.24.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.24.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.25.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.25.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.25.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.26.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.26.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.26.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.27.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.27.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.27.up_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.28.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.28.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.28.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.29.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.29.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.29.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.3.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.3.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.3.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.30.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.30.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.30.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.31.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.31.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.31.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.32.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.32.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.32.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.33.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.33.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.33.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.34.down_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.1.mlp.experts.34.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.34.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.35.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.35.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.35.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.36.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.36.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.36.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.37.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.37.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.37.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.38.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.38.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.38.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.39.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.39.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.39.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.4.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.4.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.4.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.40.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.40.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.40.up_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.41.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.41.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.41.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.42.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.42.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.42.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.43.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.43.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.43.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.44.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.44.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.44.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.45.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.45.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.45.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.46.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.46.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.46.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.47.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.47.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.47.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.48.down_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.1.mlp.experts.48.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.48.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.49.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.49.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.49.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.5.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.5.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.5.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.50.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.50.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.50.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.51.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.51.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.51.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.52.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.52.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.52.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.53.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.53.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.53.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.54.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.54.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.54.up_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.55.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.55.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.55.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.56.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.56.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.56.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.57.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.57.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.57.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.58.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.58.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.58.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.59.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.59.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.59.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.6.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.6.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.6.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.60.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.60.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.60.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.61.down_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.1.mlp.experts.61.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.61.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.62.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.62.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.62.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.63.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.63.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.63.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.7.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.7.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.7.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.8.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.8.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.8.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.9.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.9.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.experts.9.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.shared_experts.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.shared_experts.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.shared_experts.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.10.attention.dense.weight": "model-00002-of-00004.safetensors", + 
"model.layers.10.attention.query_key_value.weight": "model-00002-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.0.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.0.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.0.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.1.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.1.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.1.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.10.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.10.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.10.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.11.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.11.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.11.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.12.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.12.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.12.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.13.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.13.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.13.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.14.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.14.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.14.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.15.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.15.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.15.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.16.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.16.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.16.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.17.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.17.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.17.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.18.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.18.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.18.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.19.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.19.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.19.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.2.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.2.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.2.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.20.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.20.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.20.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.21.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.10.mlp.experts.21.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.21.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.22.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.22.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.22.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.23.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.23.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.23.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.24.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.24.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.24.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.25.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.25.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.25.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.26.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.26.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.26.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.27.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.27.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.27.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.28.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.28.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.28.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.29.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.29.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.29.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.3.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.3.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.3.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.30.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.30.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.30.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.31.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.31.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.31.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.32.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.32.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.32.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.33.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.33.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.33.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.34.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.34.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.34.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.35.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.10.mlp.experts.35.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.35.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.36.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.36.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.36.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.37.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.37.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.37.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.38.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.38.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.38.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.39.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.39.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.39.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.4.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.4.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.4.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.40.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.40.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.40.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.41.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.41.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.41.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.42.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.42.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.42.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.43.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.43.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.43.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.44.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.44.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.44.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.45.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.45.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.45.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.46.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.46.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.46.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.47.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.47.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.47.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.48.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.48.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.48.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.49.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.10.mlp.experts.49.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.49.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.5.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.5.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.5.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.50.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.50.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.50.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.51.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.51.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.51.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.52.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.52.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.52.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.53.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.53.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.53.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.54.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.54.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.54.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.55.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.55.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.55.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.56.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.56.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.56.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.57.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.57.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.57.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.58.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.58.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.58.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.59.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.59.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.59.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.6.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.6.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.6.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.60.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.60.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.60.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.61.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.61.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.61.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.62.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.10.mlp.experts.62.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.62.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.63.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.63.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.63.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.7.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.7.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.7.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.8.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.8.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.8.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.9.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.9.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.experts.9.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.attention.dense.weight": "model-00002-of-00004.safetensors", + "model.layers.11.attention.query_key_value.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.0.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.0.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.0.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.1.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.1.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.1.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.10.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.10.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.10.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.11.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.11.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.11.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.12.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.12.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.12.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.13.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.13.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.13.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.14.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.14.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.14.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.15.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.15.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.mlp.experts.15.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.16.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.16.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.16.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.17.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.17.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.17.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.18.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.18.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.18.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.19.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.19.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.19.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.2.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.2.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.2.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.20.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.20.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.20.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.21.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.21.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.21.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.22.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.22.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.22.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.23.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.23.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.23.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.24.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.24.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.24.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.25.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.25.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.25.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.26.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.26.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.26.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.27.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.27.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.27.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.28.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.28.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.28.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.29.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.29.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.mlp.experts.29.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.3.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.3.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.3.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.30.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.30.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.30.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.31.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.31.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.31.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.32.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.32.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.32.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.33.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.33.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.33.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.34.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.34.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.34.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.35.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.35.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.35.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.36.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.36.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.36.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.37.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.37.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.37.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.38.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.38.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.38.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.39.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.39.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.39.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.4.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.4.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.4.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.40.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.40.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.40.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.41.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.41.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.41.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.42.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.42.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.mlp.experts.42.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.43.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.43.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.43.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.44.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.44.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.44.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.45.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.45.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.45.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.46.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.46.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.46.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.47.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.47.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.47.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.48.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.48.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.48.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.49.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.49.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.49.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.5.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.5.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.5.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.50.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.50.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.50.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.51.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.51.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.51.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.52.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.52.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.52.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.53.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.53.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.53.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.54.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.54.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.54.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.55.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.55.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.55.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.56.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.56.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.mlp.experts.56.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.57.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.57.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.57.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.58.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.58.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.58.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.59.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.59.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.59.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.6.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.6.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.6.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.60.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.60.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.60.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.61.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.61.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.61.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.62.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.62.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.62.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.63.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.63.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.63.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.7.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.7.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.7.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.8.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.8.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.8.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.9.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.9.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.experts.9.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.attention.dense.weight": "model-00002-of-00004.safetensors", + "model.layers.12.attention.query_key_value.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.0.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.0.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.0.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.12.mlp.experts.1.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.1.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.1.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.10.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.10.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.10.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.11.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.11.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.11.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.12.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.12.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.12.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.13.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.13.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.13.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.14.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.14.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.14.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.15.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.15.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.15.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.16.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.16.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.16.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.17.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.17.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.17.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.18.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.18.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.18.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.19.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.19.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.19.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.2.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.2.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.2.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.20.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.20.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.20.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.21.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.21.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.21.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.22.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.22.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.22.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.12.mlp.experts.23.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.23.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.23.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.24.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.24.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.24.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.25.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.25.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.25.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.26.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.26.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.26.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.27.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.27.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.27.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.28.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.28.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.28.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.29.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.29.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.29.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.3.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.3.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.3.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.30.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.30.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.30.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.31.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.31.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.31.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.32.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.32.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.32.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.33.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.33.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.33.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.34.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.34.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.34.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.35.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.35.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.35.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.36.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.36.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.36.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.12.mlp.experts.37.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.37.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.37.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.38.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.38.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.38.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.39.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.39.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.39.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.4.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.4.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.4.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.40.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.40.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.40.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.41.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.41.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.41.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.42.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.42.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.42.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.43.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.43.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.43.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.44.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.44.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.44.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.45.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.45.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.45.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.46.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.46.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.46.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.47.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.47.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.47.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.48.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.48.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.48.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.49.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.49.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.49.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.5.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.5.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.5.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.12.mlp.experts.50.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.50.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.50.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.51.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.51.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.51.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.52.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.52.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.52.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.53.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.53.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.53.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.54.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.54.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.54.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.55.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.55.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.55.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.56.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.56.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.56.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.57.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.57.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.57.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.58.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.58.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.58.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.59.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.59.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.59.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.6.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.6.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.6.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.60.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.60.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.60.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.61.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.61.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.61.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.62.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.62.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.62.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.63.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.63.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.63.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.12.mlp.experts.7.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.7.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.7.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.8.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.8.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.8.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.9.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.9.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.experts.9.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.attention.dense.weight": "model-00002-of-00004.safetensors", + "model.layers.13.attention.query_key_value.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.0.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.0.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.0.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.1.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.1.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.1.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.13.mlp.experts.10.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.10.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.10.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.11.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.11.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.11.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.12.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.12.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.12.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.13.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.13.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.13.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.14.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.14.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.14.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.15.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.15.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.15.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.16.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.16.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.16.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.17.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.17.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.17.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.18.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.18.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.18.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.19.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.19.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.19.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.2.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.2.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.2.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.20.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.20.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.20.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.21.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.21.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.21.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.22.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.22.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.22.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.23.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.23.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.23.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.13.mlp.experts.24.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.24.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.24.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.25.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.25.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.25.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.26.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.26.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.26.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.27.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.27.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.27.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.28.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.28.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.28.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.29.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.29.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.29.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.3.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.3.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.3.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.30.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.30.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.30.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.31.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.31.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.31.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.32.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.32.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.32.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.33.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.33.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.33.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.34.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.34.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.34.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.35.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.35.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.35.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.36.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.36.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.36.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.37.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.37.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.37.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.13.mlp.experts.38.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.38.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.38.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.39.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.39.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.39.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.4.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.4.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.4.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.40.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.40.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.40.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.41.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.41.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.41.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.42.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.42.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.42.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.43.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.43.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.43.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.44.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.44.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.44.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.45.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.45.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.45.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.46.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.46.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.46.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.47.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.47.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.47.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.48.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.48.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.48.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.49.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.49.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.49.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.5.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.5.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.5.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.50.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.50.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.50.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.13.mlp.experts.51.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.51.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.51.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.52.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.52.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.52.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.53.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.53.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.53.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.54.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.54.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.54.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.55.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.55.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.55.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.56.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.56.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.56.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.57.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.57.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.57.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.58.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.58.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.58.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.59.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.59.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.59.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.6.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.6.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.6.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.60.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.60.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.60.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.61.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.61.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.61.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.62.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.62.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.62.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.63.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.63.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.63.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.7.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.7.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.7.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.13.mlp.experts.8.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.8.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.8.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.9.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.9.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.experts.9.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.attention.dense.weight": "model-00002-of-00004.safetensors", + "model.layers.14.attention.query_key_value.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.0.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.0.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.0.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.1.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.1.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.1.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.10.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.10.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.10.up_proj.weight": "model-00002-of-00004.safetensors", 
+ "model.layers.14.mlp.experts.11.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.11.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.11.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.12.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.12.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.12.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.13.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.13.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.13.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.14.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.14.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.14.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.15.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.15.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.15.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.16.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.16.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.16.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.17.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.17.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.17.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.18.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.18.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.18.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.19.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.19.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.19.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.2.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.2.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.2.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.20.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.20.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.20.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.21.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.21.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.21.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.22.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.22.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.22.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.23.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.23.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.23.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.24.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.24.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.24.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.14.mlp.experts.25.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.25.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.25.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.26.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.26.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.26.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.27.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.27.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.27.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.28.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.28.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.28.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.29.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.29.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.29.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.3.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.3.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.3.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.30.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.30.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.30.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.31.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.31.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.31.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.32.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.32.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.32.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.33.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.33.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.33.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.34.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.34.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.34.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.35.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.35.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.35.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.36.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.36.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.36.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.37.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.37.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.37.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.38.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.38.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.38.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.14.mlp.experts.39.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.39.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.39.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.4.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.4.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.4.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.40.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.40.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.40.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.41.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.41.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.41.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.42.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.42.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.42.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.43.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.43.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.43.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.44.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.44.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.44.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.45.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.45.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.45.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.46.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.46.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.46.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.47.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.47.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.47.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.48.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.48.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.48.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.49.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.49.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.49.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.5.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.5.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.5.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.50.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.50.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.50.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.51.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.51.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.51.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.14.mlp.experts.52.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.52.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.52.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.53.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.53.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.53.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.54.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.54.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.54.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.55.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.55.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.55.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.56.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.56.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.56.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.57.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.57.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.57.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.58.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.58.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.58.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.59.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.59.gate_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.59.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.6.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.6.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.6.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.60.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.60.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.60.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.61.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.61.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.61.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.62.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.62.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.62.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.63.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.63.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.63.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.7.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.7.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.7.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.8.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.8.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.8.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.14.mlp.experts.9.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.9.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.experts.9.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.attention.dense.weight": "model-00002-of-00004.safetensors", + "model.layers.15.attention.query_key_value.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.0.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.0.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.0.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.1.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.1.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.1.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.10.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.10.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.10.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.11.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.11.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.11.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.12.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.12.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.12.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.13.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.13.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.13.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.14.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.14.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.14.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.15.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.15.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.15.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.16.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.16.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.16.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.17.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.17.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.17.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.18.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.18.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.18.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.19.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.15.mlp.experts.19.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.19.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.2.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.2.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.2.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.20.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.20.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.20.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.21.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.21.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.21.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.22.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.22.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.22.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.23.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.23.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.23.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.24.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.24.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.24.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.25.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.25.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.25.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.26.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.26.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.26.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.27.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.27.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.27.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.28.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.28.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.28.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.29.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.29.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.29.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.3.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.3.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.3.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.30.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.30.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.30.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.31.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.31.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.31.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.32.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.15.mlp.experts.32.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.32.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.33.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.33.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.33.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.34.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.34.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.34.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.35.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.35.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.35.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.36.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.36.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.36.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.37.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.37.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.37.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.38.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.38.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.38.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.39.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.39.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.39.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.4.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.4.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.4.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.40.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.40.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.40.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.41.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.41.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.41.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.42.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.42.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.42.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.43.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.43.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.43.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.44.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.44.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.44.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.45.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.45.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.45.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.46.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.15.mlp.experts.46.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.46.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.47.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.47.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.47.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.48.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.48.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.48.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.49.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.49.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.49.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.5.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.5.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.5.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.50.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.50.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.50.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.51.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.51.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.51.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.52.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.52.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.52.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.53.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.53.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.53.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.54.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.54.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.54.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.55.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.55.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.55.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.56.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.56.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.56.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.57.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.57.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.57.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.58.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.58.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.58.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.59.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.59.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.59.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.6.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.15.mlp.experts.6.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.6.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.60.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.60.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.60.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.61.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.61.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.61.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.62.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.62.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.62.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.63.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.63.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.63.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.7.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.7.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.7.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.8.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.8.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.8.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.9.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.9.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.experts.9.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.attention.dense.weight": "model-00002-of-00004.safetensors", + "model.layers.16.attention.query_key_value.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.0.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.0.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.0.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.1.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.1.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.1.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.10.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.10.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.10.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.11.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.11.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.11.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.12.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.12.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.16.mlp.experts.12.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.13.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.13.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.13.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.14.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.14.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.14.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.15.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.15.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.15.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.16.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.16.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.16.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.17.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.17.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.17.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.18.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.18.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.18.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.19.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.19.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.19.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.2.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.2.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.2.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.20.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.20.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.20.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.21.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.21.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.21.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.22.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.22.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.22.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.23.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.23.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.23.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.24.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.24.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.24.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.25.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.25.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.25.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.26.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.26.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.16.mlp.experts.26.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.27.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.27.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.27.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.28.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.28.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.28.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.29.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.29.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.29.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.3.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.3.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.3.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.30.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.30.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.30.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.31.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.31.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.31.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.32.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.32.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.32.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.33.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.33.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.33.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.34.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.34.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.34.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.35.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.35.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.35.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.36.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.36.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.36.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.37.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.37.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.37.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.38.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.38.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.38.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.39.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.39.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.39.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.4.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.4.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.16.mlp.experts.4.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.40.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.40.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.40.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.41.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.41.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.41.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.42.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.42.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.42.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.43.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.43.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.43.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.44.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.44.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.44.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.45.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.45.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.45.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.46.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.46.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.46.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.47.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.47.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.47.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.48.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.48.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.48.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.49.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.49.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.49.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.5.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.5.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.5.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.50.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.50.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.50.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.51.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.51.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.51.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.52.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.52.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.52.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.53.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.53.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.16.mlp.experts.53.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.54.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.54.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.54.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.55.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.55.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.55.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.56.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.56.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.56.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.57.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.57.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.57.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.58.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.58.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.58.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.59.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.59.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.59.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.6.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.6.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.6.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.60.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.60.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.60.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.61.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.61.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.61.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.62.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.62.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.62.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.63.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.63.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.63.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.experts.7.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.7.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.7.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.8.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.8.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.8.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.9.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.9.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.experts.9.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.16.mlp.shared_experts.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.17.attention.dense.weight": "model-00003-of-00004.safetensors", + "model.layers.17.attention.query_key_value.weight": "model-00003-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.0.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.0.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.0.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.1.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.1.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.1.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.10.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.10.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.10.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.11.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.11.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.11.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.12.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.12.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.12.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.13.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.13.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.13.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.14.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.14.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.14.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.15.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.15.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.15.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.16.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.16.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.16.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.17.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.17.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.17.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.18.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.18.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.18.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.19.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.19.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.19.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.2.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.2.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.2.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.17.mlp.experts.20.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.20.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.20.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.21.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.21.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.21.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.22.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.22.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.22.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.23.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.23.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.23.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.24.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.24.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.24.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.25.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.25.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.25.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.26.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.26.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.26.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.27.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.27.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.27.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.28.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.28.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.28.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.29.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.29.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.29.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.3.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.3.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.3.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.30.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.30.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.30.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.31.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.31.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.31.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.32.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.32.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.32.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.33.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.33.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.33.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.17.mlp.experts.34.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.34.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.34.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.35.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.35.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.35.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.36.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.36.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.36.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.37.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.37.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.37.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.38.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.38.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.38.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.39.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.39.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.39.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.4.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.4.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.4.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.40.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.40.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.40.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.41.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.41.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.41.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.42.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.42.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.42.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.43.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.43.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.43.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.44.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.44.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.44.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.45.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.45.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.45.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.46.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.46.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.46.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.47.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.47.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.47.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.17.mlp.experts.48.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.48.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.48.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.49.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.49.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.49.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.5.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.5.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.5.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.50.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.50.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.50.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.51.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.51.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.51.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.52.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.52.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.52.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.53.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.53.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.53.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.54.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.54.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.54.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.55.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.55.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.55.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.56.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.56.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.56.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.57.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.57.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.57.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.58.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.58.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.58.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.59.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.59.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.59.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.6.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.6.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.6.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.60.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.60.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.60.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.17.mlp.experts.61.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.61.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.61.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.62.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.62.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.62.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.63.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.63.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.63.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.7.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.7.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.7.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.8.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.8.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.8.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.9.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.9.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.experts.9.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.gate.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.18.attention.dense.weight": "model-00003-of-00004.safetensors", + "model.layers.18.attention.query_key_value.weight": "model-00003-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.0.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.0.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.0.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.1.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.1.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.1.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.10.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.10.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.10.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.11.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.11.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.11.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.12.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.12.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.12.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.13.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.13.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.13.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.14.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.18.mlp.experts.14.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.14.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.15.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.15.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.15.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.16.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.16.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.16.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.17.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.17.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.17.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.18.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.18.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.18.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.19.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.19.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.19.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.2.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.2.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.2.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.20.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.20.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.20.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.21.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.21.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.21.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.22.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.22.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.22.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.23.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.23.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.23.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.24.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.24.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.24.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.25.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.25.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.25.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.26.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.26.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.26.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.27.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.27.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.27.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.28.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.18.mlp.experts.28.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.28.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.29.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.29.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.29.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.3.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.3.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.3.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.30.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.30.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.30.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.31.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.31.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.31.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.32.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.32.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.32.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.33.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.33.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.33.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.34.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.34.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.34.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.35.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.35.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.35.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.36.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.36.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.36.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.37.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.37.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.37.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.38.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.38.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.38.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.39.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.39.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.39.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.4.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.4.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.4.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.40.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.40.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.40.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.41.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.18.mlp.experts.41.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.41.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.42.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.42.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.42.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.43.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.43.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.43.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.44.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.44.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.44.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.45.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.45.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.45.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.46.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.46.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.46.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.47.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.47.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.47.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.48.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.48.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.48.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.49.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.49.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.49.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.5.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.5.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.5.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.50.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.50.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.50.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.51.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.51.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.51.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.52.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.52.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.52.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.53.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.53.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.53.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.54.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.54.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.54.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.55.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.18.mlp.experts.55.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.55.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.56.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.56.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.56.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.57.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.57.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.57.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.58.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.58.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.58.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.59.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.59.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.59.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.6.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.6.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.6.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.60.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.60.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.60.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.61.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.61.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.61.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.62.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.62.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.62.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.63.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.63.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.63.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.7.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.7.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.7.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.8.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.8.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.8.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.9.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.9.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.experts.9.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.gate.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.attention.dense.weight": "model-00003-of-00004.safetensors", + "model.layers.19.attention.query_key_value.weight": "model-00003-of-00004.safetensors", + 
"model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.0.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.0.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.0.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.1.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.1.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.1.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.10.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.10.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.10.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.11.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.11.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.11.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.12.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.12.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.12.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.13.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.13.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.13.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.14.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.14.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.14.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.15.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.15.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.15.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.16.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.16.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.16.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.17.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.17.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.17.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.18.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.18.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.18.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.19.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.19.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.19.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.2.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.2.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.2.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.20.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.20.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.20.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.21.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.21.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.19.mlp.experts.21.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.22.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.22.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.22.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.23.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.23.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.23.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.24.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.24.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.24.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.25.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.25.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.25.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.26.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.26.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.26.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.27.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.27.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.27.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.28.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.28.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.28.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.29.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.29.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.29.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.3.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.3.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.3.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.30.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.30.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.30.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.31.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.31.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.31.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.32.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.32.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.32.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.33.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.33.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.33.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.34.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.34.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.34.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.35.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.35.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.19.mlp.experts.35.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.36.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.36.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.36.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.37.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.37.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.37.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.38.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.38.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.38.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.39.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.39.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.39.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.4.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.4.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.4.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.40.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.40.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.40.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.41.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.41.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.41.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.42.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.42.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.42.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.43.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.43.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.43.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.44.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.44.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.44.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.45.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.45.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.45.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.46.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.46.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.46.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.47.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.47.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.47.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.48.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.48.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.48.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.49.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.49.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.19.mlp.experts.49.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.5.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.5.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.5.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.50.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.50.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.50.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.51.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.51.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.51.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.52.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.52.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.52.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.53.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.53.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.53.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.54.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.54.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.54.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.55.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.55.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.55.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.56.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.56.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.56.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.57.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.57.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.57.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.58.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.58.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.58.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.59.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.59.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.59.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.6.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.6.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.6.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.60.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.60.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.60.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.61.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.61.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.61.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.62.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.62.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.19.mlp.experts.62.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.63.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.63.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.63.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.7.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.7.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.7.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.8.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.8.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.8.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.9.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.9.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.experts.9.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.gate.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.2.attention.dense.weight": "model-00001-of-00004.safetensors", + "model.layers.2.attention.query_key_value.weight": "model-00001-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.0.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.0.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.2.mlp.experts.0.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.1.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.1.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.1.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.10.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.10.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.10.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.11.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.11.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.11.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.12.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.12.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.12.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.13.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.13.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.13.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.14.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.14.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.14.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.15.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.15.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.15.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.16.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.16.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.16.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.17.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.17.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.17.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.18.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.18.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.18.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.19.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.19.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.19.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.2.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.2.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.2.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.20.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.20.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.20.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.21.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.21.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.21.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.22.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.22.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.2.mlp.experts.22.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.23.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.23.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.23.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.24.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.24.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.24.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.25.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.25.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.25.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.26.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.26.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.26.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.27.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.27.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.27.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.28.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.28.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.28.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.29.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.29.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.29.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.3.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.3.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.3.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.30.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.30.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.30.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.31.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.31.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.31.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.32.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.32.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.32.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.33.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.33.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.33.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.34.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.34.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.34.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.35.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.35.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.35.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.36.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.36.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.2.mlp.experts.36.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.37.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.37.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.37.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.38.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.38.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.38.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.39.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.39.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.39.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.4.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.4.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.4.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.40.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.40.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.40.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.41.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.41.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.41.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.42.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.42.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.42.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.43.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.43.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.43.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.44.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.44.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.44.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.45.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.45.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.45.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.46.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.46.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.46.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.47.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.47.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.47.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.48.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.48.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.48.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.49.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.49.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.49.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.5.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.5.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.2.mlp.experts.5.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.50.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.50.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.50.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.51.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.51.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.51.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.52.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.52.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.52.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.53.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.53.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.53.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.54.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.54.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.54.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.55.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.55.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.55.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.56.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.56.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.56.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.57.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.57.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.57.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.58.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.58.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.58.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.59.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.59.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.59.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.6.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.6.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.6.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.60.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.60.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.60.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.61.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.61.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.61.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.62.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.62.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.62.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.63.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.63.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.2.mlp.experts.63.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.7.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.7.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.7.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.8.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.8.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.8.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.9.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.9.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.experts.9.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.shared_experts.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.shared_experts.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.shared_experts.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.20.attention.dense.weight": "model-00003-of-00004.safetensors", + "model.layers.20.attention.query_key_value.weight": "model-00003-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.0.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.0.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.0.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.1.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.1.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.20.mlp.experts.1.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.10.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.10.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.10.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.11.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.11.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.11.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.12.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.12.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.12.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.13.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.13.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.13.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.14.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.14.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.14.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.15.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.15.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.15.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.16.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.16.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.16.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.17.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.17.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.17.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.18.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.18.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.18.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.19.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.19.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.19.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.2.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.2.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.2.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.20.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.20.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.20.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.21.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.21.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.21.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.22.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.22.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.22.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.23.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.23.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.20.mlp.experts.23.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.24.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.24.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.24.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.25.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.25.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.25.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.26.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.26.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.26.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.27.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.27.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.27.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.28.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.28.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.28.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.29.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.29.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.29.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.3.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.3.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.3.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.30.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.30.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.30.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.31.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.31.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.31.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.32.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.32.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.32.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.33.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.33.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.33.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.34.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.34.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.34.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.35.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.35.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.35.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.36.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.36.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.36.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.37.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.37.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.20.mlp.experts.37.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.38.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.38.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.38.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.39.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.39.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.39.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.4.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.4.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.4.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.40.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.40.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.40.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.41.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.41.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.41.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.42.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.42.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.42.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.43.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.43.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.43.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.44.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.44.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.44.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.45.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.45.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.45.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.46.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.46.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.46.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.47.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.47.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.47.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.48.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.48.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.48.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.49.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.49.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.49.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.5.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.5.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.5.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.50.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.50.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.20.mlp.experts.50.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.51.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.51.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.51.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.52.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.52.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.52.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.53.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.53.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.53.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.54.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.54.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.54.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.55.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.55.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.55.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.56.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.56.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.56.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.57.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.57.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.57.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.58.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.58.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.58.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.59.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.59.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.59.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.6.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.6.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.6.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.60.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.60.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.60.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.61.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.61.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.61.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.62.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.62.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.62.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.63.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.63.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.63.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.7.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.7.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.20.mlp.experts.7.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.8.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.8.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.8.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.9.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.9.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.experts.9.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.attention.dense.weight": "model-00003-of-00004.safetensors", + "model.layers.21.attention.query_key_value.weight": "model-00003-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.0.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.0.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.0.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.1.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.1.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.1.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.10.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.10.gate_proj.weight": "model-00003-of-00004.safetensors", 
+ "model.layers.21.mlp.experts.10.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.11.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.11.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.11.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.12.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.12.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.12.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.13.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.13.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.13.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.14.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.14.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.14.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.15.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.15.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.15.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.16.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.16.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.16.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.17.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.17.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.17.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.18.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.18.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.18.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.19.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.19.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.19.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.2.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.2.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.2.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.20.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.20.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.20.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.21.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.21.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.21.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.22.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.22.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.22.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.23.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.23.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.23.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.24.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.24.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.21.mlp.experts.24.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.25.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.25.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.25.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.26.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.26.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.26.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.27.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.27.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.27.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.28.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.28.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.28.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.29.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.29.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.29.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.3.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.3.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.3.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.30.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.30.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.30.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.31.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.31.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.31.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.32.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.32.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.32.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.33.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.33.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.33.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.34.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.34.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.34.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.35.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.35.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.35.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.36.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.36.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.36.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.37.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.37.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.37.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.38.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.38.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.21.mlp.experts.38.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.39.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.39.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.39.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.4.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.4.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.4.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.40.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.40.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.40.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.41.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.41.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.41.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.42.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.42.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.42.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.43.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.43.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.43.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.44.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.44.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.44.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.45.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.45.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.45.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.46.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.46.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.46.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.47.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.47.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.47.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.48.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.48.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.48.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.49.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.49.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.49.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.5.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.5.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.5.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.50.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.50.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.50.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.51.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.51.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.21.mlp.experts.51.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.52.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.52.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.52.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.53.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.53.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.53.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.54.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.54.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.54.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.55.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.55.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.55.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.56.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.56.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.56.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.57.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.57.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.57.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.58.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.58.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.58.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.59.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.59.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.59.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.6.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.6.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.6.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.60.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.60.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.60.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.61.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.61.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.61.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.62.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.62.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.62.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.63.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.63.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.63.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.7.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.7.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.7.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.8.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.8.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.21.mlp.experts.8.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.9.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.9.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.experts.9.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.attention.dense.weight": "model-00003-of-00004.safetensors", + "model.layers.22.attention.query_key_value.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.0.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.0.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.0.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.1.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.1.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.1.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.10.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.10.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.10.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.11.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.11.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.11.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.12.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.12.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.12.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.13.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.13.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.13.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.14.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.14.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.14.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.15.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.15.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.15.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.16.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.16.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.16.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.17.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.17.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.17.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.18.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.18.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.18.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.22.mlp.experts.19.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.19.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.19.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.2.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.2.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.2.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.20.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.20.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.20.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.21.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.21.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.21.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.22.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.22.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.22.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.23.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.23.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.23.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.24.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.24.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.24.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.25.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.25.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.25.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.26.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.26.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.26.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.27.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.27.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.27.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.28.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.28.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.28.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.29.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.29.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.29.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.3.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.3.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.3.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.30.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.30.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.30.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.31.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.31.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.31.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.22.mlp.experts.32.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.32.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.32.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.33.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.33.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.33.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.34.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.34.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.34.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.35.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.35.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.35.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.36.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.36.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.36.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.37.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.37.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.37.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.38.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.38.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.38.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.39.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.39.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.39.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.4.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.4.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.4.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.40.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.40.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.40.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.41.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.41.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.41.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.42.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.42.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.42.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.43.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.43.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.43.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.44.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.44.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.44.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.45.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.45.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.45.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.22.mlp.experts.46.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.46.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.46.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.47.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.47.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.47.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.48.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.48.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.48.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.49.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.49.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.49.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.5.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.5.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.5.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.50.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.50.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.50.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.51.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.51.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.51.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.52.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.52.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.52.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.53.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.53.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.53.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.54.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.54.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.54.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.55.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.55.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.55.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.56.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.56.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.56.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.57.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.57.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.57.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.58.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.58.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.58.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.59.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.59.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.59.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.22.mlp.experts.6.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.6.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.6.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.60.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.60.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.60.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.61.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.61.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.61.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.62.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.62.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.62.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.63.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.63.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.63.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.7.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.7.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.7.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.8.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.8.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.8.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.9.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.9.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.22.mlp.experts.9.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.attention.dense.weight": "model-00003-of-00004.safetensors", + "model.layers.23.attention.query_key_value.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.0.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.0.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.0.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.1.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.1.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.1.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.10.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.10.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.10.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.11.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.11.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.11.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.12.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.23.mlp.experts.12.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.12.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.13.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.13.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.13.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.14.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.14.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.14.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.15.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.15.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.15.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.16.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.16.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.16.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.17.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.17.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.17.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.18.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.18.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.18.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.19.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.19.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.19.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.2.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.2.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.2.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.20.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.20.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.20.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.21.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.21.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.21.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.22.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.22.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.22.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.23.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.23.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.23.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.24.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.24.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.24.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.25.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.25.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.25.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.26.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.23.mlp.experts.26.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.26.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.27.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.27.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.27.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.28.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.28.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.28.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.29.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.29.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.29.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.3.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.3.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.3.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.30.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.30.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.30.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.31.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.31.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.31.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.32.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.32.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.32.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.33.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.33.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.33.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.34.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.34.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.34.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.35.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.35.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.35.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.36.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.36.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.36.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.37.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.37.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.37.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.38.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.38.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.38.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.39.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.39.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.39.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.4.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.23.mlp.experts.4.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.4.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.40.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.40.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.40.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.41.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.41.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.41.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.42.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.42.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.42.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.43.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.43.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.43.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.44.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.44.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.44.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.45.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.45.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.45.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.46.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.46.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.46.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.47.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.47.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.47.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.48.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.48.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.48.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.49.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.49.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.49.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.5.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.5.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.5.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.50.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.50.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.50.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.51.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.51.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.51.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.52.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.52.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.52.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.53.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.23.mlp.experts.53.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.53.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.54.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.54.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.54.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.55.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.55.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.55.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.56.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.56.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.56.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.57.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.57.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.57.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.58.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.58.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.58.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.59.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.59.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.59.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.6.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.6.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.6.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.60.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.60.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.60.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.61.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.61.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.61.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.62.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.62.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.62.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.63.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.63.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.63.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.7.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.7.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.7.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.8.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.8.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.8.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.9.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.9.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.experts.9.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate.weight": "model-00003-of-00004.safetensors", + 
"model.layers.23.mlp.shared_experts.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.attention.dense.weight": "model-00003-of-00004.safetensors", + "model.layers.24.attention.query_key_value.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.0.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.0.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.0.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.1.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.1.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.1.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.10.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.10.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.10.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.11.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.11.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.11.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.12.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.12.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.12.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.13.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.13.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.13.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.14.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.14.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.14.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.15.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.15.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.15.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.16.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.16.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.16.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.17.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.17.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.17.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.18.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.18.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.18.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.19.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.19.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.19.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.2.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.2.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.24.mlp.experts.2.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.20.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.20.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.20.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.21.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.21.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.21.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.22.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.22.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.22.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.23.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.23.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.23.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.24.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.24.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.24.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.25.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.25.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.25.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.26.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.26.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.26.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.27.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.27.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.27.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.28.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.28.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.28.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.29.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.29.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.29.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.3.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.3.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.3.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.30.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.30.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.30.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.31.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.31.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.31.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.32.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.32.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.32.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.33.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.33.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.24.mlp.experts.33.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.34.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.34.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.34.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.35.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.35.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.35.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.36.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.36.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.36.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.37.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.37.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.37.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.38.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.38.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.38.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.39.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.39.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.39.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.4.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.4.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.4.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.40.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.40.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.40.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.41.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.41.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.41.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.42.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.42.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.42.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.43.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.43.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.43.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.44.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.44.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.44.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.45.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.45.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.45.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.46.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.46.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.46.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.47.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.47.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.24.mlp.experts.47.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.48.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.48.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.48.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.49.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.49.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.49.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.5.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.5.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.5.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.50.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.50.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.50.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.51.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.51.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.51.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.52.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.52.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.52.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.53.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.53.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.53.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.54.down_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.54.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.54.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.55.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.55.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.55.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.56.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.56.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.56.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.57.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.57.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.57.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.58.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.58.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.58.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.59.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.59.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.59.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.6.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.6.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.6.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.60.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.60.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.24.mlp.experts.60.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.61.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.61.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.61.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.62.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.62.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.62.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.63.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.63.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.63.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.7.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.7.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.7.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.8.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.8.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.8.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.9.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.9.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.experts.9.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.attention.dense.weight": "model-00003-of-00004.safetensors", + "model.layers.25.attention.query_key_value.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.0.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.0.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.0.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.1.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.1.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.1.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.10.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.10.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.10.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.11.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.11.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.11.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.12.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.12.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.12.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.13.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.13.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.13.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.25.mlp.experts.14.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.14.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.14.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.15.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.15.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.15.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.16.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.16.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.16.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.17.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.17.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.17.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.18.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.18.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.18.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.19.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.19.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.19.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.2.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.2.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.2.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.20.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.20.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.20.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.21.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.21.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.21.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.22.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.22.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.22.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.23.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.23.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.23.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.24.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.24.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.24.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.25.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.25.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.25.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.26.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.26.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.26.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.27.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.27.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.27.up_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.25.mlp.experts.28.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.28.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.28.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.29.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.29.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.29.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.3.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.3.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.3.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.30.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.30.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.30.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.31.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.31.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.31.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.32.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.32.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.32.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.33.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.33.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.33.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.34.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.34.gate_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.34.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.35.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.35.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.35.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.36.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.36.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.36.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.37.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.37.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.37.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.38.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.38.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.38.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.39.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.39.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.39.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.4.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.4.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.4.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.40.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.40.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.40.up_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.25.mlp.experts.41.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.41.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.41.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.42.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.42.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.42.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.43.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.43.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.43.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.44.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.44.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.44.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.45.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.45.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.45.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.46.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.46.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.46.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.47.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.47.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.47.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.48.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.48.gate_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.48.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.49.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.49.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.49.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.5.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.5.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.5.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.50.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.50.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.50.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.51.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.51.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.51.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.52.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.52.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.52.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.53.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.53.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.53.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.54.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.54.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.54.up_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.25.mlp.experts.55.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.55.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.55.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.56.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.56.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.56.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.57.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.57.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.57.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.58.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.58.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.58.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.59.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.59.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.59.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.6.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.6.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.6.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.60.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.60.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.60.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.61.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.61.gate_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.61.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.62.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.62.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.62.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.63.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.63.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.63.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.experts.7.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.7.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.7.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.8.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.8.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.8.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.9.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.9.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.experts.9.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.26.attention.dense.weight": "model-00004-of-00004.safetensors", + 
"model.layers.26.attention.query_key_value.weight": "model-00004-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.0.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.0.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.0.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.1.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.1.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.1.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.10.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.10.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.10.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.11.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.11.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.11.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.12.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.12.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.12.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.13.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.13.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.13.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.14.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.14.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.14.up_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.15.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.15.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.15.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.16.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.16.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.16.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.17.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.17.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.17.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.18.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.18.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.18.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.19.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.19.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.19.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.2.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.2.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.2.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.20.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.20.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.20.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.21.down_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.26.mlp.experts.21.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.21.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.22.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.22.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.22.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.23.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.23.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.23.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.24.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.24.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.24.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.25.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.25.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.25.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.26.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.26.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.26.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.27.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.27.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.27.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.28.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.28.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.28.up_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.29.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.29.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.29.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.3.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.3.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.3.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.30.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.30.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.30.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.31.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.31.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.31.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.32.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.32.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.32.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.33.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.33.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.33.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.34.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.34.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.34.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.35.down_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.26.mlp.experts.35.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.35.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.36.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.36.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.36.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.37.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.37.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.37.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.38.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.38.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.38.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.39.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.39.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.39.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.4.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.4.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.4.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.40.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.40.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.40.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.41.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.41.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.41.up_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.42.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.42.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.42.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.43.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.43.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.43.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.44.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.44.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.44.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.45.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.45.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.45.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.46.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.46.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.46.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.47.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.47.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.47.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.48.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.48.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.48.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.49.down_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.26.mlp.experts.49.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.49.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.5.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.5.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.5.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.50.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.50.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.50.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.51.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.51.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.51.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.52.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.52.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.52.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.53.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.53.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.53.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.54.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.54.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.54.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.55.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.55.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.55.up_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.56.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.56.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.56.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.57.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.57.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.57.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.58.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.58.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.58.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.59.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.59.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.59.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.6.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.6.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.6.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.60.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.60.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.60.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.61.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.61.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.61.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.62.down_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.26.mlp.experts.62.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.62.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.63.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.63.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.63.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.7.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.7.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.7.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.8.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.8.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.8.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.9.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.9.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.experts.9.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.gate.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.27.attention.dense.weight": "model-00004-of-00004.safetensors", + "model.layers.27.attention.query_key_value.weight": "model-00004-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.0.down_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.0.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.0.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.1.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.1.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.1.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.10.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.10.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.10.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.11.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.11.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.11.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.12.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.12.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.12.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.13.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.13.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.13.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.14.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.14.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.14.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.15.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.15.gate_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.27.mlp.experts.15.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.16.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.16.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.16.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.17.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.17.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.17.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.18.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.18.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.18.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.19.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.19.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.19.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.2.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.2.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.2.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.20.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.20.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.20.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.21.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.21.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.21.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.22.down_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.22.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.22.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.23.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.23.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.23.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.24.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.24.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.24.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.25.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.25.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.25.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.26.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.26.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.26.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.27.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.27.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.27.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.28.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.28.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.28.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.29.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.29.gate_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.27.mlp.experts.29.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.3.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.3.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.3.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.30.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.30.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.30.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.31.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.31.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.31.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.32.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.32.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.32.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.33.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.33.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.33.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.34.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.34.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.34.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.35.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.35.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.35.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.36.down_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.36.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.36.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.37.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.37.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.37.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.38.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.38.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.38.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.39.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.39.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.39.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.4.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.4.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.4.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.40.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.40.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.40.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.41.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.41.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.41.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.42.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.42.gate_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.27.mlp.experts.42.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.43.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.43.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.43.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.44.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.44.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.44.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.45.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.45.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.45.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.46.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.46.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.46.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.47.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.47.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.47.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.48.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.48.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.48.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.49.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.49.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.49.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.5.down_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.5.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.5.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.50.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.50.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.50.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.51.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.51.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.51.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.52.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.52.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.52.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.53.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.53.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.53.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.54.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.54.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.54.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.55.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.55.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.55.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.56.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.56.gate_proj.weight": "model-00004-of-00004.safetensors", + 
"model.layers.27.mlp.experts.56.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.57.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.57.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.57.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.58.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.58.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.58.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.59.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.59.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.59.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.6.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.6.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.6.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.60.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.60.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.60.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.61.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.61.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.61.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.62.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.62.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.62.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.63.down_proj.weight": 
"model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.63.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.63.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.7.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.7.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.7.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.8.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.8.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.8.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.9.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.9.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.experts.9.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.gate.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.3.attention.dense.weight": "model-00001-of-00004.safetensors", + "model.layers.3.attention.query_key_value.weight": "model-00001-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.0.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.0.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.0.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.1.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.1.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.1.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.10.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.10.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.10.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.11.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.11.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.11.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.12.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.12.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.12.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.13.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.13.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.13.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.14.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.14.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.14.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.15.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.15.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.15.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.16.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.16.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.3.mlp.experts.16.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.17.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.17.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.17.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.18.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.18.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.18.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.19.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.19.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.19.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.2.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.2.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.2.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.20.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.20.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.20.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.21.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.21.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.21.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.22.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.22.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.22.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.23.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.23.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.23.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.24.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.24.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.24.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.25.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.25.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.25.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.26.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.26.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.26.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.27.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.27.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.27.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.28.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.28.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.28.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.29.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.29.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.29.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.3.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.3.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.3.mlp.experts.3.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.30.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.30.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.30.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.31.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.31.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.31.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.32.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.32.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.32.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.33.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.33.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.33.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.34.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.34.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.34.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.35.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.35.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.35.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.36.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.36.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.36.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.37.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.37.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.37.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.38.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.38.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.38.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.39.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.39.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.39.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.4.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.4.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.4.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.40.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.40.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.40.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.41.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.41.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.41.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.42.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.42.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.42.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.43.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.43.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.3.mlp.experts.43.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.44.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.44.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.44.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.45.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.45.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.45.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.46.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.46.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.46.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.47.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.47.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.47.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.48.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.48.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.48.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.49.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.49.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.49.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.5.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.5.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.5.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.50.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.50.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.50.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.51.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.51.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.51.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.52.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.52.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.52.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.53.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.53.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.53.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.54.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.54.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.54.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.55.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.55.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.55.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.56.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.56.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.56.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.57.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.57.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.3.mlp.experts.57.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.58.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.58.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.58.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.59.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.59.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.59.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.6.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.6.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.6.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.60.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.60.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.60.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.61.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.61.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.61.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.62.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.62.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.62.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.63.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.63.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.63.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.7.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.7.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.7.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.8.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.8.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.8.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.9.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.9.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.experts.9.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.attention.dense.weight": "model-00001-of-00004.safetensors", + "model.layers.4.attention.query_key_value.weight": "model-00001-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.0.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.0.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.0.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.1.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.1.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.1.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.10.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.10.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.10.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.11.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.11.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.11.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.12.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.12.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.12.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.13.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.13.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.13.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.14.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.14.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.14.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.15.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.15.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.15.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.16.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.16.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.16.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.17.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.17.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.4.mlp.experts.17.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.18.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.18.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.18.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.19.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.19.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.19.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.2.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.2.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.2.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.20.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.20.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.20.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.21.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.21.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.21.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.22.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.22.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.22.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.23.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.23.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.23.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.24.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.24.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.24.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.25.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.25.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.25.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.26.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.26.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.26.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.27.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.27.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.27.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.28.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.28.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.28.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.29.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.29.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.29.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.3.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.3.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.3.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.30.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.30.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.4.mlp.experts.30.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.31.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.31.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.31.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.32.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.32.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.32.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.33.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.33.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.33.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.34.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.34.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.34.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.35.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.35.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.35.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.36.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.36.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.36.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.37.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.37.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.37.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.38.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.38.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.38.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.39.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.39.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.39.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.4.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.4.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.4.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.40.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.40.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.40.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.41.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.41.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.41.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.42.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.42.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.42.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.43.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.43.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.43.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.44.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.44.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.4.mlp.experts.44.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.45.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.45.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.45.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.46.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.46.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.46.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.47.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.47.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.47.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.48.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.48.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.48.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.49.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.49.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.49.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.5.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.5.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.5.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.50.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.50.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.50.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.51.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.51.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.51.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.52.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.52.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.52.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.53.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.53.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.53.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.54.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.54.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.54.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.55.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.55.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.55.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.56.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.56.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.56.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.57.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.57.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.57.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.58.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.58.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.4.mlp.experts.58.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.59.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.59.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.59.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.6.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.6.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.6.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.60.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.60.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.60.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.61.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.61.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.61.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.62.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.62.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.62.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.63.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.63.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.63.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.7.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.7.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.7.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.8.down_proj.weight": "model-00001-of-00004.safetensors", 
+ "model.layers.4.mlp.experts.8.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.8.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.9.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.9.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.experts.9.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.attention.dense.weight": "model-00001-of-00004.safetensors", + "model.layers.5.attention.query_key_value.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.0.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.0.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.0.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.1.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.1.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.1.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.10.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.10.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.10.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.11.down_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.5.mlp.experts.11.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.11.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.12.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.12.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.12.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.13.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.13.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.13.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.14.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.14.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.14.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.15.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.15.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.15.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.16.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.16.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.16.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.17.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.17.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.17.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.18.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.18.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.18.up_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.19.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.19.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.19.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.2.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.2.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.2.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.20.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.20.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.20.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.21.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.21.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.21.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.22.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.22.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.22.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.23.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.23.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.23.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.24.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.24.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.24.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.25.down_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.5.mlp.experts.25.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.25.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.26.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.26.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.26.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.27.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.27.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.27.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.28.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.28.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.28.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.29.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.29.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.29.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.3.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.3.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.3.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.30.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.30.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.30.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.31.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.31.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.31.up_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.32.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.32.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.32.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.33.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.33.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.33.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.34.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.34.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.34.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.35.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.35.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.35.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.36.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.36.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.36.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.37.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.37.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.37.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.38.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.38.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.38.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.39.down_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.5.mlp.experts.39.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.39.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.4.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.4.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.4.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.40.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.40.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.40.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.41.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.41.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.41.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.42.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.42.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.42.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.43.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.43.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.43.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.44.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.44.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.44.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.45.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.45.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.45.up_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.46.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.46.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.46.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.47.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.47.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.47.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.48.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.48.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.48.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.49.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.49.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.49.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.5.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.5.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.5.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.50.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.50.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.50.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.51.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.51.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.51.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.52.down_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.5.mlp.experts.52.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.52.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.53.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.53.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.53.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.54.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.54.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.54.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.55.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.55.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.55.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.56.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.56.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.56.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.57.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.57.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.57.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.58.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.58.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.58.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.59.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.59.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.59.up_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.6.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.6.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.6.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.60.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.60.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.60.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.61.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.61.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.61.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.62.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.62.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.62.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.63.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.63.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.63.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.7.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.7.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.7.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.8.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.8.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.8.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.9.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.9.gate_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.mlp.experts.9.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.attention.dense.weight": "model-00001-of-00004.safetensors", + "model.layers.6.attention.query_key_value.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.0.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.0.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.0.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.1.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.1.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.1.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.10.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.10.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.10.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.11.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.11.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.11.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.12.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.12.gate_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.12.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.13.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.13.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.13.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.14.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.14.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.14.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.15.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.15.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.15.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.16.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.16.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.16.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.17.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.17.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.17.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.18.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.18.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.18.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.19.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.19.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.19.up_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.mlp.experts.2.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.2.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.2.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.20.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.20.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.20.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.21.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.21.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.21.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.22.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.22.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.22.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.23.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.23.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.23.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.24.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.24.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.24.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.25.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.25.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.25.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.26.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.26.gate_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.26.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.27.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.27.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.27.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.28.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.28.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.28.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.29.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.29.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.29.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.3.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.3.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.3.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.30.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.30.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.30.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.31.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.31.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.31.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.32.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.32.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.32.up_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.mlp.experts.33.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.33.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.33.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.34.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.34.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.34.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.35.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.35.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.35.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.36.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.36.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.36.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.37.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.37.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.37.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.38.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.38.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.38.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.39.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.39.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.39.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.4.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.4.gate_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.4.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.40.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.40.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.40.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.41.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.41.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.41.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.42.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.42.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.42.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.43.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.43.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.43.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.44.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.44.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.44.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.45.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.45.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.45.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.46.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.46.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.46.up_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.mlp.experts.47.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.47.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.47.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.48.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.48.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.48.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.49.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.49.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.49.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.5.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.5.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.5.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.50.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.50.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.50.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.51.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.51.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.51.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.52.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.52.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.52.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.53.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.53.gate_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.53.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.54.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.54.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.54.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.55.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.55.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.55.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.56.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.56.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.56.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.57.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.57.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.57.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.58.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.58.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.58.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.59.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.59.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.59.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.6.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.6.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.6.up_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.mlp.experts.60.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.60.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.60.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.61.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.61.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.61.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.62.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.62.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.62.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.63.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.63.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.63.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.7.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.7.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.7.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.8.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.8.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.8.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.9.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.9.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.experts.9.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.mlp.shared_experts.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.attention.dense.weight": "model-00001-of-00004.safetensors", + "model.layers.7.attention.query_key_value.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.0.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.0.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.0.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.1.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.1.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.1.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.10.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.10.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.10.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.11.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.11.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.11.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.12.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.12.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.12.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.13.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.13.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.7.mlp.experts.13.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.14.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.14.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.14.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.15.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.15.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.15.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.16.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.16.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.16.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.17.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.17.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.17.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.18.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.18.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.18.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.19.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.19.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.19.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.2.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.2.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.2.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.20.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.20.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.20.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.21.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.21.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.21.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.22.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.22.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.22.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.23.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.23.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.23.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.24.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.24.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.24.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.25.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.25.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.25.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.26.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.26.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.26.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.27.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.27.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.7.mlp.experts.27.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.28.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.28.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.28.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.29.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.29.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.29.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.3.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.3.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.3.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.30.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.30.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.30.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.31.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.31.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.31.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.32.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.32.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.32.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.33.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.33.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.33.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.34.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.34.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.34.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.35.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.35.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.35.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.36.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.36.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.36.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.37.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.37.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.37.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.38.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.38.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.38.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.39.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.39.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.39.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.4.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.4.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.4.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.40.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.40.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.7.mlp.experts.40.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.41.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.41.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.41.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.42.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.42.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.42.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.43.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.43.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.43.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.44.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.44.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.44.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.45.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.45.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.45.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.46.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.46.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.46.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.47.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.47.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.47.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.48.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.48.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.48.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.49.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.49.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.49.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.5.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.5.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.5.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.50.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.50.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.50.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.51.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.51.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.51.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.52.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.52.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.52.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.53.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.53.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.53.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.54.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.54.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.7.mlp.experts.54.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.55.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.55.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.55.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.56.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.56.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.56.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.57.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.57.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.57.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.58.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.58.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.58.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.59.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.59.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.59.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.6.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.6.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.6.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.60.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.60.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.60.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.61.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.61.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.61.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.62.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.62.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.62.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.63.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.63.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.63.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.7.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.7.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.7.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.8.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.8.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.8.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.9.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.9.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.experts.9.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.attention.dense.weight": 
"model-00001-of-00004.safetensors", + "model.layers.8.attention.query_key_value.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.0.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.0.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.0.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.1.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.1.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.1.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.10.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.10.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.10.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.11.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.11.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.11.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.12.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.12.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.12.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.13.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.13.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.13.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.14.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.14.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.14.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.15.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.15.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.15.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.16.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.16.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.16.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.17.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.17.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.17.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.18.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.18.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.18.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.19.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.19.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.19.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.2.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.2.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.2.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.20.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.20.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.20.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.21.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.8.mlp.experts.21.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.21.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.22.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.22.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.22.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.23.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.23.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.23.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.24.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.24.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.24.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.25.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.25.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.25.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.26.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.26.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.26.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.27.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.27.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.27.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.28.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.28.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.28.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.29.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.29.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.29.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.3.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.3.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.3.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.30.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.30.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.30.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.31.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.31.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.31.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.32.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.32.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.32.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.33.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.33.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.33.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.34.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.34.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.34.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.35.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.8.mlp.experts.35.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.35.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.36.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.36.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.36.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.37.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.37.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.37.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.38.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.38.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.38.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.39.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.39.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.39.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.4.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.4.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.4.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.40.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.40.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.40.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.41.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.41.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.41.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.42.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.42.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.42.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.43.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.43.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.43.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.44.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.44.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.44.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.45.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.45.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.45.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.46.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.46.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.46.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.47.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.47.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.47.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.48.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.48.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.48.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.49.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.8.mlp.experts.49.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.49.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.5.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.5.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.5.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.50.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.50.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.50.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.51.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.51.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.51.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.52.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.52.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.52.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.53.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.53.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.53.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.54.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.54.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.54.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.55.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.55.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.55.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.56.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.56.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.56.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.57.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.57.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.57.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.58.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.58.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.58.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.59.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.59.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.59.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.6.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.6.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.6.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.60.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.60.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.60.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.61.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.61.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.61.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.62.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.8.mlp.experts.62.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.62.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.63.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.63.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.63.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.7.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.7.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.7.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.8.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.8.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.8.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.experts.9.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.9.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.experts.9.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.gate.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.attention.dense.weight": "model-00002-of-00004.safetensors", + "model.layers.9.attention.query_key_value.weight": "model-00002-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.0.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.9.mlp.experts.0.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.0.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.1.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.1.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.1.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.10.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.10.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.10.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.11.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.11.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.11.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.12.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.12.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.12.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.13.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.13.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.13.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.14.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.14.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.14.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.15.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.15.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.15.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.16.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.16.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.16.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.17.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.17.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.17.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.18.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.18.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.18.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.19.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.19.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.19.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.2.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.2.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.2.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.20.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.20.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.20.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.21.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.21.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.21.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.22.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.9.mlp.experts.22.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.22.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.23.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.23.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.23.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.24.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.24.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.24.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.25.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.25.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.25.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.26.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.26.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.26.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.27.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.27.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.27.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.28.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.28.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.28.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.29.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.29.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.29.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.3.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.3.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.3.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.30.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.30.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.30.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.31.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.31.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.31.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.32.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.32.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.32.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.33.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.33.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.33.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.34.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.34.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.34.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.35.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.35.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.35.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.36.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.9.mlp.experts.36.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.36.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.37.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.37.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.37.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.38.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.38.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.38.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.39.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.39.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.39.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.4.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.4.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.4.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.40.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.40.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.40.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.41.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.41.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.41.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.42.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.42.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.42.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.43.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.43.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.43.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.44.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.44.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.44.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.45.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.45.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.45.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.46.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.46.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.46.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.47.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.47.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.47.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.48.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.48.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.48.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.49.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.49.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.49.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.5.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.9.mlp.experts.5.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.5.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.50.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.50.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.50.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.51.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.51.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.51.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.52.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.52.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.52.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.53.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.53.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.53.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.54.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.54.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.54.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.55.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.55.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.55.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.56.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.56.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.56.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.57.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.57.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.57.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.58.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.58.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.58.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.59.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.59.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.59.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.6.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.6.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.6.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.60.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.60.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.60.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.61.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.61.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.61.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.62.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.62.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.62.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.63.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.9.mlp.experts.63.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.63.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.7.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.7.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.7.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.8.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.8.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.8.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.9.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.9.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.experts.9.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors", + "model.word_embeddings.weight": "model-00001-of-00004.safetensors" + } +} diff --git a/modeling_bailing_moe.py b/modeling_bailing_moe.py new file mode 100644 index 0000000..f4977e7 --- /dev/null +++ b/modeling_bailing_moe.py @@ -0,0 +1,1443 @@ +# coding=utf-8 +# Copyright 2023 Antgroup and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. 
It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch BailingMoE model.""" +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_causal_attention_mask, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from transformers.modeling_outputs import ( + MoeModelOutputWithPast, + MoeCausalLMOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13 +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from transformers.utils.import_utils import is_torch_fx_available +from .configuration_bailing_moe import BailingMoeConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from 
flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + +# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. +# It means that the function will not be traced through and simply appear as a node in the graph. +if is_torch_fx_available(): + if not is_torch_greater_or_equal_than_1_13: + import torch.fx + + _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BailingMoeConfig" + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + warnings.warn( + "Calling `transformers.models.BailingMoe.modeling_BailingMoe._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask" + ) + return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) + + +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + warnings.warn( + "Calling `transformers.models.BailingMoe.modeling_BailingMoe._make_causal_mask` is deprecated and will be removed in v4.37. 
class BailingMoeRMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        BailingMoeRMSNorm is equivalent to T5LayerNorm

        Args:
            hidden_size: size of the normalised (last) dimension; the scale vector has this length.
            eps: constant added to the mean square before taking the reciprocal square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalise `hidden_states` in float32, cast back to the input dtype, then apply the scale."""
        original_dtype = hidden_states.dtype
        # The statistics are computed in float32 regardless of the incoming dtype.
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalised = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        # The cast back happens before the multiplication by the weight.
        return self.weight * normalised.to(original_dtype)
class BailingMoeLinearScalingRotaryEmbedding(BailingMoeRotaryEmbedding):
    """BailingMoeRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        """
        Args:
            dim, max_position_embeddings, base, device: forwarded unchanged to `BailingMoeRotaryEmbedding`.
            scaling_factor: divisor applied to the position indices before the frequencies are computed.
        """
        # Assigned before super().__init__() because the base constructor calls `_set_cos_sin_cache`,
        # which reads `self.scaling_factor`.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the `cos_cached` / `sin_cached` buffers for `seq_len` positions on `device` in `dtype`."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        # Linear scaling: positions are compressed by `scaling_factor`.
        t = t / self.scaling_factor

        # Move `inv_freq` to the device of `t`, as the base class does; `t` is created on the requested
        # `device`, which is not guaranteed to be where the `inv_freq` buffer currently lives.
        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def rotate_half(x):
    """Rotates half the hidden dims of the input.

    The last dimension is cut at `x.shape[-1] // 2`; the result is the negated second part
    followed by the first part, concatenated along the last dimension.
    """
    split_at = x.shape[-1] // 2
    first_part = x[..., :split_at]
    second_part = x[..., split_at:]
    return torch.cat((-second_part, first_part), dim=-1)
class BailingMoeMLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections without bias."""

    def __init__(self, config: BailingMoeConfig, intermediate_size: int):
        """
        Args:
            config: model configuration; `hidden_size` and `hidden_act` are read from it.
            intermediate_size: width of the inner projection (passed explicitly rather than read from `config`).
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = intermediate_size

        outer_dim, inner_dim = self.hidden_size, self.intermediate_size
        # Creation order (gate, up, down) is kept so parameter registration order is unchanged.
        self.gate_proj = nn.Linear(outer_dim, inner_dim, bias=False)
        self.up_proj = nn.Linear(outer_dim, inner_dim, bias=False)
        self.down_proj = nn.Linear(inner_dim, outer_dim, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated projection to `x` and map the result back to `hidden_size`."""
        activated_gate = self.act_fn(self.gate_proj(x))
        up_values = self.up_proj(x)
        return self.down_proj(activated_gate * up_values)
class BailingMoeSparseMoeBlock(nn.Module):
    """
    A mixed expert module containing shared experts.

    Holds `config.num_experts` routed `BailingMoeMLP` experts, a `BailingMoeGate` router and, when
    `config.num_shared_experts` is not None, one additional `BailingMoeMLP` whose output is added to
    the routed result for every token.
    """

    def __init__(self, config: BailingMoeConfig):
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok
        self.experts = self._setup_experts()
        self.gate = BailingMoeGate(config)
        if config.num_shared_experts is not None:
            # A single MLP whose inner width is `num_shared_experts` times the per-expert width.
            self.shared_experts = BailingMoeMLP(
                config=config, intermediate_size=config.moe_intermediate_size * config.num_shared_experts
            )

    def _setup_experts(self):
        """Build the list of routed experts, one `BailingMoeMLP` per `config.num_experts`."""
        return nn.ModuleList(
            [
                BailingMoeMLP(config=self.config, intermediate_size=self.config.moe_intermediate_size)
                for _ in range(self.config.num_experts)
            ]
        )

    def forward(self, hidden_states):
        """Route every token through its selected experts and combine the weighted outputs.

        Args:
            hidden_states: 3-D tensor unpacked as `(bsz, seq_len, h)`.

        Returns:
            `(y, (router_logits, topk_idx))` where `y` has shape `(bsz, seq_len, h)` and the two
            router tensors are reshaped to `(bsz, seq_len, -1)`.
        """
        # Kept untouched for the shared-expert branch, which sees the un-flattened input.
        identity = hidden_states
        bsz, seq_len, h = hidden_states.shape
        topk_idx, topk_weight, router_logits = self.gate(hidden_states)
        # Flatten to one row per token.
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        flat_topk_idx = topk_idx.view(-1)
        if self.training:
            # One copy of each token per selected expert, so row k of `hidden_states`
            # lines up with entry k of `flat_topk_idx`.
            hidden_states = hidden_states.repeat_interleave(self.num_experts_per_tok, dim=0)
            y = torch.empty_like(hidden_states)
            for i, expert in enumerate(self.experts):
                y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
            # Regroup the copies per token, weight them by the gate output and sum over the expert slots.
            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
            y = y.to(hidden_states.dtype).view(bsz, seq_len, h)
        else:
            y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(bsz, seq_len, h)
        if self.config.num_shared_experts is not None:
            y = y + self.shared_experts(identity)
        return y, (router_logits.view(bsz, seq_len, -1), topk_idx.view(bsz, seq_len, -1))

    # NOTE: decorated with `torch.no_grad()`, so no autograd graph is recorded for anything computed here.
    @torch.no_grad()
    def moe_infer(self, x, topk_ids, topk_weight):
        """Evaluation-mode expert dispatch: sort token copies by expert, run each expert once, scatter back.

        Args:
            x: flattened tokens, one row per token.
            topk_ids: selected expert indices, one row per token and one column per selected expert.
            topk_weight: gate weights with the same leading shape as `topk_ids`.

        Returns:
            Tensor with one row per token: the gate-weighted sum of that token's expert outputs,
            cast back to the dtype of the expert outputs.
        """
        # Mark, per token, which experts were selected; the column sums give the tokens per expert.
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        # Order of the flattened (token, slot) pairs when grouped by expert id.
        idxs = topk_ids.view(-1).argsort()
        # Integer division by the number of slots maps a flat (token, slot) index back to its token row.
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        # NOTE(review): `sorted_tokens_shape` is never read below.
        sorted_tokens_shape = sorted_tokens.shape
        # Copied to host memory so the loop below can slice with plain integers.
        tokens_per_expert = tokens_per_expert.cpu().numpy()
        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                # Expert received nothing; `start_idx` is unchanged because `end_idx == start_idx`.
                continue
            expert = self.experts[i]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
        # Undo the sort: row `idxs[k]` of `new_x` receives the k-th sorted output.
        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        # Weighting is done in the dtype of `topk_weight`, then cast back to the expert-output dtype.
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out
class BailingMoeAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper

    Eager (matmul + softmax) implementation. Query, key and value come from a single fused
    `query_key_value` projection, rotary position embeddings are applied to queries and keys, and
    key/value heads are repeated `num_key_value_groups` times to match the number of query heads.
    """

    def __init__(self, config: BailingMoeConfig, layer_idx: Optional[int] = None):
        """
        Args:
            config: model configuration (attention sizes, dropout, RoPE settings, bias flags).
            layer_idx: index of this layer, used to address the KV cache; required when caching is used.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        # An explicit `config.head_dim` wins; otherwise the hidden size is divided evenly across heads.
        self.head_dim = config.head_dim or self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        # Fused projection: `num_heads` query heads followed by `num_key_value_heads` key and value heads.
        self.query_key_value = nn.Linear(
            self.hidden_size,
            (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
            bias=config.use_qkv_bias,
        )
        self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.use_bias)
        self._init_rope()

    def _init_rope(self):
        """Create `self.rotary_emb` according to `config.rope_scaling` (None, "linear" or "dynamic").

        Raises:
            ValueError: if `rope_scaling["type"]` is neither "linear" nor "dynamic".
        """
        if self.config.rope_scaling is None:
            self.rotary_emb = BailingMoeRotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = BailingMoeLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = BailingMoeDynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `tensor` to `(bsz, num_heads, seq_len, head_dim)` and make it contiguous."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute attention for one layer.

        Args:
            hidden_states: input of shape `(bsz, q_len, hidden)`.
            attention_mask: optional additive mask; must have size `(bsz, 1, q_len, kv_seq_len)`.
            position_ids: indices used to pick the rotary cos/sin rows for each token.
            past_key_value: optional cache; when given, new keys/values are stored via `update`.
            output_attentions: when False the returned attention weights are `None`.
            use_cache: accepted for interface compatibility; not read in this method.
            **kwargs: only `padding_mask` is recognised, and only to emit a deprecation warning.

        Returns:
            `(attn_output, attn_weights or None, past_key_value)`.

        Raises:
            ValueError: if a cache is passed but `layer_idx` is None, or if an intermediate tensor
                or the mask does not have the expected size.
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
            )

        bsz, q_len, _ = hidden_states.size()

        # Fused projection, viewed as (bsz, q_len, total_heads, head_dim) and split along the head axis.
        qkv = self.query_key_value(hidden_states)
        qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)

        query_states, key_states, value_states = qkv.split(
            [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], dim=-2
        )
        # -> (bsz, heads, q_len, head_dim)
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            # Account for the positions already held in the cache for this layer.
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand key/value heads so every query head has a matching key/value head.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # The 1/sqrt(head_dim) scale is applied to the queries before the matmul.
        attn_weights = torch.matmul(query_states / math.sqrt(self.head_dim), key_states.transpose(2, 3))

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Back to (bsz, q_len, heads * head_dim) for the output projection.
        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, -1)

        attn_output = self.dense(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
class BailingMoeFlashAttention2(BailingMoeAttention):
    """
    BailingMoe flash attention module. This module inherits from `BailingMoeAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute attention for one layer through the flash-attn kernels.

        Args:
            hidden_states: input of shape `(bsz, q_len, hidden)`.
            attention_mask: optional padding mask forwarded to `_flash_attention_forward`.
            position_ids: indices used to pick the rotary cos/sin rows for each token.
            past_key_value: optional cache; when given, new keys/values are stored via `update`.
            output_attentions: ignored; attention weights are never returned by this implementation.
            use_cache: accepted for interface compatibility; not read in this method.
            **kwargs: a deprecated `padding_mask` entry, if present, replaces `attention_mask`.

        Returns:
            `(attn_output, None, past_key_value)`.
        """
        # BailingMoeFlashAttention2 attention does not support output_attentions
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
            )

            # overwrite attention_mask with padding_mask
            attention_mask = kwargs.pop("padding_mask")

        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        # The fused projection is viewed as (bsz, q_len, total_heads, head_dim) and split per head group.
        # Rotary embedding and the KV cache operate on (bsz, heads, seq_len, head_dim), so the tensors are
        # transposed here and transposed back to flash-attn's (bsz, seq_len, heads, head_dim) further down.
        qkv = self.query_key_value(hidden_states)
        qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)

        query_states, key_states, value_states = qkv.split(
            [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], dim=-2
        )
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently cast in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slow down training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (BailingMoeRMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            # Handle the case where the model is quantized
            if hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            elif torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            else:
                # This module has no `q_proj`: Q, K and V come from the fused `query_key_value`
                # projection, so its weight is the reference for the parameter dtype.
                target_dtype = self.query_key_value.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = self._flash_attention_forward(
            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
        )

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.dense(attn_output)

        # `output_attentions` was forced to False above, so this always yields None.
        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
            query_length (`int`):
                The length of the query sequence in terms of tokens. This represents the number of tokens in the
                `query_states` tensor along the sequence dimension. It is used to determine the effective sequence
                length for attention computations.
        """
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in BailingMoeFlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            # Restore the padded (batch_size, query_length, ...) layout.
            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
            )

        return attn_output

    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        """Remove padded positions from query/key/value ahead of the variable-length flash-attn call.

        Returns:
            `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`.
        """
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        # Flatten batch and sequence, then keep only the rows selected by the mask.
        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        if query_length == kv_seq_len:
            # Queries cover the same positions as the keys: reuse the key indexing.
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # One query token per sequence: every sequence contributes exactly one row.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + qkv = self.query_key_value(hidden_states) + qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim) + + query_states, key_states, value_states = qkv.split( + [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], dim=-2 + ) + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. 
class BailingMoeDecoderLayer(nn.Module):
    """
    A single decoder block: pre-norm self-attention followed by a pre-norm feed-forward stage, each wrapped in a
    residual connection. The feed-forward stage is a `BailingMoeSparseMoeBlock` when `config.num_experts` is set and
    `layer_idx >= config.first_k_dense_replace`, otherwise a `BailingMoeMLP`.
    """

    def __init__(self, config: BailingMoeConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # The attention flavour is chosen from the registry by the configured implementation name.
        attention_cls = BAILING_MOE_ATTENTION_CLASSES[config._attn_implementation]
        self.attention = attention_cls(config=config, layer_idx=layer_idx)

        uses_experts = config.num_experts is not None and layer_idx >= config.first_k_dense_replace
        if uses_experts:
            self.mlp = BailingMoeSparseMoeBlock(config)
        else:
            self.mlp = BailingMoeMLP(config=config, intermediate_size=config.intermediate_size)

        self.input_layernorm = BailingMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = BailingMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Run the block on `hidden_states`.

        Args:
            hidden_states (`torch.FloatTensor`): layer input of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`, *optional*):
                `(batch_size, sequence_length)` when flash attention is used, or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` for the default attention.
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Position index of every input token, in the range `[0, config.n_positions - 1]`.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*):
                Cached key/value projection states.
            output_attentions (`bool`, *optional*):
                When truthy, the attention weights are appended to the returned tuple.
            output_router_logits (`bool`, *optional*):
                When truthy, the router logits (or `None` when the feed-forward stage returns a plain tensor) are
                appended to the returned tuple.
            use_cache (`bool`, *optional*):
                When truthy, the present key/value state is appended to the returned tuple.

        Returns:
            A tuple starting with the new hidden states, followed by the optional entries above in the order
            attentions, present key/value, router logits.
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

        # Self-attention sub-block (pre-norm, then residual add).
        attn_out, self_attn_weights, present_key_value = self.attention(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = hidden_states + attn_out

        # Feed-forward sub-block; a tuple result carries the router logits as its second element.
        ffn_out = self.mlp(self.post_attention_layernorm(hidden_states))
        router_logits = None
        if isinstance(ffn_out, tuple):
            ffn_out, router_logits = ffn_out
        hidden_states = hidden_states + ffn_out

        extras = []
        if output_attentions:
            extras.append(self_attn_weights)
        if use_cache:
            extras.append(present_key_value)
        if output_router_logits:
            extras.append(router_logits)

        return (hidden_states, *extras)
@add_start_docstrings(
    "The bare BailingMoe Model outputting raw hidden-states without any specific head on top.",
    BAILINGMOE_START_DOCSTRING,
)
class BailingMoePreTrainedModel(PreTrainedModel):
    # Shared base for the BailingMoe models: declares the config class, the capability flags read by the
    # `PreTrainedModel` machinery, and the weight-initialisation rule.
    config_class = BailingMoeConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["BailingMoeDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True

    def _init_weights(self, module):
        """
        Initialise `module` in place.

        `nn.Linear` and `nn.Embedding` weights are drawn from a normal distribution with mean 0 and standard
        deviation `config.initializer_range`. A linear bias, when present, is zeroed; the embedding row at
        `padding_idx`, when set, is zeroed. Any other module type is left untouched.
        """
        std = self.config.initializer_range
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        module.weight.data.normal_(mean=0.0, std=std)
        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
            return

        # Only embeddings reach this point.
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
+ + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. 
@add_start_docstrings(
    "The bare BailingMoe Model outputting raw hidden-states without any specific head on top.",
    BAILINGMOE_START_DOCSTRING,
)
class BailingMoeModel(BailingMoePreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`BailingMoeDecoderLayer`]

    Args:
        config: BailingMoeConfig
    """

    def __init__(self, config: BailingMoeConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [BailingMoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        # Cached flags that select how the attention mask is prepared in `forward`.
        self._use_sdpa = config._attn_implementation == "sdpa"
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.norm = BailingMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # Returns the token embedding module.
        return self.word_embeddings

    def set_input_embeddings(self, value):
        # Replaces the token embedding module.
        self.word_embeddings = value

    # Runs the embedding lookup, every decoder layer and the final norm.
    # Returns a `MoeModelOutputWithPast`, or (when `return_dict` is falsy) a tuple of the non-None entries among
    # (last hidden state, cache, all hidden states, all attentions, all router logits).
    # Raises `ValueError` when both or neither of `input_ids` / `inputs_embeds` are given.
    @add_start_docstrings_to_model_forward(BAILINGMOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, MoeModelOutputWithPast]:
        # Per-call flags fall back to the values stored on the config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        past_key_values_length = 0
        # Always bound, so the legacy-format conversion at the end never depends on an earlier branch.
        use_legacy_cache = False
        if use_cache:
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            # Default positions continue from the cached length; shape (1, seq_length).
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        if self._use_flash_attention_2:
            # 2d mask is passed through the layers (dropped entirely when it contains no zero)
            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
        elif self._use_sdpa and not output_attentions:
            # output_attentions=True can not be supported when using SDPA, and we fall back on
            # the manual implementation that requires a 4D causal mask in all cases.
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
            )

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                # Positional order must match `BailingMoeDecoderLayer.forward`.
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                )
            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache entry sits after the attention weights when those are returned.
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            # Layers whose feed-forward stage returns a plain tensor report `None` router logits; skip those.
            if output_router_logits and layer_outputs[-1] is not None:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            # Hand back the same cache format the caller supplied.
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
                if v is not None
            )
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )
Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer + + >>> model = BailingMoeForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + return_dict=return_dict, + **kwargs, + ) + + hidden_states = outputs[0] + + if self.norm_head: + if self.training: + norm_weight = ( + self.lm_head.weight / (torch.norm(self.lm_head.weight, p=2, dim=0, keepdim=True) + 1e-7).detach() + ) + 
def prepare_inputs_for_generation(
    self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, token_type_ids=None, **kwargs
):
    """
    Build the keyword arguments for one generation step.

    Args:
        input_ids: token ids; trimmed to the not-yet-processed suffix when a cache is supplied.
        past_key_values: a `Cache` instance, a legacy tuple whose `[0][0]` tensor has the cached length on dim 2,
            or `None`.
        attention_mask: optional padding mask; used to derive `position_ids` when none are passed in `kwargs`.
        inputs_embeds: optional embeddings, forwarded only when `past_key_values` is `None`.
        token_type_ids: accepted but not used here.
        **kwargs: `position_ids` and `use_cache` are read from here.

    Returns:
        A dict with `input_ids` (or `inputs_embeds`), `position_ids`, `past_key_values`, `use_cache` and
        `attention_mask`.
    """
    if past_key_values is not None:
        if isinstance(past_key_values, Cache):
            cache_length = past_key_values.get_seq_length()
            # `seen_tokens` is not available on every cache object; fall back to the cached length.
            past_length = getattr(past_key_values, "seen_tokens", None)
            if past_length is None:
                past_length = cache_length
            # Prefer `get_max_length`, then `get_max_cache_shape`; tolerate caches that expose neither.
            max_length_getter = getattr(past_key_values, "get_max_length", None)
            if max_length_getter is None:
                max_length_getter = getattr(past_key_values, "get_max_cache_shape", None)
            max_cache_length = max_length_getter() if max_length_getter is not None else None
            # NOTE(review): unbounded caches may report a non-positive sentinel instead of `None`. Treat it as
            # "no limit", otherwise `attention_mask[:, -max_cache_length:]` below would crop the mask wrongly.
            if max_cache_length is not None and max_cache_length <= 0:
                max_cache_length = None
        else:
            cache_length = past_length = past_key_values[0][0].shape[2]
            max_cache_length = None

        # Keep only the unprocessed tokens:
        # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
        # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as input)
        if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
            input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
        # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
        # input_ids based on the past_length.
        elif past_length < input_ids.shape[1]:
            input_ids = input_ids[:, past_length:]
        # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

        # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
        if (
            max_cache_length is not None
            and attention_mask is not None
            and cache_length + input_ids.shape[1] > max_cache_length
        ):
            attention_mask = attention_mask[:, -max_cache_length:]

    position_ids = kwargs.get("position_ids", None)
    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -input_ids.shape[1] :]

    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
    if inputs_embeds is not None and past_key_values is None:
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        model_inputs = {"input_ids": input_ids}

    model_inputs.update(
        {
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "attention_mask": attention_mask,
        }
    )
    return model_inputs


@staticmethod
def _reorder_cache(past_key_values, beam_idx):
    """
    Reorder a legacy (tuple-of-tuples) cache along dim 0 according to `beam_idx`.

    Each tensor is indexed with `beam_idx` moved to that tensor's device; the nesting is returned as tuples.
    """
    return tuple(
        tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
        for layer_past in past_key_values
    )
+ +import itertools +from typing import Any, Dict, List, Optional, Union + +import torch +from transformers import PreTrainedTokenizerFast +from transformers.tokenization_utils_base import AddedToken, BatchEncoding +from transformers.utils import TensorType, logging + +logger = logging.get_logger(__name__) + + +def is_system(msg): + return msg['role'].lower() == 'system' + + +def is_user(msg): + return msg['role'].lower() in ['human', 'user'] + + +def is_assistant(msg): + return msg['role'].lower() == 'assistant' + + +def _convert_to_conversation(query, system=None): + conversation = [] + if system: + conversation.append({"role": "SYSTEM", "content": system}) + if isinstance(query, str): + conversation.append({"role": "HUMAN", "content": query}) + elif isinstance(query, List): + conversation.extend(query) + elif isinstance(query, Dict): + if "messages" in query: + conversation.extend(query["messages"]) + if "system_message" in query and len(conversation) > 0 and not is_system(conversation[0]): + conversation.insert(0, {"role": "SYSTEM", "content": query["system_message"]}) + else: + conversation.append(query) + return conversation + + +class BailingTokenizer(PreTrainedTokenizerFast): + is_bailing_tokenizer = True + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = None + + # add gmask_token + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "gmask_token", + "additional_special_tokens", + ] + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + clean_up_tokenization_spaces=False, + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + cls_token="[CLS]", + pad_token="<|endoftext|>", + gmask_token="[gMASK]", + add_bos_token=False, + add_eos_token=False, + **kwargs, + ): + self.add_bos_token = add_bos_token + + self._gmask_token = ( + AddedToken(gmask_token, lstrip=False, rstrip=False, normalized=False) + if 
isinstance(gmask_token, str) + else gmask_token + ) + + self._sop_token = ( + AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False) + if isinstance(bos_token, str) + else bos_token + ) + + self._eop_token = ( + AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False) + if isinstance(eos_token, str) + else eos_token + ) + + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + tokenizer_file=tokenizer_file, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + bos_token=bos_token, + eos_token=eos_token, + cls_token=cls_token, + pad_token=pad_token, + gmask_token=gmask_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + **kwargs, + ) + + self.check_special_tokens() + + def check_special_tokens(self): + ''' + eos_token, cls_token, mask_token + special tokens should init, check special token is not None + ''' + for name, special_token in zip( + ['eos', 'bos', 'cls', 'gmask'], + [self.eos_token, self.bos_token, self.cls_token, self.gmask_token], + ): + assert special_token is not None, f'should init special token [{name}] in tokenizer_config.json' + + @property + def gmask_token(self) -> Optional[str]: + if self._gmask_token is None: + if self.verbose: + logger.error("Using gmask_token, but it is not set yet.") + return None + return str(self._gmask_token) + + @gmask_token.setter + def gmask_token(self, value): + if not isinstance(value, (str, AddedToken)) and value is not None: + raise ValueError("Cannot set a non-string value as the gmask token") + self._gmask_token = value + + @property + def gmask_token_id(self) -> Optional[int]: + if self._gmask_token is None: + return None + return self.convert_tokens_to_ids(self.gmask_token) + + @property + def sop_token(self) -> Optional[str]: + if self._sop_token is None: + if self.verbose: + logger.error("Using sop_token, but it is not set yet.") + return None + return str(self._sop_token) + + @sop_token.setter + def sop_token(self, value): + if not 
isinstance(value, (str, AddedToken)) and value is not None: + raise ValueError("Cannot set a non-string value as the sop token") + self._sop_token = value + + @property + def sop_token_id(self) -> Optional[int]: + if self._sop_token is None: + return None + return self.convert_tokens_to_ids(self.sop_token) + + @property + def eop_token(self) -> Optional[str]: + if self._eop_token is None: + if self.verbose: + logger.error("Using eop_token, but it is not set yet.") + return None + return str(self._eop_token) + + @eop_token.setter + def eop_token(self, value): + if not isinstance(value, (str, AddedToken)) and value is not None: + raise ValueError("Cannot set a non-string value as the eop token") + self._eop_token = value + + @property + def eop_token_id(self) -> Optional[int]: + if self._eop_token is None: + return None + return self.convert_tokens_to_ids(self.eop_token) + + @property + def vocab_size(self): + return len(self.get_vocab()) + + def _chat_from_json(self, chat, chat_format="antglm_chat", system=None): + msgs = chat if "messages" not in chat else chat["messages"] + _msgs = [] + sys_msg = None + for msg in msgs: + if is_system(msg): + sys_msg = msg['content'] + else: + _msgs.append(msg) + chat = {"messages": _msgs} + system = system or sys_msg + if system: + chat['system_message'] = system + from .chat_format import Chat + + return Chat.from_json(chat, name=chat_format) + + def apply_chat_template( + self, + conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]]], + tools: Optional[List[Dict]] = None, + documents: Optional[List[Dict[str, str]]] = None, + chat_template: Optional[str] = None, + add_generation_prompt: bool = False, + system: str = None, # only used for legacy chatml + tokenize=False, + padding: bool = False, + truncation: bool = False, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_dict: bool = False, + return_assistant_tokens_mask: bool = False, + tokenizer_kwargs: 
Optional[Dict[str, Any]] = None, + **kwargs, + ): + if hasattr(self, "chat_template") and self.chat_template: + if isinstance(conversation, Dict) and "messages" in conversation: + conversation = conversation["messages"] + # use transformers built-in method + return super().apply_chat_template( + conversation=conversation, + tools=tools, + documents=documents, + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + tokenize=tokenize, + padding=padding, + truncation=truncation, + return_tensors=return_tensors, + return_dict=return_dict, + return_assistant_tokens_mask=return_assistant_tokens_mask, + tokenizer_kwargs=tokenizer_kwargs, + ) + + # 非chat_template方式后续将不再支持。 + logger.warning("Please set chat_template in tokenizer_config.json!") + + chat_format = kwargs.get('chat_format', 'antglm_chat') + + is_batched = False + + if isinstance(conversation, List) and ( + isinstance(conversation[0], (list, tuple)) or "messages" in conversation[0] + ): + conversations = conversation + is_batched = True + + if not is_batched: + conversations = [conversation] + + rendered = [] + for chat in conversations: + rendered_chat = self._chat_from_json(chat, chat_format=chat_format, system=system).prompt_str + rendered.append(rendered_chat) + + if not is_batched: + rendered = rendered[0] + + if tokenize: + out = self( + rendered, + padding=padding, + truncation=truncation, + max_length=max_length, + add_special_tokens=False, + return_tensors=return_tensors, + ) + if return_dict: + return out + else: + return out["input_ids"] + else: + return rendered + + def _build_position_ids( + self, + mask_pos: int, + bos_pos: int, + max_output_length: int, + rotary_type: Optional[str] = "none", + **kwargs, + ) -> List[List[int]]: + window_size = kwargs.get("window_size", 1024) - 1 + block_position_ids = [0] * bos_pos + + # 获得mask所在的位置,用于后面output positionid的构造 + if "1d" in rotary_type: + position_ids = list(range(bos_pos)) + list(range(mask_pos + 1, mask_pos + max_output_length 
+ 2)) + block_position_ids = block_position_ids + list(range(1, max_output_length + 2)) + elif "2d" in rotary_type: + # 后面input_ids要加一个bos_id + position_ids = list(range(bos_pos)) + position_ids = position_ids + [mask_pos] * (1 + max_output_length) + block_position_ids = block_position_ids + list(range(1, max_output_length + 2)) + else: + # build position ids + position_ids = [] + repeat_times = bos_pos // window_size + for _ in range(repeat_times): + position_ids += list(range(window_size)) + position_ids += list(range(bos_pos - window_size * repeat_times)) + # need consider additional bos_id after input_ids + mask_pos = position_ids[-1] + position_ids += [mask_pos] * (max_output_length + 1) + + block_repeat_times = max_output_length // (window_size - 1) + additional_block_position_ids = [] + for _ in range(block_repeat_times): + additional_block_position_ids += list(range(1, window_size)) + additional_block_position_ids += list( + range(1, max_output_length + 2 - (window_size - 1) * block_repeat_times) + ) + block_position_ids = block_position_ids + additional_block_position_ids + + position_ids = [position_ids, block_position_ids] + return position_ids + + def _build_inputs_for_generation( + self, + input_ids: List[int], + max_input_length=None, + left_truncate=True, + max_output_length=1024, + rotary_type="none", + unidirectional_attention: bool = True, + attention_dtype=None, + **kwargs, + ): + if max_input_length and len(input_ids) > max_input_length: + if left_truncate: + input_ids = input_ids[-max_input_length:] + else: + input_ids = input_ids[:max_input_length] + + is_left_padding = input_ids[0] == self.eos_token_id + if not unidirectional_attention: + if input_ids[0] != self.cls_token_id: + input_ids = [self.cls_token_id] + input_ids + + if self.gmask_token_id not in set(input_ids): + input_ids = input_ids + [self.gmask_token_id] + + mask_pos = input_ids.index(self.gmask_token_id) + sep = len(input_ids) + else: + if self.add_bos_token: + input_ids = 
input_ids + [self.bos_token_id] + if self.eos_token_id in input_ids: + mask_pos = input_ids.index(self.eos_token_id) - 1 + else: + mask_pos = len(input_ids) - 1 + sep = len(input_ids) - 1 + else: + sep = len(input_ids) + if self.eos_token_id in input_ids: + if is_left_padding: + ori_input_ids = input_ids + input_ids = input_ids[::-1] + mask_pos = input_ids.index(self.eos_token_id) - 1 + mask_pos = max(0, mask_pos) # for empty sequence + if is_left_padding: + input_ids = ori_input_ids + mask_pos = sep - 1 - mask_pos # the first non-eos token + + else: + mask_pos = len(input_ids) - 1 + + position_ids = self._build_position_ids(mask_pos, sep, max_output_length, rotary_type, **kwargs) + + if is_left_padding: + position_ids[0] = [max(0, i - mask_pos) for i in range(len(position_ids[0]))] + + # 后面input_ids要加一个bos_id + total_length = sep + max_output_length + if self.add_bos_token: + total_length += 1 + + def build_mask_matrix(seq_length, sep, mask_pos, unidirectional_attention): + # 长序列使用bool类型节省显存 + if unidirectional_attention: + attention_mask = torch.ones([seq_length, seq_length], dtype=attention_dtype) + attention_mask = torch.tril(attention_mask) + if is_left_padding: + attention_mask[:, :mask_pos] = 0 + else: + attention_mask[:, mask_pos + 1 : sep] = 0 + else: + attention_mask = torch.zeros([seq_length, seq_length], dtype=attention_dtype) + attention_mask[:, : mask_pos + 1] = 1 + for i in range(sep, total_length): + attention_mask[i, sep : i + 1] = 1 + return attention_mask + + if self.add_bos_token: + attention_mask = build_mask_matrix(total_length, sep + 1, mask_pos, unidirectional_attention) + else: + attention_mask = build_mask_matrix(total_length, sep, mask_pos, unidirectional_attention) + attention_mask = torch.unsqueeze(attention_mask, dim=0) + attention_mask = torch.unsqueeze(attention_mask, dim=1) + if attention_dtype is None: + attention_mask = attention_mask.long() + inputs = { + "input_ids": torch.Tensor([input_ids]).long(), + "position_ids": 
torch.Tensor([position_ids]).long(), + "attention_mask": attention_mask, + } + return BatchEncoding(inputs) + + def build_inputs_for_generation( + self, + input_ids: Union[List[int], List[List[int]], torch.Tensor], + max_input_length=None, + left_truncate=True, + max_output_length=1024, + rotary_type="1d", + unidirectional_attention=True, + attention_dtype=None, + **kwargs, + ): + if isinstance(input_ids, torch.Tensor): + input_ids = input_ids.tolist() + + if isinstance(input_ids[0], list): + input_ids_list = [] + position_ids_list = [] + attention_mask_list = [] + for _input_ids in input_ids: + inputs = self._build_inputs_for_generation( + _input_ids, + max_input_length=max_input_length, + left_truncate=left_truncate, + max_output_length=max_output_length, + rotary_type=rotary_type, + unidirectional_attention=unidirectional_attention, + attention_dtype=attention_dtype, + **kwargs, + ) + input_ids_list.append(inputs['input_ids']) + position_ids_list.append(inputs['position_ids']) + attention_mask_list.append(inputs["attention_mask"]) + + max_ids_length = max([input.size(1) for input in input_ids_list]) + + for i in range(len(input_ids)): + cur_ids_length = input_ids_list[i].size(1) + if cur_ids_length < max_ids_length: + # pad input ids + pad_input_ids = input_ids_list[i].new_zeros((1, max_ids_length - cur_ids_length)) + input_ids_list[i] = torch.cat([pad_input_ids, input_ids_list[i]], dim=-1) + + # pad postition ids with left pad + # 0, 1, 2, 3, 4 ... -> 0, ..., 0, 1, 2, 3, 4, ... 
+ pad_position_ids = input_ids_list[i].new_zeros((1, 2, max_ids_length - cur_ids_length)) + position_ids_list[i] = torch.cat([pad_position_ids, position_ids_list[i]], dim=-1) + + # pad generation attention mask with left and bottom pad + new_attention_mask = input_ids_list[i].new_zeros( + 1, + 1, + max_ids_length + max_output_length, + max_ids_length + max_output_length, + ) + new_attention_mask[ + :, + :, + max_ids_length - cur_ids_length :, + max_ids_length - cur_ids_length :, + ] = attention_mask_list[i] + attention_mask_list[i] = new_attention_mask.contiguous() + + input_ids_list = torch.cat(input_ids_list, dim=0) + position_ids_list = torch.cat(position_ids_list, dim=0) + attention_mask_list = torch.cat(attention_mask_list, dim=0) + + inputs = { + "input_ids": input_ids_list, + "position_ids": position_ids_list, + "attention_mask": attention_mask_list, + } + + return BatchEncoding(inputs) + else: + return self._build_inputs_for_generation( + input_ids, + max_input_length=max_input_length, + left_truncate=left_truncate, + max_output_length=max_output_length, + rotary_type=rotary_type, + unidirectional_attention=unidirectional_attention, + **kwargs, + ) + + def _build_inputs_for_train( + self, + inputs: Union[str, List[str]], + outputs: Union[str, List[str]], + new_conversation_offset: List[int] = None, + max_length: int = 2048, + rotary_type: str = "1d", + left_truncate: bool = True, + unidirectional_attention: bool = True, + isolation_position_ids: bool = False, + padding: bool = True, + use_fa2: bool = True, + use_packed: bool = True, + use_baichuan_packed: bool = False, + skip_truncated_turn: bool = False, + return_attention_mask: bool = True, + ): + r""" + Build tensor input for model training. If inputs and outputs are list, will pack them. + + Args: + inputs (str, List[str], List[Dict], List[List[Dict]]): the input prompts. + outputs (str, List[str]): the output responses. 
+ max_length (int, Optional): the maximum length of the final input ids for training. Default: 2048 + rotary_type (str, Optional): the rotary type of position embedding. Default: 1d + left_truncate (bool, Optional): whether truncate the inputs from left. Default: True + use_fa2 (bool, Optional): whether to build attention mask under flash attention 2. + new_conversation_offset (List[int], Optional): 第idx条样本是全新的对话,[0, 1]代表:inputs[0]和outputs[0]是一个对话,inputs[1]和outputs[1]是一个对话. + """ + if use_packed and use_baichuan_packed and unidirectional_attention: + return self._build_baichuan_inputs_for_train( + inputs, + outputs, + new_conversation_offset, + max_length, + rotary_type, + left_truncate, + skip_truncated_turn, + use_fa2, + padding, + ) + if isinstance(inputs, str): + inputs = [inputs] + if isinstance(outputs, str): + outputs = [outputs] + + assert len(inputs) == len(outputs) + + input_ids = [self(item)['input_ids'] for item in inputs] + output_ids = [self(item)['input_ids'] for item in outputs] + + packed_input_ids = [] + packed_output_ids = [] + if new_conversation_offset is None: + new_conversation_offset = list(range(0, len(inputs))) + assert 0 in new_conversation_offset, f"没有0,请检查new_conversation_offset: {new_conversation_offset}" + current_len = 0 + + for idx, (input, output) in enumerate(zip(input_ids, output_ids)): + num_special_tokens = 0 + if not unidirectional_attention: + if idx in new_conversation_offset: + # cls and gmask + num_special_tokens += 2 + else: + # only gmask + num_special_tokens += 1 + else: + # sop and eos + if self.add_bos_token: + num_special_tokens += 2 + else: + num_special_tokens += 1 + + # truncate + if len(input) + len(output) + current_len > max_length - num_special_tokens: + if not use_packed or use_fa2 and unidirectional_attention: + attention_mask = torch.tensor(0) + elif use_fa2: + attention_mask = -1 * torch.ones([2, max_length]) + else: + attention_mask = torch.tril(torch.ones([max_length, max_length])) + # 返回一个空的样本,该样本不参与训练 
+ default_return = { + 'input_ids': (torch.ones(max_length) * self.eos_token_id).long(), + 'position_ids': torch.zeros(2, max_length).long(), + 'attention_mask': (attention_mask.long()), + 'labels': (torch.ones(max_length) * -100).long(), + } + # 如果不截断,直接返回 + if skip_truncated_turn: + if current_len == 0: + return default_return + else: + break + left_len = max_length - num_special_tokens - current_len + # 如果截断,只截断prompt + if left_len - len(output) > 0: + if left_truncate: + input = input[-(left_len - len(output)) :] + else: + input = input[: left_len - len(output)] + else: + # response超过left_len,直接返回 + if current_len == 0: + return default_return + else: + break + if unidirectional_attention: + packed_input_ids.append(list(input)) + else: + if num_special_tokens == 4: + packed_input_ids.append([self.cls_token_id] + list(input) + [self.gmask_token_id]) + else: + packed_input_ids.append(list(input) + [self.gmask_token_id]) + + packed_output_ids.append(list(output) + [self.eos_token_id]) + current_len += len(input) + len(output) + num_special_tokens + + assert current_len <= max_length + + if use_packed: + # pack模式 + def build_mask_matrix(seq_length, sep): + # https://github.com/pytorch/pytorch/issues/101932, fix triu/tril bf16 support + m = torch.ones((1, seq_length, seq_length)) + mask = torch.arange(1, m.shape[-1] + 1).reshape(1, -1, 1).to(m.device) + ids = torch.arange(1, m.shape[-1] + 1).reshape(1, 1, -1).expand(1, m.shape[-1], -1).to(m.device) + m = (ids <= mask).type_as(m) + + m[0, :, : int(sep)] = 1 + m = m.squeeze(0) + return m + + tokens = [] + attention_mask_list = [] + input_length_list = [] + position_id_list = [] + block_position_id_list = [] + for input, output in zip(packed_input_ids, packed_output_ids): + if self.add_bos_token: + data = input + [self.sop_token_id] + output + mask_pos = len(input) - 1 + else: + data = input + output + mask_pos = len(input) - 2 + if return_attention_mask: + if unidirectional_attention: + attention_mask = 
build_mask_matrix(len(data), 0) + else: + attention_mask = build_mask_matrix(len(data), len(input)) + attention_mask = attention_mask.squeeze((0, 1)) + + attention_mask_list.append(attention_mask) + input_length_list.append(len(input)) + tokens += data + + sop_pos = mask_pos + 1 + position_ids, block_position_ids = self._build_position_ids( + mask_pos=mask_pos, bos_pos=sop_pos, max_output_length=len(output), rotary_type=rotary_type + ) + + position_id_list.append(position_ids) + block_position_id_list.append(block_position_ids) + + labels = [] + for i in range(len(packed_input_ids)): + if self.add_bos_token: + labels += [-100] * len(packed_input_ids[i]) + packed_output_ids[i] + [-100] + else: + labels += [-100] * (len(packed_input_ids[i]) - 1) + packed_output_ids[i] + [-100] + + total_len = 0 + if use_fa2: + pack_attention_mask = -1 * torch.ones([2, current_len]) + else: + pack_attention_mask = torch.tril(torch.ones([current_len, current_len])) + + pack_position_ids = [] + pack_block_position_ids = [] + total_len = 0 + max_index = 0 + for i in range(len(position_id_list)): + + if use_fa2: + pack_attention_mask[0][i] = total_len + pack_attention_mask[1][i] = total_len + input_length_list[i] + else: + pack_attention_mask[ + total_len : total_len + attention_mask.shape[0], + total_len : total_len + attention_mask.shape[0], + ] = attention_mask + position_ids = [pid + max_index for pid in position_id_list[i]] + block_position_ids = block_position_id_list[i] + pack_position_ids.extend(position_ids) + pack_block_position_ids.extend(block_position_ids) + if not isolation_position_ids: + max_index = pack_position_ids[-1] + 1 + total_len += len(position_id_list[i]) + position_ids = [pack_position_ids, pack_block_position_ids] + else: + # 单输入模式 + # 真多轮下,一条样本可能会有好几轮对话,此时需要获取第一条样本的结束位置 + if len(new_conversation_offset) > 1: + end_idx = new_conversation_offset[1] + else: + end_idx = 1 + input, output = list(itertools.chain(*packed_input_ids[:end_idx])), list( + 
itertools.chain(*packed_output_ids[:end_idx]) + ) + if self.add_bos_token: + tokens = input + [self.sop_token_id] + output + else: + tokens = input + output + + if self.add_bos_token: + labels = [-100] * len(input) + output + [-100] + position_ids = self._build_position_ids( + mask_pos=len(input) - 1, bos_pos=len(input), max_output_length=len(output), rotary_type=rotary_type + ) + else: + labels = [-100] * (len(input) - 1) + output + [-100] + position_ids = self._build_position_ids( + mask_pos=len(input) - 2, + bos_pos=len(input) - 1, + max_output_length=len(output), + rotary_type=rotary_type, + ) + attention_mask = len(input) + assert current_len == len(tokens) + + # 最大长度补全 + if max_length > 0 and len(tokens) < max_length and padding: + pad_length = max_length - len(tokens) + tokens += [self.pad_token_id] * pad_length + labels.extend([-100] * pad_length) + position_ids[0] += [0] * pad_length + position_ids[1] += [0] * pad_length + + if use_packed: + if use_fa2: + new_attention_mask = -1 * torch.ones([2, max_length]) + new_attention_mask[:, :current_len] = pack_attention_mask + else: + new_attention_mask = torch.tril(torch.ones([max_length, max_length])) + new_attention_mask[:current_len, :current_len] = pack_attention_mask + pack_attention_mask = new_attention_mask.contiguous() + + assert len(tokens) == len(labels) + + if max_length > 0 and padding: + assert len(tokens) == max_length + + if use_fa2 and unidirectional_attention: + # pack_attention_mask = torch.zeros([1], dtype=torch.long) + pack_attention_mask = torch.tensor(0) + + if use_packed: + if not use_fa2: + attention_mask = pack_attention_mask.unsqueeze(0).long() + else: + attention_mask = pack_attention_mask + else: + attention_mask = torch.tensor(attention_mask).long() + return { + 'input_ids': torch.tensor(tokens).long(), + 'position_ids': torch.tensor(position_ids).long(), + 'attention_mask': attention_mask, + 'labels': torch.tensor(labels).long(), + } + + def _build_baichuan_inputs_for_train( + self, 
+ inputs: Union[str, List[str]], + outputs: Union[str, List[str]], + new_conversation_offset: List[int] = None, + max_length: int = 2048, + rotary_type: str = "1d", + left_truncate: bool = True, + skip_truncated_turn: bool = True, + use_fa2: bool = True, + padding: bool = True, + ): + ''' + input: HUMAN u1 ASSISTANT a11 a12 HUMAN u2 ASSISTANT a21 a22 <|endoftext|> HUMAN u1 ASSISTANT a11 a12 HUMAN u2 ASSISTANT a21 a22 <|endoftext|> + output: x x x x x x a11 a12 <|endoftext|> x x x x x x a21 a22 <|endoftext|> x x x x x x x a11 a12 <|endoftext|> x x x x x x a21 a22 <|endoftext|> x + 只适用真多轮+pack数据训练单向模型,需要打开use_true_multiturn + ''' + if isinstance(inputs, str): + inputs = [inputs] + if isinstance(outputs, str): + outputs = [outputs] + assert len(inputs) == len(outputs) + + input_ids = [self(item)['input_ids'] for item in inputs] + output_ids = [self(item)['input_ids'] for item in outputs] + + packed_input_ids = [] + packed_output_ids = [] + + if new_conversation_offset is None: + new_conversation_offset = list(range(0, len(inputs))) + assert 0 in new_conversation_offset, f"没有0,请检查new_conversation_offset: {new_conversation_offset}" + current_len = 0 + + for idx, (input, output) in enumerate(zip(input_ids, output_ids)): + num_special_tokens = 0 + if idx != 0 and idx in new_conversation_offset: + # 在input_ids加入eos,只有第0条样本不加 + num_special_tokens += 1 + + # truncate + if len(input) + len(output) + current_len > max_length - num_special_tokens: + if use_fa2: + attention_mask = torch.tensor(0) + else: + attention_mask = torch.tril(torch.ones([max_length, max_length])) + # 返回一个空的样本,该样本不参与训练 + default_return = { + 'input_ids': (torch.ones(max_length) * self.eos_token_id).long(), + 'position_ids': torch.zeros(2, max_length).long(), + 'attention_mask': (attention_mask.long()), + 'labels': (torch.ones(max_length) * -100).long(), + } + + # 如果不截断,直接返回 + if skip_truncated_turn: + if current_len == 0: + return default_return + else: + break + left_len = max_length - num_special_tokens 
- current_len + # 如果截断,只截断prompt + if left_len - len(output) > 0: + if left_truncate: + input = input[-(left_len - len(output)) :] + else: + input = input[: left_len - len(output)] + else: + # response超过left_len,直接返回 + if current_len == 0: + return default_return + else: + break + # 这里拼的是input_ids + if num_special_tokens == 1: + packed_input_ids.append([self.eos_token_id] + list(input)) + else: + packed_input_ids.append(list(input)) + packed_output_ids.append(list(output)) + current_len += len(input) + len(output) + num_special_tokens + assert current_len <= max_length + + def build_mask_matrix(seq_length, sep): + # https://github.com/pytorch/pytorch/issues/101932, fix triu/tril bf16 support + m = torch.ones((1, seq_length, seq_length)) + mask = torch.arange(1, m.shape[-1] + 1).reshape(1, -1, 1).to(m.device) + ids = torch.arange(1, m.shape[-1] + 1).reshape(1, 1, -1).expand(1, m.shape[-1], -1).to(m.device) + m = (ids <= mask).type_as(m) + + m[0, :, : int(sep)] = 1 + m = m.squeeze(0) + return m + + tokens = [] + attention_mask_list = [] + position_id_list = [] + block_position_id_list = [] + token_lens = [] + for input, output in zip(packed_input_ids, packed_output_ids): + data = input + output + if not use_fa2: + attention_mask = build_mask_matrix(len(data), 0) + attention_mask_list.append(attention_mask) + tokens += data + token_lens.append(len(data)) + + position_ids, block_position_ids = self._build_position_ids( + mask_pos=len(input) - 2, bos_pos=len(input) - 1, max_output_length=len(output), rotary_type=rotary_type + ) + + position_id_list.append(position_ids) + block_position_id_list.append(block_position_ids) + + labels = [] + for i in range(len(packed_input_ids)): + labels += [-100] * (len(packed_input_ids[i]) - 1) + packed_output_ids[i] + [self.eos_token_id] + + total_len = 0 + if use_fa2: + pack_attention_mask = torch.Tensor([[0], [1]]) + else: + pack_attention_mask = torch.tril(torch.ones([max_length, max_length])) + + pack_position_ids = [] + 
pack_block_position_ids = [] + total_len = 0 + max_index = 0 + for i in range(len(token_lens)): + if not use_fa2: + attention_mask = attention_mask_list[i] + pack_attention_mask[ + total_len : total_len + attention_mask.shape[0], total_len : total_len + attention_mask.shape[0] + ] = attention_mask + position_ids = [pid + max_index for pid in position_id_list[i]] + block_position_ids = block_position_id_list[i] + pack_position_ids.extend(position_ids) + pack_block_position_ids.extend(block_position_ids) + max_index = pack_position_ids[-1] + 1 + total_len += token_lens[i] + position_ids = [pack_position_ids, pack_block_position_ids] + + if max_length > 0 and len(tokens) < max_length and padding: + pad_length = max_length - len(tokens) + tokens += [self.pad_token_id] * pad_length + labels.extend([-100] * pad_length) + position_ids[0] += [0] * pad_length + position_ids[1] += [0] * pad_length + + assert len(tokens) == len(labels) + + if not use_fa2: + attention_mask = pack_attention_mask.unsqueeze(0).long() + else: + attention_mask = torch.tensor(0) + return { + 'input_ids': torch.tensor(tokens).long(), + 'position_ids': torch.tensor(position_ids).long(), + 'attention_mask': attention_mask, + 'labels': torch.tensor(labels).long(), + } + + def build_inputs_for_train( + self, + data: Union[Dict, List[Dict]], + new_conversation_offset: List[int] = None, + chat_format="antglm_chat", + is_chat_format=True, # 如果传入的是字符串,用于说明是否已经是 + use_true_multiturn=False, + max_length: int = 2048, + rotary_type: str = "1d", + left_truncate: bool = True, + unidirectional_attention: bool = True, + isolation_position_ids: bool = False, + padding: bool = True, + use_fa2: bool = True, + use_packed: bool = True, + use_baichuan_packed: bool = False, + skip_truncated_turn: bool = False, + return_attention_mask: bool = True, + ): + r""" + Build tensor input for model training. If inputs and outputs are list, will pack them. 
+ + Args: + inputs (str, List[str], List[Dict], List[List[Dict]]): the input prompts. + outputs (str, List[str]): the output responses. + new_conversation_offset (List[int]): the offset index of the new conversation turn. + is_chat_format (bool): whether the input is already chatml format + max_length (int, Optional): the maximum length of the final input ids for training. Default: 2048 + rotary_type (str, Optional): the rotary type of position embedding. Default: 1d + left_truncate (bool, Optional): whether truncate the inputs from left. Default: True + use_fa2 (bool, Optional): whether to build attention mask under flash attention 2. + """ + if isinstance(data, List): + # chatml list + _inputs = [] + _outputs = [] + new_conversation_offset = [] + for _input in data: + if use_true_multiturn: + chat = self._chat_from_json(_input, chat_format=chat_format) + chat_data = chat.prompt_pack + new_conversation_offset.append(len(_inputs)) + _inputs.extend(chat_data['input']) + _outputs.extend(chat_data['output']) + else: + _conversation = _convert_to_conversation(_input) + assert is_assistant(_conversation[-1]) + + _inputs.append( + self.apply_chat_template(_conversation[:-1], tokenize=False, add_generation_prompt=True) + ) + _outputs.append(_conversation[-1]['content']) + + return self._build_inputs_for_train( + inputs=_inputs, + outputs=_outputs, + new_conversation_offset=new_conversation_offset, + max_length=max_length, + rotary_type=rotary_type, + left_truncate=left_truncate, + unidirectional_attention=unidirectional_attention, + isolation_position_ids=isolation_position_ids, + padding=padding, + use_fa2=use_fa2, + use_packed=use_packed, + use_baichuan_packed=use_baichuan_packed, + skip_truncated_turn=skip_truncated_turn, + return_attention_mask=return_attention_mask, + ) + elif isinstance(data, Dict): + if 'messages' in data: + # chatml format + if use_true_multiturn: + chat = self._chat_from_json(data, chat_format=chat_format) + chat_data = chat.prompt_pack + else: + 
_conversation = _convert_to_conversation(data) + assert is_assistant(_conversation[-1]) + + chat_data = { + "input": self.apply_chat_template( + _conversation[:-1], tokenize=False, add_generation_prompt=True + ), + "output": _conversation[-1]['content'], + } + + return self._build_inputs_for_train( + inputs=chat_data['input'], + outputs=chat_data['output'], + max_length=max_length, + rotary_type=rotary_type, + left_truncate=left_truncate, + unidirectional_attention=unidirectional_attention, + isolation_position_ids=isolation_position_ids, + padding=padding, + use_fa2=use_fa2, + use_packed=use_packed, + use_baichuan_packed=use_baichuan_packed, + skip_truncated_turn=skip_truncated_turn, + return_attention_mask=return_attention_mask, + ) + else: + inputs = data['input'] + outputs = data['output'] + + if isinstance(inputs, str): + inputs = [inputs] + if isinstance(outputs, str): + outputs = [outputs] + + if not is_chat_format and chat_format: + inputs = [ + self.apply_chat_template( + [{"role": "HUMAN", "content": item}], tokenize=False, chat_format=chat_format + ) + for item in inputs + ] + + return self._build_inputs_for_train( + inputs=inputs, + outputs=outputs, + new_conversation_offset=new_conversation_offset, + max_length=max_length, + rotary_type=rotary_type, + left_truncate=left_truncate, + unidirectional_attention=unidirectional_attention, + isolation_position_ids=isolation_position_ids, + padding=padding, + use_fa2=use_fa2, + use_packed=use_packed, + use_baichuan_packed=use_baichuan_packed, + skip_truncated_turn=skip_truncated_turn, + return_attention_mask=return_attention_mask, + ) diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..9d96552 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e35981f02e539be62d3fa40b489a2cd13c4869301e6f419a5ff7167c2d4a056 +size 6098787 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..5313248 --- 
/dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2155 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "added_tokens_decoder": { + "126080": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126081": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126082": { + "content": "[CLS]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126083": { + "content": "[gMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126084": { + "content": "<|reserved_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126085": { + "content": "<|reserved_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126086": { + "content": "<|reserved_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126087": { + "content": "<|reserved_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126088": { + "content": "<|reserved_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126089": { + "content": "<|reserved_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126090": { + "content": "<|reserved_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126091": { + "content": "<|reserved_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": 
true + }, + "126092": { + "content": "<|reserved_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126093": { + "content": "<|reserved_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126094": { + "content": "<|reserved_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126095": { + "content": "<|reserved_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126096": { + "content": "<|reserved_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126097": { + "content": "<|reserved_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126098": { + "content": "<|reserved_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126099": { + "content": "<|reserved_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126100": { + "content": "<|reserved_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126101": { + "content": "<|reserved_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126102": { + "content": "<|reserved_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126103": { + "content": "<|reserved_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126104": { + "content": "<|reserved_token_20|>", + "lstrip": false, 
+ "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126105": { + "content": "<|reserved_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126106": { + "content": "<|reserved_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126107": { + "content": "<|reserved_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126108": { + "content": "<|reserved_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126109": { + "content": "<|reserved_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126110": { + "content": "<|reserved_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126111": { + "content": "<|reserved_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126112": { + "content": "<|reserved_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126113": { + "content": "<|reserved_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126114": { + "content": "<|reserved_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126115": { + "content": "<|reserved_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126116": { + "content": "<|reserved_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": 
true + }, + "126117": { + "content": "<|reserved_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126118": { + "content": "<|reserved_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126119": { + "content": "<|reserved_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126120": { + "content": "<|reserved_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126121": { + "content": "<|reserved_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126122": { + "content": "<|reserved_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126123": { + "content": "<|reserved_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126124": { + "content": "<|reserved_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126125": { + "content": "<|reserved_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126126": { + "content": "<|reserved_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126127": { + "content": "<|reserved_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126128": { + "content": "<|reserved_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126129": { + "content": "<|reserved_token_45|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126130": { + "content": "<|reserved_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126131": { + "content": "<|reserved_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126132": { + "content": "<|reserved_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126133": { + "content": "<|reserved_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126134": { + "content": "<|reserved_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126135": { + "content": "<|reserved_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126136": { + "content": "<|reserved_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126137": { + "content": "<|reserved_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126138": { + "content": "<|reserved_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126139": { + "content": "<|reserved_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126140": { + "content": "<|reserved_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126141": { + "content": "<|reserved_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "126142": { + "content": "<|reserved_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126143": { + "content": "<|reserved_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126144": { + "content": "<|reserved_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126145": { + "content": "<|reserved_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126146": { + "content": "<|reserved_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126147": { + "content": "<|reserved_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126148": { + "content": "<|reserved_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126149": { + "content": "<|reserved_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126150": { + "content": "<|reserved_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126151": { + "content": "<|reserved_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126152": { + "content": "<|reserved_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126153": { + "content": "<|reserved_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126154": { + "content": "<|reserved_token_70|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126155": { + "content": "<|reserved_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126156": { + "content": "<|reserved_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126157": { + "content": "<|reserved_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126158": { + "content": "<|reserved_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126159": { + "content": "<|reserved_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126160": { + "content": "<|reserved_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126161": { + "content": "<|reserved_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126162": { + "content": "<|reserved_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126163": { + "content": "<|reserved_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126164": { + "content": "<|reserved_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126165": { + "content": "<|reserved_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126166": { + "content": "<|reserved_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "126167": { + "content": "<|reserved_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126168": { + "content": "<|reserved_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126169": { + "content": "<|reserved_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126170": { + "content": "<|reserved_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126171": { + "content": "<|reserved_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126172": { + "content": "<|reserved_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126173": { + "content": "<|reserved_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126174": { + "content": "<|reserved_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126175": { + "content": "<|reserved_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126176": { + "content": "<|reserved_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126177": { + "content": "<|reserved_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126178": { + "content": "<|reserved_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126179": { + "content": 
"<|reserved_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126180": { + "content": "<|reserved_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126181": { + "content": "<|reserved_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126182": { + "content": "<|reserved_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126183": { + "content": "<|reserved_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126184": { + "content": "<|reserved_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126185": { + "content": "<|reserved_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126186": { + "content": "<|reserved_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126187": { + "content": "<|reserved_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126188": { + "content": "<|reserved_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126189": { + "content": "<|reserved_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126190": { + "content": "<|reserved_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126191": { + "content": "<|reserved_token_107|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "126192": { + "content": "<|reserved_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126193": { + "content": "<|reserved_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126194": { + "content": "<|reserved_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126195": { + "content": "<|reserved_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126196": { + "content": "<|reserved_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126197": { + "content": "<|reserved_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126198": { + "content": "<|reserved_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126199": { + "content": "<|reserved_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126200": { + "content": "<|reserved_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126201": { + "content": "<|reserved_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126202": { + "content": "<|reserved_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126203": { + "content": "<|reserved_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"126204": { + "content": "<|reserved_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126205": { + "content": "<|reserved_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126206": { + "content": "<|reserved_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126207": { + "content": "<|reserved_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126208": { + "content": "<|reserved_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126209": { + "content": "<|reserved_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126210": { + "content": "<|reserved_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126211": { + "content": "<|reserved_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126212": { + "content": "<|reserved_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126213": { + "content": "<|reserved_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126214": { + "content": "<|reserved_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126215": { + "content": "<|reserved_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126216": { + "content": "<|reserved_token_132|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126217": { + "content": "<|reserved_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126218": { + "content": "<|reserved_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126219": { + "content": "<|reserved_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126220": { + "content": "<|reserved_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126221": { + "content": "<|reserved_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126222": { + "content": "<|reserved_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126223": { + "content": "<|reserved_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126224": { + "content": "<|reserved_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126225": { + "content": "<|reserved_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126226": { + "content": "<|reserved_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126227": { + "content": "<|reserved_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126228": { + "content": "<|reserved_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "126229": { + "content": "<|reserved_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126230": { + "content": "<|reserved_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126231": { + "content": "<|reserved_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126232": { + "content": "<|reserved_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126233": { + "content": "<|reserved_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126234": { + "content": "<|reserved_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126235": { + "content": "<|reserved_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126236": { + "content": "<|reserved_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126237": { + "content": "<|reserved_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126238": { + "content": "<|reserved_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126239": { + "content": "<|reserved_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126240": { + "content": "<|reserved_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126241": { + "content": 
"<|reserved_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126242": { + "content": "<|reserved_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126243": { + "content": "<|reserved_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126244": { + "content": "<|reserved_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126245": { + "content": "<|reserved_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126246": { + "content": "<|reserved_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126247": { + "content": "<|reserved_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126248": { + "content": "<|reserved_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126249": { + "content": "<|reserved_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126250": { + "content": "<|reserved_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126251": { + "content": "<|reserved_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126252": { + "content": "<|reserved_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126253": { + "content": "<|reserved_token_169|>", + "lstrip": false, + "normalized": false, 
+ "rstrip": false, + "single_word": false, + "special": true + }, + "126254": { + "content": "<|reserved_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126255": { + "content": "<|reserved_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126256": { + "content": "<|reserved_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126257": { + "content": "<|reserved_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126258": { + "content": "<|reserved_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126259": { + "content": "<|reserved_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126260": { + "content": "<|reserved_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126261": { + "content": "<|reserved_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126262": { + "content": "<|reserved_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126263": { + "content": "<|reserved_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126264": { + "content": "<|reserved_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126265": { + "content": "<|reserved_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"126266": { + "content": "<|reserved_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126267": { + "content": "<|reserved_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126268": { + "content": "<|reserved_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126269": { + "content": "<|reserved_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126270": { + "content": "<|reserved_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126271": { + "content": "<|reserved_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126272": { + "content": "<|reserved_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126273": { + "content": "<|reserved_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126274": { + "content": "<|reserved_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126275": { + "content": "<|reserved_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126276": { + "content": "<|reserved_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126277": { + "content": "<|reserved_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126278": { + "content": "<|reserved_token_194|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126279": { + "content": "<|reserved_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126280": { + "content": "<|reserved_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126281": { + "content": "<|reserved_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126282": { + "content": "<|reserved_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126283": { + "content": "<|reserved_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126284": { + "content": "<|reserved_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126285": { + "content": "<|reserved_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126286": { + "content": "<|reserved_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126287": { + "content": "<|reserved_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126288": { + "content": "<|reserved_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126289": { + "content": "<|reserved_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126290": { + "content": "<|reserved_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "126291": { + "content": "<|reserved_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126292": { + "content": "<|reserved_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126293": { + "content": "<|reserved_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126294": { + "content": "<|reserved_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126295": { + "content": "<|reserved_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126296": { + "content": "<|reserved_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126297": { + "content": "<|reserved_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126298": { + "content": "<|reserved_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126299": { + "content": "<|reserved_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126300": { + "content": "<|reserved_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126301": { + "content": "<|reserved_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126302": { + "content": "<|reserved_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126303": { + "content": 
"<|reserved_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126304": { + "content": "<|reserved_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126305": { + "content": "<|reserved_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126306": { + "content": "<|reserved_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126307": { + "content": "<|reserved_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126308": { + "content": "<|reserved_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126309": { + "content": "<|reserved_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126310": { + "content": "<|reserved_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126311": { + "content": "<|reserved_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126312": { + "content": "<|reserved_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126313": { + "content": "<|reserved_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126314": { + "content": "<|reserved_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126315": { + "content": "<|reserved_token_231|>", + "lstrip": false, + "normalized": false, 
+ "rstrip": false, + "single_word": false, + "special": true + }, + "126316": { + "content": "<|reserved_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126317": { + "content": "<|reserved_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126318": { + "content": "<|reserved_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126319": { + "content": "<|reserved_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126320": { + "content": "<|reserved_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126321": { + "content": "<|reserved_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126322": { + "content": "<|reserved_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126323": { + "content": "<|reserved_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126324": { + "content": "<|reserved_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126325": { + "content": "<|reserved_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126326": { + "content": "<|reserved_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126327": { + "content": "<|reserved_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"126328": { + "content": "<|reserved_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126329": { + "content": "<|reserved_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126330": { + "content": "<|reserved_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126331": { + "content": "<|reserved_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126332": { + "content": "<|reserved_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126333": { + "content": "<|reserved_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126334": { + "content": "<|reserved_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126335": { + "content": "<|reserved_token_251|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126336": { + "content": "<|reserved_token_252|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126337": { + "content": "<|reserved_token_253|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126338": { + "content": "<|reserved_token_254|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126339": { + "content": "<|reserved_token_255|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126340": { + "content": "<role>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126341": { + "content": "</role>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126342": { + "content": "<|arithmetic_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126343": { + "content": "<|arithmetic_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126344": { + "content": "<|number_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "126345": { + "content": "<|number_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<role>", + "</role>", + "<|arithmetic_start|>", + "<|arithmetic_end|>", + "<|number_start|>", + "<|number_end|>" + ], + "bos_token": "<|startoftext|>", + "chat_template": "{% for message in messages %}{% set role = message['role'] | lower %}{% if role == 'user' %}{% set role = 'HUMAN' %}{% endif %}{% set role = role | upper %}{{ '<role>' + role + '</role>' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '<role>ASSISTANT</role>' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "cls_token": "[CLS]", + "eos_token": "<|endoftext|>", + "fast_tokenizer": true, + "gmask_token": "[gMASK]", + "merges_file": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|endoftext|>", + "tokenizer_class": "PreTrainedTokenizerFast", + "trust_remote_code": true, + "vocab_file": null +}