From 34e5aef21d565af91820c6fedd0b4bd9d00d6b9a Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Mon, 8 Jun 2026 10:18:16 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: hakurei/instruct-12b Source: Original Platform --- .gitattributes | 34 ++ README.md | 105 ++++++ config.json | 25 ++ generation_config.json | 6 + pytorch_model-00001-of-00005.bin | 3 + pytorch_model-00002-of-00005.bin | 3 + pytorch_model-00003-of-00005.bin | 3 + pytorch_model-00004-of-00005.bin | 3 + pytorch_model-00005-of-00005.bin | 3 + pytorch_model.bin.index.json | 551 +++++++++++++++++++++++++++++++ special_tokens_map.json | 6 + tokenizer.json | 3 + tokenizer_config.json | 11 + 13 files changed, 756 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 pytorch_model-00001-of-00005.bin create mode 100644 pytorch_model-00002-of-00005.bin create mode 100644 pytorch_model-00003-of-00005.bin create mode 100644 pytorch_model-00004-of-00005.bin create mode 100644 pytorch_model-00005-of-00005.bin create mode 100644 pytorch_model.bin.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c7d9f33 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,34 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib 
filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..95b4892 --- /dev/null +++ b/README.md @@ -0,0 +1,105 @@ +--- +license: apache-2.0 +datasets: +- hakurei/open-instruct-v1 +language: +- en +library_name: transformers +pipeline_tag: text-generation +tags: +- instruct +- code +- pythia +inference: false +thumbnail: >- + https://cdn.discordapp.com/attachments/886209362572476486/1095084844066820196/instruct-12b2.png +--- + +# Instruct-12B + +This model was created by finetuning [EleutherAI's Pythia 12B deduped model](https://huggingface.co/EleutherAI/pythia-12b-deduped) on the [Open-Instruct v1 dataset](https://huggingface.co/datasets/hakurei/open-instruct-v1) for the purpose of allowing Large Language Models to follow the user's intentions. 
Additionally, this model was developed with the intention of creating an open-source alternative for [OpenAI's InstructGPT series of models](https://openai.com/research/instruction-following), therefore the prompting for Instruct-12B and GPT-3.5 will be quite similar. + +If there are any questions that you have regarding this model, please feel free to reach out in my [Discord server](https://discord.gg/touhouai) or in DMs: ``haru#1367``. + +## Usage + +Every prompt, user input, and generated output is delimited by double newlines in the dataset. Therefore, to prompt with this model, you would have to write your prompts in the format of: + +``` +user instruction here + +optional additional user input + +generated output +``` + +So, from the above, a formatted prompt could look something like: + +``` +Does this tweet have negative or positive sentiment? + +i hate my life!!!! +``` + +And the following generation would look like this: + +``` +Negative +``` + +## Example Outputs + +``` +Explain what computers are while sounding like a western cowboy + +Computer, pardner. That there machine can store and process information faster than the fastest gunslinger in these parts. It's got more memory than a steer and can make calculations quicker than a pocket watch. It's the equalizer when it comes to computin', and I aim to keep it that way. +``` + +``` +List me several ways on how to be happy + +- Spend time with loved ones +- Take up a hobby +- Exercise regularly +- Eat healthy food +- Get enough sleep +- Stay positive +- Avoid negative people +- Practice meditation +- Do things that make you smile +- Make time for yourself +``` + +``` +How do I make a cake? + +Ingredients: +- Flour +- Sugar +- Eggs +- Milk +- Butter +- Baking powder +- Vanilla extract +- Salt +- Baking soda +- Vegetable oil +Steps: +1. Preheat the oven to 350 degrees. +2. Grease a 9x13 inch baking pan. +3. Sift together the flour, sugar, baking powder, baking soda, and salt in a large bowl. +4. 
In another bowl, mix the eggs, milk, and vanilla extract. +5. Stir in the vegetable oil and mix until combined. +6. Pour the wet ingredients into the dry ingredients and stir until just combined. +7. Pour batter into the prepared pan. +8. Bake for 30 minutes or until a toothpick inserted in the center comes out clean. +9. Allow to cool before frosting. +10. Frost the cake. +11. Enjoy! +``` + +### Contact + +You can reach out to me in my [Discord server](https://discord.gg/touhouai) for any questions. + +[![Discord Server](https://discordapp.com/api/guilds/930499730843250783/widget.png?style=banner2)](https://discord.gg/touhouai) \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..bf9cb3f --- /dev/null +++ b/config.json @@ -0,0 +1,25 @@ +{ + "_name_or_path": "base-12b", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "bos_token_id": 0, + "eos_token_id": 0, + "hidden_act": "gelu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 20480, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 2048, + "model_type": "gpt_neox", + "num_attention_heads": 40, + "num_hidden_layers": 36, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.28.0.dev0", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50277 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..7deffd9 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "eos_token_id": 0, + "transformers_version": "4.28.0.dev0" +} diff --git a/pytorch_model-00001-of-00005.bin b/pytorch_model-00001-of-00005.bin new file mode 100644 index 0000000..62d8762 --- /dev/null +++ b/pytorch_model-00001-of-00005.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:64a8943b47bd8400f94812d4ebd078031ae75f3a8378e5bf93728a4c1c51df11 +size 9873250630 diff --git a/pytorch_model-00002-of-00005.bin b/pytorch_model-00002-of-00005.bin new file mode 100644 index 0000000..ee00b7f --- /dev/null +++ b/pytorch_model-00002-of-00005.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae7081ab939bd10ad3877ab47bac46072d3425411e3c65dbcb0310b2acc9b6cc +size 9678325248 diff --git a/pytorch_model-00003-of-00005.bin b/pytorch_model-00003-of-00005.bin new file mode 100644 index 0000000..80ba5fc --- /dev/null +++ b/pytorch_model-00003-of-00005.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6fb67a01b0fa50713399abdc7358825cf1f992210051b99943dd72cf0181971 +size 9682542681 diff --git a/pytorch_model-00004-of-00005.bin b/pytorch_model-00004-of-00005.bin new file mode 100644 index 0000000..5f75ebb --- /dev/null +++ b/pytorch_model-00004-of-00005.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7509ac3473e4a2fe666ff40f8215ecb3b9d37a7cd26b27bcff56e105fbf1117 +size 9997176965 diff --git a/pytorch_model-00005-of-00005.bin b/pytorch_model-00005-of-00005.bin new file mode 100644 index 0000000..cc17f8a --- /dev/null +++ b/pytorch_model-00005-of-00005.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ad7e99b0d3f2e3ef1a1f9d9e4dcbdebf2f163101e25c1d8db357e3e63250271 +size 8287342839 diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json new file mode 100644 index 0000000..6d5b13c --- /dev/null +++ b/pytorch_model.bin.index.json @@ -0,0 +1,551 @@ +{ + "metadata": { + "total_size": 47386330312.0 + }, + "weight_map": { + "embed_out.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.embed_in.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.final_layer_norm.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.final_layer_norm.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.0.attention.bias": 
"pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.attention.dense.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.attention.dense.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.attention.masked_bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.attention.query_key_value.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.attention.query_key_value.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.input_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.post_attention_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.attention.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.attention.dense.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.attention.dense.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.attention.masked_bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.attention.query_key_value.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.attention.query_key_value.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.input_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00005.bin", + 
"gpt_neox.layers.1.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.post_attention_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.10.attention.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.attention.dense.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.attention.dense.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.attention.masked_bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.attention.query_key_value.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.attention.query_key_value.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.input_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.input_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.post_attention_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.attention.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.attention.dense.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.attention.dense.weight": 
"pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.attention.masked_bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.attention.query_key_value.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.attention.query_key_value.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.input_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.post_attention_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.attention.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.attention.dense.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.attention.dense.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.attention.masked_bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.attention.query_key_value.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.attention.query_key_value.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.input_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.mlp.dense_4h_to_h.weight": 
"pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.post_attention_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.attention.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.attention.dense.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.attention.dense.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.attention.masked_bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.attention.query_key_value.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.attention.query_key_value.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.input_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.post_attention_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.attention.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.attention.dense.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.attention.dense.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.attention.masked_bias": "pytorch_model-00002-of-00005.bin", + 
"gpt_neox.layers.14.attention.query_key_value.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.attention.query_key_value.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.input_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.14.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.14.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.post_attention_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.15.attention.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.attention.dense.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.attention.dense.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.attention.masked_bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.attention.query_key_value.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.attention.query_key_value.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.input_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.input_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00005.bin", + 
"gpt_neox.layers.15.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.post_attention_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.15.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.attention.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.attention.dense.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.attention.dense.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.attention.masked_bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.attention.query_key_value.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.attention.query_key_value.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.input_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.input_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.post_attention_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.16.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.attention.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.attention.dense.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.attention.dense.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.attention.masked_bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.attention.query_key_value.bias": "pytorch_model-00003-of-00005.bin", + 
"gpt_neox.layers.17.attention.query_key_value.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.input_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.input_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.post_attention_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.17.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.attention.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.attention.dense.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.attention.dense.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.attention.masked_bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.attention.query_key_value.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.attention.query_key_value.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.input_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.input_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00005.bin", + 
"gpt_neox.layers.18.post_attention_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.18.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.attention.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.attention.dense.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.attention.dense.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.attention.masked_bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.attention.query_key_value.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.attention.query_key_value.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.input_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.input_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.post_attention_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.19.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.2.attention.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.attention.dense.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.attention.dense.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.attention.masked_bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.attention.query_key_value.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.attention.query_key_value.weight": "pytorch_model-00001-of-00005.bin", + 
"gpt_neox.layers.2.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.input_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.post_attention_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.20.attention.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.attention.dense.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.attention.dense.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.attention.masked_bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.attention.query_key_value.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.attention.query_key_value.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.input_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.input_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.20.post_attention_layernorm.bias": "pytorch_model-00003-of-00005.bin", + 
"gpt_neox.layers.20.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.attention.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.attention.dense.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.attention.dense.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.attention.masked_bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.attention.query_key_value.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.attention.query_key_value.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.input_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.input_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.post_attention_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.21.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.22.attention.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.22.attention.dense.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.22.attention.dense.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.22.attention.masked_bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.22.attention.query_key_value.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.22.attention.query_key_value.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.22.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00005.bin", + 
"gpt_neox.layers.22.input_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.22.input_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.22.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.22.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.22.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.22.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.22.post_attention_layernorm.bias": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.22.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin", + "gpt_neox.layers.23.attention.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.attention.dense.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.attention.dense.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.attention.masked_bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.attention.query_key_value.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.attention.query_key_value.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.input_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.input_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.post_attention_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.23.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.attention.bias": 
"pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.attention.dense.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.attention.dense.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.attention.masked_bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.attention.query_key_value.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.attention.query_key_value.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.input_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.input_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.post_attention_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.24.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.attention.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.attention.dense.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.attention.dense.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.attention.masked_bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.attention.query_key_value.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.attention.query_key_value.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.input_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.input_layernorm.weight": "pytorch_model-00004-of-00005.bin", + 
"gpt_neox.layers.25.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.post_attention_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.25.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.attention.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.attention.dense.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.attention.dense.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.attention.masked_bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.attention.query_key_value.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.attention.query_key_value.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.input_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.input_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.post_attention_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.26.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.attention.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.attention.dense.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.attention.dense.weight": 
"pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.attention.masked_bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.attention.query_key_value.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.attention.query_key_value.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.input_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.input_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.post_attention_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.27.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.attention.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.attention.dense.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.attention.dense.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.attention.masked_bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.attention.query_key_value.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.attention.query_key_value.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.input_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.input_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.mlp.dense_4h_to_h.weight": 
"pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.post_attention_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.28.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.attention.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.attention.dense.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.attention.dense.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.attention.masked_bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.attention.query_key_value.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.attention.query_key_value.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.input_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.input_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.mlp.dense_4h_to_h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.mlp.dense_h_to_4h.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.post_attention_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.29.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.3.attention.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.attention.dense.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.attention.dense.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.attention.masked_bias": "pytorch_model-00001-of-00005.bin", + 
"gpt_neox.layers.3.attention.query_key_value.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.attention.query_key_value.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.input_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.post_attention_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.30.attention.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.30.attention.dense.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.30.attention.dense.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.30.attention.masked_bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.30.attention.query_key_value.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.30.attention.query_key_value.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.30.attention.rotary_emb.inv_freq": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.30.input_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.30.input_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.30.mlp.dense_4h_to_h.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.30.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.30.mlp.dense_h_to_4h.bias": "pytorch_model-00005-of-00005.bin", + 
"gpt_neox.layers.30.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.30.post_attention_layernorm.bias": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.30.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin", + "gpt_neox.layers.31.attention.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.attention.dense.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.attention.dense.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.attention.masked_bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.attention.query_key_value.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.attention.query_key_value.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.attention.rotary_emb.inv_freq": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.input_layernorm.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.input_layernorm.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.mlp.dense_4h_to_h.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.mlp.dense_h_to_4h.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.post_attention_layernorm.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.31.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.attention.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.attention.dense.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.attention.dense.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.attention.masked_bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.attention.query_key_value.bias": "pytorch_model-00005-of-00005.bin", + 
"gpt_neox.layers.32.attention.query_key_value.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.attention.rotary_emb.inv_freq": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.input_layernorm.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.input_layernorm.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.mlp.dense_4h_to_h.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.mlp.dense_h_to_4h.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.post_attention_layernorm.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.32.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.attention.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.attention.dense.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.attention.dense.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.attention.masked_bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.attention.query_key_value.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.attention.query_key_value.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.attention.rotary_emb.inv_freq": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.input_layernorm.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.input_layernorm.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.mlp.dense_4h_to_h.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.mlp.dense_h_to_4h.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00005.bin", + 
"gpt_neox.layers.33.post_attention_layernorm.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.33.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.attention.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.attention.dense.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.attention.dense.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.attention.masked_bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.attention.query_key_value.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.attention.query_key_value.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.attention.rotary_emb.inv_freq": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.input_layernorm.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.input_layernorm.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.mlp.dense_4h_to_h.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.mlp.dense_h_to_4h.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.post_attention_layernorm.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.34.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.attention.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.attention.dense.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.attention.dense.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.attention.masked_bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.attention.query_key_value.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.attention.query_key_value.weight": "pytorch_model-00005-of-00005.bin", + 
"gpt_neox.layers.35.attention.rotary_emb.inv_freq": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.input_layernorm.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.input_layernorm.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.mlp.dense_4h_to_h.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.mlp.dense_h_to_4h.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.post_attention_layernorm.bias": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.35.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin", + "gpt_neox.layers.4.attention.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.attention.dense.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.attention.dense.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.attention.masked_bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.attention.query_key_value.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.attention.query_key_value.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.input_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.post_attention_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.4.post_attention_layernorm.weight": 
"pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.attention.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.attention.dense.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.attention.dense.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.attention.masked_bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.attention.query_key_value.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.attention.query_key_value.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.input_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.post_attention_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.attention.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.attention.dense.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.attention.dense.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.attention.masked_bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.attention.query_key_value.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.attention.query_key_value.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.input_layernorm.bias": "pytorch_model-00001-of-00005.bin", + 
"gpt_neox.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.post_attention_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.7.attention.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.7.attention.dense.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.7.attention.dense.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.7.attention.masked_bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.7.attention.query_key_value.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.7.attention.query_key_value.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.7.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.7.input_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.7.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.7.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.7.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.7.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.7.post_attention_layernorm.bias": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin", + "gpt_neox.layers.8.attention.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.attention.dense.bias": 
"pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.attention.dense.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.attention.masked_bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.attention.query_key_value.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.attention.query_key_value.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.input_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.post_attention_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.8.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.attention.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.attention.dense.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.attention.dense.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.attention.masked_bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.attention.query_key_value.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.attention.query_key_value.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.input_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.input_layernorm.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00005.bin", + 
"gpt_neox.layers.9.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.post_attention_layernorm.bias": "pytorch_model-00002-of-00005.bin", + "gpt_neox.layers.9.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..7433646 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..0e86b81 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f4501b22e21aea75ec8ed4209fa5f71bf9ceea2cfb7815426ac632ce712843f +size 2113971 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..fbd09e4 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,11 @@ +{ + "add_prefix_space": false, + "bos_token": "<|endoftext|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|endoftext|>", + "model_max_length": 512, + "padding_side": "right", + "special_tokens_map_file": "/admin/home-hailey/.cache/huggingface/hub/models--EleutherAI--gpt-neox-20b/snapshots/4e49eadb5d14bd22f314ec3f45b69a87b88c7691/special_tokens_map.json", + "tokenizer_class": "GPTNeoXTokenizer", + "unk_token": "<|endoftext|>" +}