From d7f59851194cfb7e7119289cfb1aed03620fa90a Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 3 Jun 2026 22:36:12 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: togethercomputer/Pythia-Chat-Base-7B Source: Original Platform --- .gitattributes | 51 ++++ README.md | 176 +++++++++++ config.json | 25 ++ configuration.json | 1 + pytorch_model-00001-of-00002.bin | 3 + pytorch_model-00002-of-00002.bin | 3 + pytorch_model.bin.index.json | 491 +++++++++++++++++++++++++++++++ special_tokens_map.json | 5 + tokenizer.json | 3 + tokenizer_config.json | 9 + 10 files changed, 767 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 configuration.json create mode 100644 pytorch_model-00001-of-00002.bin create mode 100644 pytorch_model-00002-of-00002.bin create mode 100644 pytorch_model.bin.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..9ee0220 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,51 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text + +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +pytorch_model-00001-of-00002.bin filter=lfs diff=lfs merge=lfs -text +pytorch_model-00002-of-00002.bin filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..1cb73a6 --- /dev/null +++ b/README.md @@ -0,0 +1,176 @@ +--- +license: apache-2.0 +language: +- en +--- + +***

Feel free to try out our [OpenChatKit feedback app](https://huggingface.co/spaces/togethercomputer/OpenChatKit)!

*** + +# Pythia-Chat-Base-7B-v0.16 + +> TLDR: As part of OpenChatKit (codebase available [here](https://github.com/togethercomputer/OpenChaT)), +> Pythia-Chat-Base-7B-v0.16 is a 7B parameter language model, fine-tuned from EleutherAI’s Pythia 7B with over 40 million instructions on 100% carbon negative compute. + +Pythia-Chat-Base-7B-v0.16 is based on ElutherAI’s Pythia-7B model, and is fine-tuned with data focusing on dialog-style interactions. +We focused the tuning on several tasks such as question answering, classification, extraction, and summarization. +We’ve fine-tuned the model with a collection of 43 million high-quality instructions. +Together partnered with LAION and Ontocord.ai, who both helped curate the dataset the model is based on. +You can read more about this process and the availability of this dataset in LAION’s blog post [here](https://laion.ai/blog/oig-dataset/). + +In addition to the aforementioned fine-tuning, Pythia-Chat-Base-7B-v0.16 has also undergone further fine-tuning via a small amount of feedback data. +This process allows the model to better adapt to human preferences in the conversations. + +One of the notable features of Pythia-Chat-Base-7B-v0.16 is its ability to **run inference on a 12GB GPU**, thanks to the quantization technique. +It helps maintain the dialogue capabilities while making the model more accessible to a wider range of users and hardware configurations. + +## Model Details +- **Developed by**: Together Computer. +- **Model type**: Language Model +- **Language(s)**: English +- **License**: Apache 2.0 +- **Model Description**: A 7B parameter open source chat model, fine-tuned from EleutherAI’s Pythia with over 40M instructions on 100% carbon negative compute +- **Resources for more information**: [GitHub Repository](https://github.com/togethercomputer/OpenChaT). + +# Quick Start + +## GPU Inference + +This requires a GPU with 24GB memory. +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +# init +tokenizer = AutoTokenizer.from_pretrained("togethercomputer/Pythia-Chat-Base-7B-v0.16") +model = AutoModelForCausalLM.from_pretrained("togethercomputer/Pythia-Chat-Base-7B-v0.16", torch_dtype=torch.float16) +model = model.to('cuda:0') + +# infer +inputs = tokenizer(": Hello!\n:", return_tensors='pt').to(model.device) +outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8) +output_str = tokenizer.decode(outputs[0]) +print(output_str) +``` + +## GPU Inference in Int8 + +This requires a GPU with 12GB memory. +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +# init +tokenizer = AutoTokenizer.from_pretrained("togethercomputer/Pythia-Chat-Base-7B-v0.16") +model = AutoModelForCausalLM.from_pretrained("togethercomputer/Pythia-Chat-Base-7B-v0.16", device_map="auto", load_in_8bit=True) + +# infer +inputs = tokenizer(": Hello!\n:", return_tensors='pt').to(model.device) +outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8) +output_str = tokenizer.decode(outputs[0]) +print(output_str) +``` + + +## CPU Inference + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +# init +tokenizer = AutoTokenizer.from_pretrained("togethercomputer/Pythia-Chat-Base-7B-v0.16") +model = AutoModelForCausalLM.from_pretrained("togethercomputer/Pythia-Chat-Base-7B-v0.16", torch_dtype=torch.bfloat16) + +# infer +inputs = tokenizer(": Hello!\n:", return_tensors='pt').to(model.device) +outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8) +output_str = tokenizer.decode(outputs[0]) +print(output_str) +``` + + +## Strengths of the model + +There are several tasks that OpenChatKit excels at out of the box. This includes: + +- Summarization and question answering within context. +- Extraction. +- Classification. + +In addition, the model does well on few-shot prompts. For both classification and extraction, the model performs even better with few shots, as in most HELM tasks. [Contact us](https://www.together.xyz/contact) if you’re interested in trying few-shot prompts with the model. + +## Weaknesses of the model + +That said, there are several areas where we have more work to do, and we need your help! Some of these include: + +- Knowledge-based closed question and answering: The chatbot may hallucinate and give incorrect results. Be sure to fact check, and if possible provide feedback with the corrected information. +- Coding tasks: The chatbot was not trained on a large enough corpus of source code to excel at writing code. We welcome contributions of additional datasets to improve this! +- Repetition: Sometimes the chatbot will repeat its response. We’re working to improve this, but in the meantime you can click the refresh button to start a new conversation. +- Context switching: If you change the topic in the middle of a conversation the chatbot often cannot make the switch automatically and will continue to give answers related to the prior topic. +- Creative writing and longer answers: The chatbot does not generate long, creative text such as an essay or story. + +We are excited to work with you to address these weaknesses by getting your feedback, bolstering data sets, and improving accuracy. + +# Uses + +## Direct Use + +The model is intended for research purposes. Possible research areas and tasks include + +- Safe deployment of models which have the potential to generate harmful content. +- Probing and understanding the limitations and biases of dialogue models or language models. +- Generation of artworks and use in design and other artistic processes. +- Applications in educational or creative tools. +- Research on dialogue models or language models. + +Excluded uses are described below. + +### Misuse, Malicious Use, and Out-of-Scope Use + +The OpenChatKit community provides Pythia-Chat-Base-7B-v0.16 as an open source tool for building chatbots. +The community is not responsible for any misuse, malicious use, or out-of-scope use of the model. +It is the responsibility of the end user to ensure that the model is used in a responsible and ethical manner. + +#### Out-of-Scope Use + +Pythia-Chat-Base-7B-v0.16 is designed for use in chatbot applications and may not perform well for other use cases outside of its intended scope. +For example, it may not be suitable for use in safety-critical applications or for making decisions that have a significant impact on individuals or society. +It is important to consider the limitations of the model and to only use it for its intended purpose. + +#### Misuse and Malicious Use + +Pythia-Chat-Base-7B-v0.16 is designed for use in chatbot applications and should not be used for any other purpose. +Misuse of the model, such as using it to engage in illegal or unethical activities, is strictly prohibited and goes against the principles of the OpenChatKit community project. + +Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to: + +- Generating fake news, misinformation, or propaganda +- Promoting hate speech, discrimination, or violence against individuals or groups +- Impersonating individuals or organizations without their consent +- Engaging in cyberbullying or harassment +- Defamatory content +- Spamming or scamming +- Sharing confidential or sensitive information without proper authorization +- Violating the terms of use of the model or the data used to train it +- Creating automated bots for malicious purposes such as spreading malware, phishing scams, or spamming + +## Limitations + +Pythia-Chat-Base-7B-v0.16, like other language model-based chatbots, has limitations that should be taken into consideration. +For example, the model may not always provide accurate or relevant answers, particularly for questions that are complex, ambiguous, or outside of its training data. +We therefore welcome contributions from individuals and organizations, and encourage collaboration towards creating a more robust and inclusive chatbot. + +## Training + +**Training Data** + +Please refer to [togethercomputer/OpenDataHub](https://github.com/togethercomputer/OpenDataHub) + +**Training Procedure** + +- **Hardware:** 8 x A100 GPUs +- **Optimizer:** [8bit-AdamW](https://github.com/TimDettmers/bitsandbytes) +- **Gradient Accumulations**: 4 +- **Batch:** 4 x 4 x 16 x 2048 = 524288 tokens +- **Learning rate:** warmup to 1e-5 for 100 steps and then kept constant + +## Community + +Join us on [Together Discord](https://discord.gg/6ZVDU8tTD4) \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e53527d --- /dev/null +++ b/config.json @@ -0,0 +1,25 @@ +{ + "_name_or_path": "togethercomputer/Pythia-Chat-Base-7B-v0.16", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "bos_token_id": 0, + "eos_token_id": 0, + "hidden_act": "gelu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 16384, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 2048, + "model_type": "gpt_neox", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.21.1", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50432 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/pytorch_model-00001-of-00002.bin b/pytorch_model-00001-of-00002.bin new file mode 100644 index 0000000..3bad984 --- /dev/null +++ b/pytorch_model-00001-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76c64e60ad71d8835fbcf6be49321594f5c02b85f0e3d8cda8b6781d1531b321 +size 10045933972 diff --git a/pytorch_model-00002-of-00002.bin b/pytorch_model-00002-of-00002.bin new file mode 100644 index 0000000..8e72a87 --- /dev/null +++ b/pytorch_model-00002-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ac9e162f32e582db7a54b65df40f73a4e0243f61597e6cf212369030ac13e6f +size 3803055346 diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json new file mode 100644 index 0000000..94ed2a2 --- /dev/null +++ b/pytorch_model.bin.index.json @@ -0,0 +1,491 @@ +{ + "metadata": { + "total_size": 13848822848 + }, + "weight_map": { + "embed_out.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.embed_in.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.0.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.23.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.23.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.23.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.23.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.24.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.3.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.30.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.4.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..0204ed1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,5 @@ +{ + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "unk_token": "<|endoftext|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..8fa6a67 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c24618a1b3e6a38167beff1c72cffd126c3a66254347304b50547d12c5f25624 +size 2113710 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..89c7087 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,9 @@ +{ + "add_prefix_space": false, + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "name_or_path": "EleutherAI/pythia-6.9b-deduped", + "special_tokens_map_file": "/fsx/home-hailey/.cache/huggingface/hub/models--EleutherAI--gpt-neox-20b/snapshots/3523781c8df75f7741687a4284f6f70e1afa12f4/special_tokens_map.json", + "tokenizer_class": "GPTNeoXTokenizer", + "unk_token": "<|endoftext|>" +}