From 6eb85e996c3c2e822105734b2cb829342a8284b4 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Thu, 4 Jun 2026 13:42:13 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: togethercomputer/RedPajama-INCITE-7B-Instruct Source: Original Platform --- .gitattributes | 51 ++++ README.md | 344 ++++++++++++++++++++++ config.json | 25 ++ configuration.json | 1 + generation_config.json | 6 + pytorch_model-00001-of-00002.bin | 3 + pytorch_model-00002-of-00002.bin | 3 + pytorch_model.bin.index.json | 491 +++++++++++++++++++++++++++++++ special_tokens_map.json | 5 + tokenizer.json | 3 + tokenizer_config.json | 9 + 11 files changed, 941 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 configuration.json create mode 100644 generation_config.json create mode 100644 pytorch_model-00001-of-00002.bin create mode 100644 pytorch_model-00002-of-00002.bin create mode 100644 pytorch_model.bin.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..e8878d9 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,51 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text + +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs 
-text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +pytorch_model-00002-of-00002.bin filter=lfs diff=lfs merge=lfs -text +pytorch_model-00001-of-00002.bin filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..5665516 --- /dev/null +++ b/README.md @@ -0,0 +1,344 @@ +--- +license: apache-2.0 +language: +- en +datasets: +- togethercomputer/RedPajama-Data-1T +- togethercomputer/RedPajama-Data-Instruct +widget: +- text: |- + Label the sentences as 
either 'positive', 'negative', 'mixed', or 'neutral': + + Sentence: I can say that there isn't anything I would change. + Label: positive + + Sentence: I'm not sure about this. + Label: neutral + + Sentence: I liked some parts but I didn't like other parts. + Label: mixed + + Sentence: I think the background image could have been better. + Label: negative + + Sentence: I really like it. + Label: + example_title: Sentiment Analysis +- text: |- + Please answer the following question: + + Question: What is the capital of Canada? + Answer: Ottawa + + Question: What is the currency of Switzerland? + Answer: Swiss franc + + Question: In which country is Wisconsin located? + Answer: + example_title: Question Answering +- text: >- + Given a news article, classify its topic. + + Possible labels: 1. World 2. Sports 3. Business 4. Sci/Tech + + + Article: A nearby star thought to harbor comets and asteroids now appears to + be home to planets, too. + + Label: Sci/Tech + + + Article: Soaring crude prices plus worries about the economy and the outlook + for earnings are expected to hang over the stock market next week during the + depth of the summer doldrums. + + Label: Business + + + Article: Murtagh a stickler for success Northeastern field hockey coach + Cheryl Murtagh doesn't want the glare of the spotlight that shines on her to + detract from a team that has been the America East champion for the past + three years and has been to the NCAA tournament 13 times. + + Label:: + example_title: Topic Classification +- text: |- + Paraphrase the given sentence into a different sentence. + + Input: Can you recommend some upscale restaurants in New York? + Output: What upscale restaurants do you recommend in New York? + + Input: What are the famous places we should not miss in Paris? + Output: Recommend some of the best places to visit in Paris? + + Input: Could you recommend some hotels that have cheap price in Zurich? 
+ Output: + example_title: Paraphrasing +- text: >- + Given a review from Amazon's food products, the task is to generate a short + summary of the given review in the input. + + + Input: I have bought several of the Vitality canned dog food products and + have found them all to be of good quality. The product looks more like a + stew than a processed meat and it smells better. My Labrador is finicky and + she appreciates this product better than most. + + Output: Good Quality Dog Food + + + Input: Product arrived labeled as Jumbo Salted Peanuts...the peanuts were + actually small sized unsalted. Not sure if this was an error or if the + vendor intended to represent the product as 'Jumbo'. + + Output: Not as Advertised + + + Input: My toddler loves this game to a point where he asks for it. That's a + big thing for me. Secondly, no glitching unlike one of their competitors + (PlayShifu). Any tech I don’t have to reach out to support for help is a + good tech for me. I even enjoy some of the games and activities in this. + Overall, this is a product that shows that the developers took their time + and made sure people would not be asking for refund. I’ve become bias + regarding this product and honestly I look forward to buying more of this + company’s stuff. Please keep up the great work. + + Output: + example_title: Text Summarization +- text: |- + Identify which sense of a word is meant in a given context. + + Context: The river overflowed the bank. + Word: bank + Sense: river bank + + Context: A mouse takes much more room than a trackball. + Word: mouse + Sense: computer mouse + + Context: The bank will not be accepting cash on Saturdays. + Word: bank + Sense: commercial (finance) banks + + Context: Bill killed the project + Word: kill + Sense: + example_title: Word Sense Disambiguation +- text: >- + Given a pair of sentences, choose whether the two sentences agree + (entailment)/disagree (contradiction) with each other. + + Possible labels: 1. entailment 2. 
contradiction + + + Sentence 1: The skier was on the edge of the ramp. Sentence 2: The skier was + dressed in winter clothes. + + Label: entailment + + + Sentence 1: The boy skated down the staircase railing. Sentence 2: The boy + is a newbie skater. + + Label: contradiction + + + Sentence 1: Two middle-aged people stand by a golf hole. Sentence 2: A + couple riding in a golf cart. + + Label: + example_title: Natural Language Inference +inference: + parameters: + temperature: 0.7 + top_p: 0.7 + top_k: 50 + max_new_tokens: 128 +--- + +# RedPajama-INCITE-7B-Instruct + +RedPajama-INCITE-7B-Instruct was developed by Together and leaders from the open-source AI community including Ontocord.ai, ETH DS3Lab, AAI CERC, Université de Montréal, MILA - Québec AI Institute, Stanford Center for Research on Foundation Models (CRFM), Stanford Hazy Research research group and LAION. + +The model was fine-tuned for few-shot applications on the data of [GPT-JT](https://huggingface.co/togethercomputer/GPT-JT-6B-v1), with exclusion of tasks that overlap with the HELM core scenarios. + + - Base Model: [RedPajama-INCITE-7B-Base](https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Base) + - Instruction-tuned Version: [RedPajama-INCITE-7B-Instruct](https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Instruct) + - Chat Version: [RedPajama-INCITE-7B-Chat](https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Chat) + + +## Model Details +- **Developed by**: Together Computer. +- **Model type**: Language Model +- **Language(s)**: English +- **License**: Apache 2.0 +- **Model Description**: A 6.9B parameter pretrained language model. + +# Quick Start + +Please note that the model requires `transformers` version >= 4.25.1. + +## GPU Inference + +This requires a GPU with 16GB memory. 
+ +```python +import torch +import transformers +from transformers import AutoTokenizer, AutoModelForCausalLM + +MIN_TRANSFORMERS_VERSION = '4.25.1' + +# check transformers version +assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.' + +# init +tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct") +model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct", torch_dtype=torch.float16) +model = model.to('cuda:0') +# infer +prompt = "Q: The capital of France is?\nA:" +inputs = tokenizer(prompt, return_tensors='pt').to(model.device) +input_length = inputs.input_ids.shape[1] +outputs = model.generate( + **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True +) +token = outputs.sequences[0, input_length:] +output_str = tokenizer.decode(token) +print(output_str) +""" +Paris +""" +``` + +## GPU Inference in Int8 + +This requires a GPU with 12GB memory. + +To run inference with int8, please ensure you have installed accelerate and bitsandbytes. You can install them with the following command: + +```bash +pip install accelerate +pip install bitsandbytes +``` + +Then you can run inference with int8 as follows: + +```python +import torch +import transformers +from transformers import AutoTokenizer, AutoModelForCausalLM + +MIN_TRANSFORMERS_VERSION = '4.25.1' + +# check transformers version +assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.' 
+ +# init +tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct") +model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct", device_map='auto', torch_dtype=torch.float16, load_in_8bit=True) + +# infer +prompt = "Q: The capital of France is?\nA:" +inputs = tokenizer(prompt, return_tensors='pt').to(model.device) +input_length = inputs.input_ids.shape[1] +outputs = model.generate( + **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True +) +token = outputs.sequences[0, input_length:] +output_str = tokenizer.decode(token) +print(output_str) +""" +Paris +""" +``` + +## CPU Inference + +```python +import torch +import transformers +from transformers import AutoTokenizer, AutoModelForCausalLM + +MIN_TRANSFORMERS_VERSION = '4.25.1' + +# check transformers version +assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.' + +# init +tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct") +model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct", torch_dtype=torch.bfloat16) +# infer +prompt = "Q: The capital of France is?\nA:" +inputs = tokenizer(prompt, return_tensors='pt').to(model.device) +input_length = inputs.input_ids.shape[1] +outputs = model.generate( + **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True +) +token = outputs.sequences[0, input_length:] +output_str = tokenizer.decode(token) +print(output_str) +""" +Paris +""" +``` + +Please note that since `LayerNormKernelImpl` is not implemented in fp16 for CPU, we use `bfloat16` for CPU inference. + + +# Uses + +## Direct Use + +Excluded uses are described below. 
+ +### Misuse, Malicious Use, and Out-of-Scope Use + +It is the responsibility of the end user to ensure that the model is used in a responsible and ethical manner. + +#### Out-of-Scope Use + +RedPajama-INCITE-7B-Instruct is a language model and may not perform well for other use cases outside of its intended scope. +For example, it may not be suitable for use in safety-critical applications or for making decisions that have a significant impact on individuals or society. +It is important to consider the limitations of the model and to only use it for its intended purpose. + +#### Misuse and Malicious Use + +RedPajama-INCITE-7B-Instruct is designed for language modeling. +Misuse of the model, such as using it to engage in illegal or unethical activities, is strictly prohibited and goes against the principles of the project. + +Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to: + +- Generating fake news, misinformation, or propaganda +- Promoting hate speech, discrimination, or violence against individuals or groups +- Impersonating individuals or organizations without their consent +- Engaging in cyberbullying or harassment +- Defamatory content +- Spamming or scamming +- Sharing confidential or sensitive information without proper authorization +- Violating the terms of use of the model or the data used to train it +- Creating automated bots for malicious purposes such as spreading malware, phishing scams, or spamming + +## Limitations + +RedPajama-INCITE-7B-Instruct, like other language models, has limitations that should be taken into consideration. +For example, the model may not always provide accurate or relevant answers, particularly for questions that are complex, ambiguous, or outside of its training data. +We therefore welcome contributions from individuals and organizations, and encourage collaboration towards creating a more robust and inclusive language model. 
+ +## Training + +**Training Data** + +Please refer to [togethercomputer/RedPajama-Data-1T](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T) + +**Training Procedure** + +- **Hardware:** 8 A100 +- **Optimizer:** Adam +- **Gradient Accumulations**: 1 +- **Num of Tokens:** 1B tokens +- **Learning rate:** 1e-5 + +## Community + +Join us on [Together Discord](https://discord.gg/6ZVDU8tTD4) \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..3b24590 --- /dev/null +++ b/config.json @@ -0,0 +1,25 @@ +{ + "_name_or_path": "togethercomputer/RedPajama-INCITE-7B-Instruct", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "bos_token_id": 0, + "eos_token_id": 0, + "hidden_act": "gelu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 16384, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 2048, + "model_type": "gpt_neox", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "rotary_emb_base": 10000, + "rotary_pct": 1.0, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.28.1", + "use_cache": true, + "use_parallel_residual": false, + "vocab_size": 50432 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..344b847 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "eos_token_id": 0, + "transformers_version": "4.28.1" +} diff --git a/pytorch_model-00001-of-00002.bin b/pytorch_model-00001-of-00002.bin new file mode 100644 index 0000000..62afd9e --- /dev/null +++ b/pytorch_model-00001-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a7b5df87c35bb62501ed7f7e1096330b29de6edccb2eada4d072828ec42ac881 +size 10045936084 diff --git a/pytorch_model-00002-of-00002.bin b/pytorch_model-00002-of-00002.bin new file mode 100644 index 0000000..4b23b7c --- /dev/null +++ b/pytorch_model-00002-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a66fa668ccef5bb81c3e15f906477015f88dd2dd245a7e22fc6ba563b165f0aa +size 3803056370 diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json new file mode 100644 index 0000000..9f60147 --- /dev/null +++ b/pytorch_model.bin.index.json @@ -0,0 +1,491 @@ +{ + "metadata": { + "total_size": 13731385408.0 + }, + "weight_map": { + "embed_out.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.embed_in.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.0.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.mlp.dense_h_to_4h.weight": 
"pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + 
"gpt_neox.layers.10.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.11.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + 
"gpt_neox.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + 
"gpt_neox.layers.13.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.bias": 
"pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + 
"gpt_neox.layers.16.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.dense.weight": 
"pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.mlp.dense_4h_to_h.weight": 
"pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + 
"gpt_neox.layers.20.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + 
"gpt_neox.layers.21.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + 
"gpt_neox.layers.23.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.23.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.23.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.23.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.24.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + 
"gpt_neox.layers.24.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + 
"gpt_neox.layers.26.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.27.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + 
"gpt_neox.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + 
"gpt_neox.layers.29.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.3.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.30.attention.bias": 
"pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.dense.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.dense.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.masked_bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.input_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + 
"gpt_neox.layers.31.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "gpt_neox.layers.4.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.dense.weight": 
"pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + 
"gpt_neox.layers.6.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.query_key_value.bias": 
"pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.dense.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.dense.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.masked_bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.input_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin", + 
"gpt_neox.layers.9.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin", + "gpt_neox.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..0204ed1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,5 @@ +{ + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "unk_token": "<|endoftext|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..79c8a4c --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cf430678137c8491ca82fb7092ee49e44ad38857fffe1e4a4a5ed860139a5b8 +size 2113738 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..af28df4 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,9 @@ +{ + "add_prefix_space": false, + "bos_token": "<|endoftext|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|endoftext|>", + "model_max_length": 2048, + "tokenizer_class": "GPTNeoXTokenizer", + "unk_token": "<|endoftext|>" +}