初始化项目,由ModelHub XC社区提供模型
Model: OEvortex/vortex-3b Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
vortex[[:space:]]3b.png filter=lfs diff=lfs merge=lfs -text
|
||||
50
LICENSE.md
Normal file
50
LICENSE.md
Normal file
@@ -0,0 +1,50 @@
|
||||
************************************************
|
||||
**** HelpingAI License ****
|
||||
************************************************
|
||||
|
||||
Version 2.0
|
||||
|
||||
Developed by Abhay Koul
|
||||
|
||||
### Preamble
|
||||
|
||||
The HelpingAI License governs the use of HelpingAI's digital assets, including but not limited to software, scripts, datasets, documents, images, audio recordings, videos. The HelpingAI License aims to provide clear, comprehensive terms for accessing, modifying, and sharing resources, while promoting ethical development practices.
|
||||
|
||||
### Grant of Rights
|
||||
|
||||
Under the HelpingAI License, HelpingAI grants you the rights to copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Content, provided you comply with the terms and conditions outlined in this document.
|
||||
|
||||
### Terms and Conditions
|
||||
|
||||
To exercise the rights granted in the previous section, you must adhere to the following terms and conditions:
|
||||
|
||||
2.1. **Redistribution of Source Code.**
|
||||
If you redistribute the Source Code, you must include the complete HelpingAI License with your distribution. You must also add clear notifications in all modified files stating:
|
||||
|
||||
> "This Work is released under the HelpingAI License v2.0."
|
||||
|
||||
2.2. **Distribution in Binary Form.**
|
||||
If you distribute Binaries derived from the Source Code, you must include the following statement in your distribution:
|
||||
|
||||
> "This Work is based on the HelpingAI Licensed Work, under the HelpingAI License v2.0."
|
||||
|
||||
2.3. **Notification of Changes.**
|
||||
You must clearly indicate any modifications you make to the Source Code or Documentation, including detailed comments about the nature and extent of the changes. Include the date and originator of the modifications.
|
||||
|
||||
2.4. **Branding Attribution.**
|
||||
You must not remove or alter any HelpingAI branding, logos, or notices included in the Content without explicit prior consent from HelpingAI.
|
||||
|
||||
2.5. **Disclaimer of Warranty.**
|
||||
The Content is provided "AS IS," without any implied warranties, including but not limited to warranties of merchantability, fitness for a particular purpose, and non-infringement.
|
||||
|
||||
2.6. **Limitation of Liability.**
|
||||
To the maximum extent permitted by law, neither HelpingAI nor any contributor shall be liable for any loss, personal injury, property damage, or any indirect, special, incidental, or consequential damages arising from or related to the use of the Content.
|
||||
|
||||
2.7. **Governing Law.**
|
||||
This HelpingAI License shall be governed and construed in accordance with the laws of the jurisdiction where HelpingAI primarily operates.
|
||||
|
||||
### Definitions
|
||||
|
||||
3.1. **"Source Code"** refers to the preferred form for making modifications to the Content, typically represented by human-readable programming languages, scripts, or documentation formats.
|
||||
|
||||
3.2. **"Binaries"** refers to compiled forms of the Source Code, such as executables, libraries, or similar artifacts produced from the Source Code.
|
||||
147
README.md
Normal file
147
README.md
Normal file
@@ -0,0 +1,147 @@
|
||||
---
|
||||
language:
|
||||
- en
|
||||
license: other
|
||||
tags:
|
||||
- HelpingAI
|
||||
- vortex
|
||||
datasets:
|
||||
- OEvortex/Vortex-50k
|
||||
license_name: helpingai
|
||||
license_link: LICENSE.md
|
||||
pipeline_tag: text-generation
|
||||
model-index:
|
||||
- name: vortex-3b
|
||||
results:
|
||||
- task:
|
||||
type: text-generation
|
||||
name: Text Generation
|
||||
dataset:
|
||||
name: AI2 Reasoning Challenge (25-Shot)
|
||||
type: ai2_arc
|
||||
config: ARC-Challenge
|
||||
split: test
|
||||
args:
|
||||
num_few_shot: 25
|
||||
metrics:
|
||||
- type: acc_norm
|
||||
value: 31.91
|
||||
name: normalized accuracy
|
||||
source:
|
||||
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=OEvortex/vortex-3b
|
||||
name: Open LLM Leaderboard
|
||||
- task:
|
||||
type: text-generation
|
||||
name: Text Generation
|
||||
dataset:
|
||||
name: HellaSwag (10-Shot)
|
||||
type: hellaswag
|
||||
split: validation
|
||||
args:
|
||||
num_few_shot: 10
|
||||
metrics:
|
||||
- type: acc_norm
|
||||
value: 56.89
|
||||
name: normalized accuracy
|
||||
source:
|
||||
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=OEvortex/vortex-3b
|
||||
name: Open LLM Leaderboard
|
||||
- task:
|
||||
type: text-generation
|
||||
name: Text Generation
|
||||
dataset:
|
||||
name: MMLU (5-Shot)
|
||||
type: cais/mmlu
|
||||
config: all
|
||||
split: test
|
||||
args:
|
||||
num_few_shot: 5
|
||||
metrics:
|
||||
- type: acc
|
||||
value: 27.32
|
||||
name: accuracy
|
||||
source:
|
||||
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=OEvortex/vortex-3b
|
||||
name: Open LLM Leaderboard
|
||||
- task:
|
||||
type: text-generation
|
||||
name: Text Generation
|
||||
dataset:
|
||||
name: TruthfulQA (0-shot)
|
||||
type: truthful_qa
|
||||
config: multiple_choice
|
||||
split: validation
|
||||
args:
|
||||
num_few_shot: 0
|
||||
metrics:
|
||||
- type: mc2
|
||||
value: 37.39
|
||||
source:
|
||||
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=OEvortex/vortex-3b
|
||||
name: Open LLM Leaderboard
|
||||
- task:
|
||||
type: text-generation
|
||||
name: Text Generation
|
||||
dataset:
|
||||
name: Winogrande (5-shot)
|
||||
type: winogrande
|
||||
config: winogrande_xl
|
||||
split: validation
|
||||
args:
|
||||
num_few_shot: 5
|
||||
metrics:
|
||||
- type: acc
|
||||
value: 60.14
|
||||
name: accuracy
|
||||
source:
|
||||
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=OEvortex/vortex-3b
|
||||
name: Open LLM Leaderboard
|
||||
- task:
|
||||
type: text-generation
|
||||
name: Text Generation
|
||||
dataset:
|
||||
name: GSM8k (5-shot)
|
||||
type: gsm8k
|
||||
config: main
|
||||
split: test
|
||||
args:
|
||||
num_few_shot: 5
|
||||
metrics:
|
||||
- type: acc
|
||||
value: 0.91
|
||||
name: accuracy
|
||||
source:
|
||||
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=OEvortex/vortex-3b
|
||||
name: Open LLM Leaderboard
|
||||
---
|
||||

|
||||
**Model Overview**
|
||||
|
||||
vortex-3b is a 2.78 billion parameter causal language model created by OEvortex. It is derived from EleutherAI's Pythia-2.8b and fine-tuned on the Vortex-50k dataset.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
# Initialize the pipeline
|
||||
pipe = pipeline("text-generation", model="OEvortex/vortex-3b")
|
||||
|
||||
# Use the pipeline
|
||||
text = "Once upon a time"
|
||||
generated_text = pipe(text, max_length=100, do_sample=True)[0]['generated_text']
|
||||
|
||||
print(generated_text)
|
||||
```
|
||||
# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
|
||||
Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_OEvortex__vortex-3b)
|
||||
|
||||
|
||||
| Metric | vortex 3b | vortex 3b-v2 | dolly-v2-3b | pythia-2.8b-deduped |
|
||||
|---------|----------:|-------------:|------------------:|----------------------------------:|
|
||||
| Avg. | 35.76 | 37.46 | 25.26 | 36.72 |
|
||||
| AI2 Reasoning Challenge (25-Shot) | 31.91 | 39.68 | 22.83 | 36.26 |
|
||||
| HellaSwag (10-Shot) | 56.89 | 65.04 | 26.55 | 60.66 |
|
||||
| MMLU (5-Shot) | 27.32 | 25.09 | 24.7 | 26.78 |
|
||||
| TruthfulQA (0-shot) | 37.39 | 33.80 | 0 | 35.56 |
|
||||
| Winogrande (5-shot) | 60.14 | 59.12 | 59.43 | 60.22 |
|
||||
| GSM8k (5-shot) | 0.91 | 2.05 | 1.86 | 0.83 |
|
||||
|
||||
30
config.json
Normal file
30
config.json
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"_name_or_path": "EleutherAI/pythia-2.8b",
|
||||
"architectures": [
|
||||
"GPTNeoXForCausalLM"
|
||||
],
|
||||
"attention_bias": true,
|
||||
"attention_dropout": 0.0,
|
||||
"bos_token_id": 0,
|
||||
"classifier_dropout": 0.1,
|
||||
"eos_token_id": 0,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout": 0.0,
|
||||
"hidden_size": 2560,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 10240,
|
||||
"layer_norm_eps": 1e-05,
|
||||
"max_position_embeddings": 2048,
|
||||
"model_type": "gpt_neox",
|
||||
"num_attention_heads": 32,
|
||||
"num_hidden_layers": 32,
|
||||
"rope_scaling": null,
|
||||
"rotary_emb_base": 10000,
|
||||
"rotary_pct": 0.25,
|
||||
"tie_word_embeddings": false,
|
||||
"torch_dtype": "float16",
|
||||
"transformers_version": "4.38.0.dev0",
|
||||
"use_cache": true,
|
||||
"use_parallel_residual": true,
|
||||
"vocab_size": 50304
|
||||
}
|
||||
7
generation_config.json
Normal file
7
generation_config.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"_from_model_config": true,
|
||||
"bos_token_id": 0,
|
||||
"do_sample": true,
|
||||
"eos_token_id": 0,
|
||||
"transformers_version": "4.38.0.dev0"
|
||||
}
|
||||
212
instruct_pipeline.py
Normal file
212
instruct_pipeline.py
Normal file
@@ -0,0 +1,212 @@
|
||||
import logging
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from transformers import Pipeline, PreTrainedTokenizer
|
||||
|
||||
from transformers.utils import is_tf_available
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Section markers that delimit the instruction, the response, and the end of a response in a prompt.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"

# Introductory sentence placed at the top of every prompt.
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

# Prompt template used when generating with an already trained model. The only placeholder left unfilled is
# "{instruction}"; the template ends right after the response key so that the model's completion is the response.
PROMPT_FOR_GENERATION_FORMAT = (
    f"{INTRO_BLURB}\n"
    "\n"
    f"{INSTRUCTION_KEY}\n"
    "{instruction}\n"
    "\n"
    f"{RESPONSE_KEY}\n"
)
|
||||
|
||||
|
||||
def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
    """Gets the token ID for a given string that has been added to the tokenizer as a special token.

    When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
    treated specially and converted to a single, new token.  This retrieves the token ID each of these keys map to.

    Args:
        tokenizer (PreTrainedTokenizer): the tokenizer
        key (str): the key to convert to a single token

    Raises:
        ValueError: if the key does not encode to exactly one token ID

    Returns:
        int: the token ID for the given key
    """
    token_ids = tokenizer.encode(key)
    if len(token_ids) > 1:
        raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
    if len(token_ids) == 0:
        # Raise the same exception type callers already handle instead of an IndexError from token_ids[0].
        raise ValueError(f"Expected a single token for '{key}' but the tokenizer produced none")
    return token_ids[0]
|
||||
|
||||
|
||||
class InstructionTextGenerationPipeline(Pipeline):
    """Text-generation pipeline that wraps an instruction in the instruction/response prompt format.

    The instruction is formatted with PROMPT_FOR_GENERATION_FORMAT, passed to ``self.model.generate``, and the text
    found between the response key and the end key is returned as ``generated_text``.
    """

    def __init__(
        self, *args, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs
    ):
        """Initialize the pipeline

        Args:
            do_sample (bool, optional): Whether or not to use sampling. Defaults to True.
            max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 256.
            top_p (float, optional): If set to float < 1, only the smallest set of most probable tokens with
                probabilities that add up to top_p or higher are kept for generation. Defaults to 0.92.
            top_k (int, optional): The number of highest probability vocabulary tokens to keep for top-k-filtering.
                Defaults to 0.
        """
        super().__init__(*args, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k,
                         **kwargs)

    def _sanitize_parameters(self,
                             return_full_text: bool = None,
                             **generate_kwargs):
        """Split call-time keyword arguments into preprocess, forward and postprocess parameters.

        Args:
            return_full_text (bool, optional): forwarded to ``postprocess`` when it is not None.
            **generate_kwargs: forwarded unchanged to ``_forward`` (and from there to ``generate``), with
                ``eos_token_id`` set to the end key's token ID when both special tokens can be resolved.

        Returns:
            tuple: ``(preprocess_params, forward_params, postprocess_params)`` dictionaries.
        """
        preprocess_params = {}

        # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
        # append a newline to yield a single token.  find whatever token is configured for the response key.
        tokenizer_response_key = next(
            (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
        )

        response_key_token_id = None
        end_key_token_id = None
        if tokenizer_response_key:
            try:
                response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
                end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)

                # Ensure generation stops once it generates "### End"
                generate_kwargs["eos_token_id"] = end_key_token_id
            except ValueError:
                # Best effort: a key that is not a single token means postprocess falls back to regex matching.
                pass

        forward_params = generate_kwargs
        postprocess_params = {
            "response_key_token_id": response_key_token_id,
            "end_key_token_id": end_key_token_id
        }

        if return_full_text is not None:
            postprocess_params["return_full_text"] = return_full_text

        return preprocess_params, forward_params, postprocess_params

    def preprocess(self, instruction_text, **generate_kwargs):
        """Format the instruction into the generation prompt and tokenize it.

        Args:
            instruction_text: the instruction to insert into PROMPT_FOR_GENERATION_FORMAT.

        Returns:
            The tokenizer output with ``prompt_text`` and ``instruction_text`` entries added.
        """
        prompt_text = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction_text)
        inputs = self.tokenizer(
            prompt_text,
            return_tensors="pt",
        )
        inputs["prompt_text"] = prompt_text
        inputs["instruction_text"] = instruction_text
        return inputs

    def _forward(self, model_inputs, **generate_kwargs):
        """Run ``self.model.generate`` on the tokenized prompt.

        Args:
            model_inputs: the mapping produced by ``preprocess``; must contain ``input_ids`` and
                ``instruction_text`` and may contain ``attention_mask``.
            **generate_kwargs: extra keyword arguments passed through to ``generate``.

        Returns:
            dict: ``generated_sequence`` (reshaped to group outputs per input when the framework is "pt" or
            "tf"), ``input_ids`` and ``instruction_text``.
        """
        input_ids = model_inputs["input_ids"]
        attention_mask = model_inputs.get("attention_mask", None)

        if input_ids.shape[1] == 0:
            input_ids = None
            attention_mask = None
            in_b = 1
        else:
            in_b = input_ids.shape[0]

        generated_sequence = self.model.generate(
            # input_ids is None for an empty prompt, so guard the device move just like attention_mask.
            input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
            attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
            pad_token_id=self.tokenizer.pad_token_id,
            **generate_kwargs,
        )

        out_b = generated_sequence.shape[0]
        if self.framework == "pt":
            generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
        elif self.framework == "tf":
            generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))

        instruction_text = model_inputs.pop("instruction_text")
        return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}

    def postprocess(self, model_outputs, response_key_token_id, end_key_token_id, return_full_text: bool = False):
        """Extract the response text from each generated sequence.

        Args:
            model_outputs: the dict returned by ``_forward``.
            response_key_token_id: token ID of the response key, or None if it could not be resolved.
            end_key_token_id: token ID of the end key, or None if it could not be resolved.
            return_full_text (bool, optional): if True, prefix the response with the original instruction.

        Returns:
            list: one ``{"generated_text": ...}`` record per generated sequence; the value is None when no
            response could be identified and ``return_full_text`` is False.
        """
        generated_sequence = model_outputs["generated_sequence"][0]
        instruction_text = model_outputs["instruction_text"]

        generated_sequence: List[List[int]] = generated_sequence.numpy().tolist()
        records = []
        for sequence in generated_sequence:

            # The response will be set to this variable if we can identify it.
            decoded = None

            # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
            # Compare against None explicitly so that a token ID of 0 is not mistaken for "missing".
            if response_key_token_id is not None and end_key_token_id is not None:
                # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
                # prompt, we should definitely find it.  We will return the tokens found after this token.
                try:
                    response_pos = sequence.index(response_key_token_id)
                except ValueError:
                    logger.warning(f"Could not find response key {response_key_token_id} in: {sequence}")
                    response_pos = None

                # Position 0 is a valid match, so test for None rather than truthiness.
                if response_pos is not None:
                    # Next find where "### End" is located.  The model has been trained to end its responses with this
                    # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
                    # this token, as the response could be truncated.  If we don't find it then just return everything
                    # to the end.  Note that even though we set eos_token_id, we still see the this token at the end.
                    try:
                        end_pos = sequence.index(end_key_token_id)
                    except ValueError:
                        end_pos = None

                    decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()

            if not decoded:
                # Otherwise we'll decode everything and use a regex to find the response and end.

                fully_decoded = self.tokenizer.decode(sequence)

                # The response appears after "### Response:".  The model has been trained to append "### End" at the
                # end.
                m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)

                if m:
                    decoded = m.group(1).strip()
                else:
                    # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
                    # return everything after "### Response:".
                    m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
                    if m:
                        decoded = m.group(1).strip()
                    else:
                        logger.warning(f"Failed to find response in:\n{fully_decoded}")

            # If the full text is requested, then append the decoded text to the original instruction.
            # This technically isn't the full text, as we format the instruction in the prompt the model has been
            # trained on, but to the client it will appear to be the full text.
            if return_full_text:
                # Avoid emitting the literal text "None" when no response could be identified.
                decoded = f"{instruction_text}\n{decoded}" if decoded is not None else instruction_text

            rec = {"generated_text": decoded}

            records.append(rec)

        return records
|
||||
3
model-00001-of-00002.safetensors
Normal file
3
model-00001-of-00002.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:57963228eb08d224a2581117cc7768cf5a8764ef7a2a4a764a3d8502b4e74eed
|
||||
size 4978208880
|
||||
3
model-00002-of-00002.safetensors
Normal file
3
model-00002-of-00002.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ac93dcc7420c4ac8151e768373baacab9e998ed885205582cf3706afe3351345
|
||||
size 572254832
|
||||
395
model.safetensors.index.json
Normal file
395
model.safetensors.index.json
Normal file
@@ -0,0 +1,395 @@
|
||||
{
|
||||
"metadata": {
|
||||
"total_size": 5550417920
|
||||
},
|
||||
"weight_map": {
|
||||
"embed_out.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.embed_in.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.final_layer_norm.bias": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.final_layer_norm.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.attention.dense.bias": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.attention.dense.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.attention.query_key_value.bias": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.attention.query_key_value.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.mlp.dense_4h_to_h.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.mlp.dense_h_to_4h.bias": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.mlp.dense_h_to_4h.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.30.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.attention.dense.bias": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.attention.dense.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.attention.query_key_value.bias": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.attention.query_key_value.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.input_layernorm.bias": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.mlp.dense_4h_to_h.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.mlp.dense_h_to_4h.bias": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.mlp.dense_h_to_4h.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.post_attention_layernorm.bias": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.attention.dense.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.attention.dense.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.input_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
|
||||
"gpt_neox.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors"
|
||||
}
|
||||
}
|
||||
3
pytorch_model-00001-of-00002.bin
Normal file
3
pytorch_model-00001-of-00002.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:79523e43e43d59d28d10b1dba5f4f9eaff701495cd4d805f58784e7cba6bfdab
|
||||
size 4978295662
|
||||
3
pytorch_model-00002-of-00002.bin
Normal file
3
pytorch_model-00002-of-00002.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c23c5cbaaab95f84b6bc865040dcb3f5c5e6a6f705973199f9721e1f8a18046d
|
||||
size 572260205
|
||||
395
pytorch_model.bin.index.json
Normal file
395
pytorch_model.bin.index.json
Normal file
@@ -0,0 +1,395 @@
|
||||
{
|
||||
"metadata": {
|
||||
"total_size": 5550417920
|
||||
},
|
||||
"weight_map": {
|
||||
"embed_out.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.embed_in.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.0.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.0.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.0.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.0.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.0.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.0.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.0.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.0.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.0.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.0.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.23.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.24.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.25.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.26.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.27.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.28.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.29.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.30.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.30.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.30.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.30.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.30.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.30.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.30.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.30.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.30.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.30.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.30.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.30.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.31.attention.dense.bias": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.31.attention.dense.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.31.attention.query_key_value.bias": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.31.attention.query_key_value.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.31.input_layernorm.bias": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.31.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.31.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.31.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.31.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.31.post_attention_layernorm.bias": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
|
||||
"gpt_neox.layers.4.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.4.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.4.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.4.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.4.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.4.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.4.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.4.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.4.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.4.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.attention.dense.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.attention.dense.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.attention.query_key_value.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.attention.query_key_value.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.input_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.post_attention_layernorm.bias": "pytorch_model-00001-of-00002.bin",
|
||||
"gpt_neox.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin"
|
||||
}
|
||||
}
|
||||
11
special_tokens_map.json
Normal file
11
special_tokens_map.json
Normal file
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"additional_special_tokens": [
|
||||
"### End",
|
||||
"### Instruction:",
|
||||
"### Response:"
|
||||
],
|
||||
"bos_token": "<|endoftext|>",
|
||||
"eos_token": "<|endoftext|>",
|
||||
"pad_token": "<|endoftext|>",
|
||||
"unk_token": "<|endoftext|>"
|
||||
}
|
||||
100538
tokenizer.json
Normal file
100538
tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
10
tokenizer_config.json
Normal file
10
tokenizer_config.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"add_prefix_space": false,
|
||||
"bos_token": "<|endoftext|>",
|
||||
"eos_token": "<|endoftext|>",
|
||||
"model_max_length": 1000000000000000019884624838656,
|
||||
"name_or_path": "EleutherAI/pythia-2.8b",
|
||||
"special_tokens_map_file": "/admin/home-hailey/.cache/huggingface/hub/models--EleutherAI--gpt-neox-20b/snapshots/4e49eadb5d14bd22f314ec3f45b69a87b88c7691/special_tokens_map.json",
|
||||
"tokenizer_class": "GPTNeoXTokenizer",
|
||||
"unk_token": "<|endoftext|>"
|
||||
}
|
||||
3
vortex 3b.png
Normal file
3
vortex 3b.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:04b4c8104a551ec9754fd1169842ee67c06ced0fb16569b5fca804c2068578e9
|
||||
size 1135879
|
||||
Reference in New Issue
Block a user