commit 446ae3ace0c7744117fa03b16c77ebc6191cce31 Author: ModelHub XC Date: Wed May 6 02:00:46 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: EleutherAI/annealing_baseline_ga_v3_interleaved_gclip-0.5-fp-adversarial-20251110_154724 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..f89b3f4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,52 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +model-00003-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +model-00002-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text +model-00001-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..bc5f30d --- /dev/null +++ b/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..bb28bff --- /dev/null +++ b/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "bos_token_id": 0, + "classifier_dropout": 0.1, + "dtype": "bfloat16", + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 16384, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 2048, + "model_type": "gpt_neox", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "partial_rotary_factor": 0.25, + "rope_scaling": null, + "rope_theta": 10000.0, + "rotary_emb_base": 10000.0, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "transformers_version": "4.56.2", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..f868a2d --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "eos_token_id": 2, + "transformers_version": "4.56.2" +} diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000..4b1575b --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b556012d8caf00bc7f53f3613b1b998d06ccb193154ae0e858e0684e7d193e2e +size 4976746880 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 0000000..100d1c5 --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1477477a7f594d0d80bde1038221c9d5a241f935fa75060115496a7d641871bc +size 4967384136 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new file mode 100644 index 0000000..6e57bc9 --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eac1c6897368bae4abdabae2a887d8c64aeb159e779e70493458eb1f486e7b7 +size 3768422160 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..fad208f --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,396 @@ +{ + "metadata": { + "total_parameters": 6856253440, + "total_size": 13712506880 + }, + "weight_map": { + "embed_out.weight": "model-00003-of-00003.safetensors", + "gpt_neox.embed_in.weight": "model-00001-of-00003.safetensors", + "gpt_neox.final_layer_norm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.final_layer_norm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.0.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.0.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.0.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.0.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.0.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.0.mlp.dense_4h_to_h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.0.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.0.mlp.dense_h_to_4h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.0.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.0.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.mlp.dense_4h_to_h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.mlp.dense_h_to_4h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.mlp.dense_4h_to_h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.mlp.dense_h_to_4h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.11.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.11.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.11.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.11.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.11.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.11.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.11.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.11.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.11.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.11.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.12.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.12.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.12.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.12.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.12.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.12.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.12.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.12.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.12.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.12.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.2.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.2.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.2.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.2.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.2.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.2.mlp.dense_4h_to_h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.2.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.2.mlp.dense_h_to_4h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.2.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.2.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.20.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.20.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.20.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.20.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.20.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.20.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.20.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.20.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.20.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.20.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.mlp.dense_4h_to_h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.mlp.dense_4h_to_h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.23.attention.dense.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.23.attention.dense.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.23.attention.query_key_value.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.23.attention.query_key_value.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.23.input_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.23.mlp.dense_4h_to_h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.23.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.23.mlp.dense_h_to_4h.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.23.mlp.dense_h_to_4h.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.23.post_attention_layernorm.bias": "model-00002-of-00003.safetensors", + "gpt_neox.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "gpt_neox.layers.24.attention.dense.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.24.attention.dense.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.24.attention.query_key_value.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.24.attention.query_key_value.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.24.input_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.24.mlp.dense_4h_to_h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.24.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.24.mlp.dense_h_to_4h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.24.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.24.post_attention_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.attention.dense.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.attention.dense.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.attention.query_key_value.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.attention.query_key_value.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.input_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.mlp.dense_4h_to_h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.mlp.dense_h_to_4h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.post_attention_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.attention.dense.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.attention.dense.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.attention.query_key_value.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.attention.query_key_value.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.input_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.mlp.dense_4h_to_h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.mlp.dense_h_to_4h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.post_attention_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.attention.dense.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.attention.dense.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.attention.query_key_value.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.attention.query_key_value.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.input_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.mlp.dense_4h_to_h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.mlp.dense_h_to_4h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.post_attention_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.attention.dense.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.attention.dense.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.attention.query_key_value.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.attention.query_key_value.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.input_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.mlp.dense_4h_to_h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.mlp.dense_h_to_4h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.post_attention_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.attention.dense.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.attention.dense.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.attention.query_key_value.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.attention.query_key_value.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.input_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.mlp.dense_4h_to_h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.mlp.dense_h_to_4h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.post_attention_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.3.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.3.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.3.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.3.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.3.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.3.mlp.dense_4h_to_h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.3.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.3.mlp.dense_h_to_4h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.3.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.3.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.30.attention.dense.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.30.attention.dense.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.30.attention.query_key_value.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.30.attention.query_key_value.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.30.input_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.30.mlp.dense_4h_to_h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.30.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.30.mlp.dense_h_to_4h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.30.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.30.post_attention_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.attention.dense.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.attention.dense.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.attention.query_key_value.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.attention.query_key_value.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.input_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.mlp.dense_4h_to_h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.mlp.dense_4h_to_h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.mlp.dense_h_to_4h.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.mlp.dense_h_to_4h.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.post_attention_layernorm.bias": "model-00003-of-00003.safetensors", + "gpt_neox.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "gpt_neox.layers.4.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.4.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.4.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.4.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.4.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.4.mlp.dense_4h_to_h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.4.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.4.mlp.dense_h_to_4h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.4.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.4.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.mlp.dense_4h_to_h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.mlp.dense_h_to_4h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.mlp.dense_4h_to_h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.mlp.dense_h_to_4h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.mlp.dense_4h_to_h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.mlp.dense_h_to_4h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.mlp.dense_4h_to_h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.mlp.dense_h_to_4h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.attention.dense.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.attention.dense.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.attention.query_key_value.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.attention.query_key_value.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.input_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.mlp.dense_4h_to_h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.mlp.dense_4h_to_h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.mlp.dense_h_to_4h.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.mlp.dense_h_to_4h.weight": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.post_attention_layernorm.bias": "model-00001-of-00003.safetensors", + "gpt_neox.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..156262f --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..049c291 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4c0d99d84af59e9126913fafe5210822963e9a3065ee43e6833b358b0c2f825 +size 3564303 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..2674b86 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,215 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<|padding|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "50254": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50255": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50256": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50257": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50258": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50259": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50260": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50261": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50262": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50263": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50264": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50265": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50266": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50267": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50268": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50269": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50270": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50271": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50272": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50273": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50274": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50275": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "50276": { + "content": " ", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|endoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|endoftext|>", + "tokenizer_class": "GPTNeoXTokenizer", + "unk_token": "<|endoftext|>" +}