commit 79e7b5cdffacfb4dfd885f8a881f618da1c586c0
Author: ModelHub XC <noreply@modelhub.org.cn>
Date:   Wed Jun 10 15:15:16 2026 +0800

    初始化项目，由ModelHub XC社区提供模型
    
    Model: WestCode1357/gpt-sw3-6.7b
    Source: Original Platform

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..c7d9f33
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,34 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/GPT-SW3_MODEL_GUIDELINES b/GPT-SW3_MODEL_GUIDELINES
new file mode 100644
index 0000000..62f8c97
--- /dev/null
+++ b/GPT-SW3_MODEL_GUIDELINES
@@ -0,0 +1,47 @@
+GPT-SW3 Model Guidelines and Use Policies
+ 
+Introduction
+
+To ensure the responsible and sustainable use of the GPT-SW3 Model, we have established these guidelines and use policies. Every user must adhere to them to guarantee the shared objective of open and fair AI application.
+
+1. Acceptable Use
+
+Users must ensure the application of the AI model respects the principles of fairness, inclusivity, and responsible standards.
+
+Prohibited Activities:
+
+Any activities not in line with Lawful AI or Lawful AI-use as specified from time to time in the Ethics Guidelines for Trustworthy Artificial Intelligence created by the High-Level Expert Group on AI[1].
+
+Regardless, the following shall never be other than prohibited activities: engaging in illegal or harmful activities, including terrorism, child exploitation, and human trafficking; discrimination, harassment, or harm based on race, gender, ethnicity, or any other protected category; unauthorized practice of regulated professions, such as medical, financial, or legal, without proper qualifications; infringing on third-party rights; generating malicious code; or harming infrastructure.
+
+2. Redistribution & Attribution
+
+Users can redistribute the Model or derivatives but must always credit the Model. When redistributing, these guidelines shall be shared and measures shall be taken to ensure the next user abides by them.
+
+3. Feedback and Continuous Improvement
+
+Users are encouraged to provide feedback on the Model's performance and any potential ethical and/or harmful issues. Users are encouraged to suggest enhancements that promote inclusivity, fairness, transparency, and utility.
+
+4. Handling Misinformation and Misrepresentation
+
+Do not use the model to promote or spread misinformation. Never represent AI-generated outputs as human-generated content.
+
+5. Personal and Sensitive Data
+
+Never use the Model to process or generate personal or sensitive information without explicit consent and legal rights. Always respect privacy rights and data protection laws when using the model.
+
+6. Engagement with the AI Community
+
+Users are encouraged to collaborate with peers, sharing knowledge and best practices related to the Model. Consider joining or initiating forums, workshops, or discussions to promote the responsible use of the Model.
+
+7. Monitoring and Compliance
+
+Regularly review these guidelines and policies to ensure continuous compliance. Understand that non-compliance might lead to the revocation of the license.
+
+8. Dispute Resolution
+
+Approach Lindholmen Science Park directly in case of any conflicts or concerns related to the Model. Both parties should prioritize amicable resolution methods before considering legal action. Every user of the Model, whether an individual, organization, or developer, has the responsibility to prioritize the responsible use of AI. By adhering to these guidelines and use policies, we can collaboratively ensure that AI serves as a tool for progress, fairness, and societal enhancement.
+
+
+________________
+[1] https://digital-strategy.ec.europa.eu/en/policies/expert-group-ai
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..a315a8a
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,172 @@
+﻿AI Sweden's LLM AI Model License Agreement
+
+1. Introduction
+
+This license governs the use of GPT-SW3 (the "Model"). By accessing or using the Model, you agree to adhere to these terms. This license aims to promote the open and responsible use of our AI while ensuring the security and respect of users' rights.
+
+2. Grant of License
+
+Lindholmen Science Park hereby grants you, under the terms and conditions stipulated in the AI Sweden’s LLM AI Model License Agreement, (the “Agreement”), a non-exclusive, transferable, worldwide, and royalty-free license to:
+* Use: Access and utilize the Model in accordance with the set limitations in section 7. Acceptable Use Policy and within the stipulated guidelines.
+* Modify: Adjust, adapt, and build upon the Model to better suit your individual or organizational needs, while still adhering to the principles and guidelines established herein.
+* Distribute: Share the Model and your derivative works with others, provided such distribution is in compliance with the terms of this license.
+
+Nature of the Grant
+* Non-exclusive: The license does not confer any form of exclusivity; Lindholmen Science Park reserves the right to grant similar licenses to other individuals or entities.
+* Transferable: You are permitted to sub-license, rent, lease, assign, or otherwise transfer your rights under this license to any third party.
+* Worldwide: This license permits the use, modification, and distribution of the Model anywhere globally, encouraging a collaborative, international approach to furthering the Model's development and application.
+* Royalty-free: You are not required to pay any form of royalties to Lindholmen Science Park for the rights granted under this license, fostering open and unrestricted access to the Model.
+
+Obligations of the Licensee
+* Compliance: You must comply with all the terms and conditions set forth in this license to maintain the rights granted herein.
+
+Reservation of Rights
+* Intellectual Property: This license does not transfer any ownership of the intellectual property associated with the Model. Lindholmen Science Park retains all intellectual property rights not expressly granted in this license.
+* Amendments: Lindholmen Science Park reserves the right to alter the terms of this license in the future, in response to changing technological and societal landscapes, ensuring a dynamic and adaptive licensing approach.
+
+3. Scope of Use
+
+Non-commercial Use
+* Education and Research: Licensees are encouraged to use the Model for educational and research purposes, fostering knowledge sharing and innovation.
+* Public Service: Licensees can leverage the Model to build applications and services aimed at community welfare and public good.
+
+Commercial Use
+* Business Ventures: Licensees are permitted to use the Model to create products, services, or applications that seek to generate profit, encouraging entrepreneurial initiatives.
+* Corporate Research: Licensees may employ the Model for corporate research, enhancing business strategies and solutions.
+
+4. Attribution Requirements
+
+Non-commercial Use
+* Acknowledgement: While it is encouraged, non-commercial users are not strictly required to attribute the Model; however, voluntary attribution is appreciated to acknowledge the efforts of the original creators.
+
+Commercial Use
+* Mandatory Attribution: Commercial users are mandated to provide clear and conspicuous attribution to the original creators of the Model, promoting transparency and credit where it's due. 
+
+Format of Attribution: The original creators are AI Sweden, RISE and WASP. The attribution should be presented in a manner that is reasonable and customarily used in commercial products or services, which could include but is not limited to:
+* Documentation: Including attribution in the user manuals, installation guides, or on the official website where the product or service is detailed.
+* Application Interface: Incorporating attribution in a dedicated "About" or "Credits" section within the application or service interface.
+* Marketing Materials: Featuring attribution in marketing and promotional materials, highlighting the utilization of the Model in the commercial offering.
+
+5. Redistribution
+
+* Redistributor: A person or entity that shares the Model with a third party, either in its original form or with modifications.
+
+Authorization for Redistribution
+* Original Model: Licensees are permitted to redistribute the Model in its original form, provided that they comply with the terms and conditions outlined in this license.
+* Modified Model: Licensees are permitted to modify and redistribute the Model, inclusive of derivative works created through the modification of the original Model, aligning with the stipulations delineated in this license.
+
+Adherence to the License
+* Binding Effect on Derivative Works: Any derivative work based on the Model shall be governed by the terms and conditions of this license, ensuring a consistent ethical and legal framework for all adaptations of the Model.
+
+Transparency and Documentation
+* Modifications: Redistributors are required to clearly indicate the nature and extent of modifications undertaken, fostering transparency and informed use.
+
+No Misrepresentation
+* Original Endorsement: Redistributors must not convey or imply any endorsement by the original creators of the Model for the redistributed or derivative work, avoiding any misrepresentation or undue association.
+
+Safety and Ethical Considerations
+* Obligation to Maintain Standards: Redistributors must ensure that the Model, whether in its original or modified form, maintains a standard of safety, ethical utility, and respect for users’ rights, consistent with the objectives and principles embodied in this license.
+
+Regulatory Compliance
+* Legal Adherence: Redistributors are mandated to ensure compliance with applicable legal and regulatory norms, including data protection laws, while redistributing the Model or any derivative works.
+
+Feedback and Community Engagement
+* Feedback Channels: Redistributors are encouraged to establish feedback channels for users and stakeholders, facilitating a collaborative approach towards the continual improvement of the Model and derivative works.
+
+6. Third-party Integrations
+
+Compliance with License Terms: Third parties who integrate the Model into their own systems, products, or services ("Integrated Products") must ensure that such Integrated Products are in full compliance with the terms and conditions outlined in this license. This entails a commitment to uphold the ethical, responsible and lawful utilization of the Model.
+ 
+Notification and Transparency: Third parties are required to clearly notify users of Integrated Products about the incorporation of the Model and ensure that the terms of this license are made accessible to the users to foster transparency and informed usage.
+ 
+Liability: Third parties are responsible for any liabilities arising from their non-compliance with the terms of this license in relation to the use of the Model in Integrated Products. They are required to indemnify Lindholmen Science Park against claims, damages, and losses arising out of such non-compliance.
+ 
+Security and Privacy: Third parties must undertake necessary measures to ensure the security and privacy of the users’ data while using Integrated Products, including adherence to applicable data protection laws and regulations.
+ 
+Intellectual Property Rights: The integration of the Model into third-party systems must respect and preserve the intellectual property rights of Lindholmen Science Park concerning the Model, including trademarks, copyrights, and patents, as applicable.
+  
+Termination: In case of violation of any terms of this license by third parties, Lindholmen Science Park reserves the right to terminate the license granted to such third parties, requiring them to cease the use and integration of the Model in their systems immediately.
+ 
+Feedback and Cooperation: Third parties are encouraged to maintain a collaborative relationship with Lindholmen Science Park, providing feedback on the Model’s performance in Integrated Products and cooperating in efforts to enhance the Model's functionalities and rectify any issues.
+
+7. Acceptable Use Policy
+
+Definition of Guidelines and Use Policies: The "GPT-SW3 Model Guidelines and Use Policies" refer to the structured set of principles, rules, and parameters established by Lindholmen Science Park which govern the acceptable use of the Model. These policies are devised to foster safe, ethical, responsible and sustainable use of the Model that guarantees the shared objective of open and fair AI application.
+
+ 
+Compliance Obligation: By accessing or utilizing the Model, you expressly agree to adhere to the GPT-SW3 Guidelines and Use Policies as laid down by Lindholmen Science Park. This entails using the Model responsibly, and lawfully, in accordance with the stipulated guidelines.
+ 
+Updates and Modifications: Lindholmen Science Park reserves the right to periodically review, amend, or update the GPT-SW3 Guidelines and Use Policies to adapt to technological advancements, legal developments, or societal changes. Users are responsible for keeping themselves abreast of the latest updates to ensure ongoing compliance.
+ 
+Feedback and Reporting Mechanism: Users are encouraged to actively engage with Lindholmen Science Park through designated channels to report any misuse, violations, or to suggest improvements regarding the Model's functioning, thus fostering a collaborative environment for the enhancement of the Model.
+ 
+Consequences of Violation: Non-compliance with the GPT-SW3 Guidelines and Use Policies may result in punitive actions, including but not limited to, the temporary or permanent revocation of access rights to the Model, legal actions, and/or public disclosure of the violation, as deemed appropriate by Lindholmen Science Park.
+ 
+Indemnification: Users agree to indemnify and hold harmless Lindholmen Science Park against any claims, damages, or liabilities arising out of the violation of the GPT-SW3 Guidelines and Use Policies.
+
+8. Feedback
+
+Commitment to Continuous Improvement: Lindholmen Science Park values the insights and perspectives of our users. We are committed to continuous improvement and encourage users to actively participate in the evolution of the Model through constructive feedback, reporting of issues, and suggestions for enhancements.
+ 
+Feedback Channels: Users may provide their feedback through the following channels:
+ 
+Email: Reach us at nlu@ai.se
+Community Forums: Join discussions and share your insights on our community forums at AI Nordics Discord - http://discord.gg/RgKVztg3xU
+
+Anonymous Feedback: We welcome anonymous feedback to allow users to share their perspectives freely. However, we encourage users to provide contact information to facilitate follow-up discussions and updates on the addressed issues.
+ 
+Responsiveness: While we endeavor to review and consider all feedback received, we cannot guarantee a response to every submission. We appreciate your understanding and patience as we work diligently to enhance the Model for all users.
+ 
+Intellectual Property: By submitting feedback, you grant Lindholmen Science Park a worldwide, non-exclusive, royalty-free, perpetual, irrevocable license to use, reproduce, modify, adapt, publish, distribute, and incorporate such feedback into our work and research, without acknowledgement or compensation to you.
+
+9. Warranty and Liability
+
+The model is provided "as is," and Lindholmen Science Park disclaims all warranties, express or implied.
+No Warranty: THE MODEL IS PROVIDED "AS IS," WITHOUT WARRANTY OF ANY KIND, EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO WARRANTIES OF PERFORMANCE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, ACCURACY, OMISSIONS, COMPLETENESS, CURRENTNESS, AND DELAYS.
+ 
+Cap on Liability: TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, LINDHOLMEN SCIENCE PARK'S CUMULATIVE LIABILITY TO YOU, FOR ANY AND ALL CLAIMS RELATED TO THE MODEL, SHALL NOT EXCEED AN AGGREGATE AMOUNT EQUAL TO THE LESSER OF (i) € 500 OR (ii) THE TOTAL AMOUNTS YOU PAID TO LINDHOLMEN SCIENCE PARK IN THE TWELVE (12) MONTHS IMMEDIATELY PRECEDING THE INCIDENT GIVING RISE TO THE LIABILITY.
+ 
+Exclusion of Certain Liabilities: LINDHOLMEN SCIENCE PARK SHALL NOT BE LIABLE FOR ANY INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE, CONSEQUENTIAL, OR ANY OTHER FORM OF DAMAGES, INCLUDING, BUT NOT LIMITED TO, LOSS OF PROFITS, DATA, GOODWILL, OR ANY OTHER INTANGIBLE LOSSES, ARISING OUT OF OR RELATED TO THIS AGREEMENT, THE USE OR THE INABILITY TO USE THE MODEL, EVEN IF LINDHOLMEN SCIENCE PARK HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+ 
+Essential Purpose: THE LIMITATIONS SPECIFIED IN THIS SECTION SHALL SURVIVE AND APPLY EVEN IF ANY LIMITED REMEDY SPECIFIED IN THIS AGREEMENT IS FOUND TO HAVE FAILED ITS ESSENTIAL PURPOSE.
+ 
+Basis of the Bargain: THE LIMITATIONS OF DAMAGES SET FORTH ABOVE ARE FUNDAMENTAL ELEMENTS OF THE BASIS OF THE BARGAIN BETWEEN LINDHOLMEN SCIENCE PARK AND YOU.
+ 
+Local Laws and Regulations: Nothing in this clause aims to limit or exclude any liability that cannot be limited or excluded under applicable laws. Users are encouraged to be aware of and adhere to local laws and regulations governing the use of AI models and services.
+
+10. Termination Clauses
+
+* Model Materials: Refers to the Model and all associated documentation, guidelines, and policies provided by Lindholmen Science Park.
+Commencement and Duration
+
+* This Agreement shall become effective upon your acceptance or when you commence access to or use of the Model Materials (“Effective Date”) and shall remain in full force and effect unless earlier terminated in accordance with this Agreement.
+Termination by Organization
+
+* Breach: Lindholmen Science Park reserves the right to terminate this Agreement unilaterally if you are found to be in breach of any term or condition stipulated in this Agreement.
+
+* Safety and Compliance: Lindholmen Science Park further reserves the right to terminate this Agreement to comply with any applicable law, regulation, or guideline, or to preserve the safety, integrity, and lawful operation of the Model and associated resources.
+Termination by You
+
+* You have the right to terminate this Agreement at any time by ceasing all use of the Model Materials and deleting all copies of the Model Materials in your possession or control.
+Consequences of Termination
+
+* Ceasing Use: Upon termination of this Agreement for any reason the license is also terminated. Therefore, if the Agreement is terminated you must cease all use of the Model Materials and promptly delete and destroy all copies, full or partial, of the Model Materials in your possession or control.
+
+* Survival of Rights and Obligations: The rights and obligations contained in Sections 6 and 8 of this Agreement shall survive the termination of this Agreement and shall continue to bind you and any permitted successors and assignees.
+
+* No Liability for Termination: Lindholmen Science Park will not be liable for any damages, losses, costs, or harms arising from the termination of this Agreement, and termination will not affect any liability accrued before the termination date.
+
+11. Jurisdiction and Governing Law
+
+Any dispute, controversy, or claim arising out of or in connection with this contract, or the breach, termination, or invalidity thereof, shall be finally settled by arbitration administered by the Stockholm Chamber of Commerce Arbitration Institute (the “SCC”).
+The Rules for Expedited Arbitrations shall apply, unless the SCC in its discretion determines, taking into account the complexity of the case, the amount in dispute and other circumstances, that the Arbitration Rules shall apply. In the latter case, the SCC shall also decide whether the Arbitral Tribunal shall be composed of one or three arbitrators. The seat of arbitration shall be Stockholm, Sweden. The language to be used in the arbitral proceedings shall be English. This contract shall be governed by the substantive law of Sweden.
+
+12. Updates and Revisions
+
+Commitment to Update: Lindholmen Science Park acknowledges the rapid pace of technological and societal advancements. We remain committed to periodically reviewing and updating the terms of this Agreement to remain in harmony with such developments, thereby safeguarding the interests of all stakeholders involved while promoting sustainable and responsible AI use.
+ 
+Right to Amend: Lindholmen Science Park reserves the right, at its sole discretion, to amend, modify, or replace any part of this Agreement. It is your responsibility to check this Agreement periodically for changes. Your continued use of or access to the Model Materials following the posting of any changes to this Agreement constitutes acceptance of those changes.
+ 
+Grace Period: In the event of any substantial amendment to the terms of this Agreement, you will be provided with a notice period of 30 days from the date such amendments are posted through the distribution platform to review and adapt to the amended terms. Should you disagree with the amendments, you reserve the right to terminate this Agreement in accordance with the termination clause herein.
+ 
+Adherence to Future Norms: By agreeing to this Agreement, you commit to adhering to potential future norms, regulations, and guidelines that may be introduced in the jurisdiction pertaining to the use of AI technologies, even if they are introduced after your acceptance of this Agreement.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7c0e390
--- /dev/null
+++ b/README.md
@@ -0,0 +1,74 @@
+---
+language:
+- sv
+- "no"
+- da
+- is
+- en
+tags:
+- text-generation
+- swedish
+- nordic
+- gpt-sw3
+- AI-Sweden
+license: other
+library_name: transformers
+---
+
+# gpt-sw3-6.7b
+
+GPT-SW3 6.7B base model. Text completion in Swedish, Norwegian, Danish, Icelandic, and English.
+
+**Size:** 6.7B | **Type:** base | **Languages:** Swedish, Norwegian, Danish, Icelandic, English
+
+> Community mirror of [AI-Sweden-Models/gpt-sw3-6.7b](https://huggingface.co/AI-Sweden-Models/gpt-sw3-6.7b)
+
+---
+
+## Warning and Disclaimer
+
+This model is provided as-is for research and educational purposes.
+This is a community redistribution of AI Sweden's GPT-SW3 under the same modified RAIL license.
+
+**You are responsible for any content you create using this model. Use responsibly.**
+
+The model may reflect biases from training data and may generate inaccurate, offensive,
+or inappropriate content. Neither the uploader nor AI Sweden is liable for downstream misuse.
+Review the [AI Sweden RAIL license](LICENSE) before any production deployment.
+
+> *"You are responsible for any content you create using this model. Enjoy responsibly."*
+
+---
+
+## Usage
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+model_id = "WestCode1357/gpt-sw3-6.7b"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
+device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+
+prompt = "Träd är fina för att"
+inputs = tokenizer(prompt, return_tensors="pt").to(device)
+out = model.generate(**inputs, max_new_tokens=150, do_sample=True, temperature=0.7)
+print(tokenizer.decode(out[0]))
+```
+
+## Intended Use
+
+> ⚠️ **These models contain extreme bias and are NOT intended for commercial use.**
+> **For scientific and research use only.**
+
+GPT-SW3 was trained on large-scale web data and may reflect harmful societal biases present in that data. It has not been aligned or safety-tuned beyond its original training. Use strictly in controlled research settings. Do not deploy in any consumer-facing or commercial product without thorough evaluation and additional safety measures.
+
+## About GPT-SW3
+
+GPT-SW3 is developed by AI Sweden in collaboration with RISE and WASP WARA for Media and Language.
+Trained on 320B tokens: Swedish, Norwegian, Danish, Icelandic, English, and code.
+
+- **Original models:** https://huggingface.co/AI-Sweden-Models
+- **Project page:** https://www.ai.se/en/project/gpt-sw3
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..e4b39df
--- /dev/null
+++ b/config.json
@@ -0,0 +1,37 @@
+{
+  "_name_or_path": "AI-Sweden-Models/gpt-sw3-6.7b",
+  "activation_function": "gelu",
+  "apply_query_key_layer_scaling": true,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 1,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 1,
+  "initializer_range": 0.01,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 4096,
+  "n_head": 32,
+  "n_inner": 16384,
+  "n_layer": 32,
+  "n_positions": 2048,
+  "normalize_attention_scores": true,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tokenizer_class": "GPTSw3Tokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.0.dev0",
+  "use_cache": false,
+  "vocab_size": 64000,
+  "n_ctx": 2048
+}
diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors
new file mode 100644
index 0000000..4c96a25
--- /dev/null
+++ b/model-00001-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce11f0333f074f87f04749853b652fe2dfc3a98f1504a4de6085843baafb41d9
+size 9993224624
diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors
new file mode 100644
index 0000000..1411e60
--- /dev/null
+++ b/model-00002-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19fb4870e3ebfb2c250dcfec2147686eca1cb4b2b4ad6bfbbf6ef8f6447ccf52
+size 9985116208
diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors
new file mode 100644
index 0000000..e3b8f26
--- /dev/null
+++ b/model-00003-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c1143206fd49248f0214f926ad22e5a5b511d2f664862f85ed58bd5904f9ddd
+size 8063283552
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000..b4e5569
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,460 @@
+{
+    "metadata": {
+        "total_size": 28041576576
+    },
+    "weight_map": {
+        "lm_head.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.0.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.0.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.0.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.0.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.0.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.0.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.0.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.0.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.0.ln_2.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.0.ln_2.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.0.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.0.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.0.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.0.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.1.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.1.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.1.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.1.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.1.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.1.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.1.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.1.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.1.ln_2.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.1.ln_2.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.1.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.1.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.1.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.1.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.10.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.10.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.10.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.10.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.10.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.10.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.10.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.10.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.10.ln_2.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.10.ln_2.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.10.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.10.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.10.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.10.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.11.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.11.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.11.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.11.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.11.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.11.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.11.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.11.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.11.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.11.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.11.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.11.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.11.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.11.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.12.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.12.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.12.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.12.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.12.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.12.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.12.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.12.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.12.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.12.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.12.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.12.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.12.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.12.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.13.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.13.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.13.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.13.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.13.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.13.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.13.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.13.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.13.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.13.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.13.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.13.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.13.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.13.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.14.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.14.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.14.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.14.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.14.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.14.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.14.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.14.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.14.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.14.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.14.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.14.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.14.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.14.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.15.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.15.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.15.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.15.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.15.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.15.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.15.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.15.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.15.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.15.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.15.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.15.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.15.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.15.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.16.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.16.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.16.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.16.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.16.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.16.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.16.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.16.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.16.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.16.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.16.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.16.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.16.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.16.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.17.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.17.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.17.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.17.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.17.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.17.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.17.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.17.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.17.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.17.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.17.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.17.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.17.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.17.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.18.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.18.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.18.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.18.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.18.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.18.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.18.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.18.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.18.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.18.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.18.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.18.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.18.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.18.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.19.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.19.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.19.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.19.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.19.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.19.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.19.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.19.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.19.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.19.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.19.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.19.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.19.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.19.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.2.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.2.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.2.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.2.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.2.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.2.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.2.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.2.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.2.ln_2.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.2.ln_2.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.2.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.2.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.2.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.2.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.20.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.20.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.20.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.20.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.20.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.20.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.20.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.20.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.20.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.20.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.20.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.20.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.20.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.20.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.21.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.21.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.21.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.21.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.21.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.21.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.21.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.21.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.21.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.21.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.21.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.21.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.21.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.21.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.22.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.22.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.22.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.22.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.22.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.22.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.22.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.22.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.22.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.22.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.22.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.22.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.22.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.22.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.23.attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.23.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.23.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.23.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.23.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.23.attn.masked_bias": "model-00002-of-00003.safetensors",
+        "transformer.h.23.ln_1.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.23.ln_1.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.23.ln_2.bias": "model-00002-of-00003.safetensors",
+        "transformer.h.23.ln_2.weight": "model-00002-of-00003.safetensors",
+        "transformer.h.23.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.23.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.23.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.23.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.24.attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.24.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.24.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.24.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.24.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.24.attn.masked_bias": "model-00003-of-00003.safetensors",
+        "transformer.h.24.ln_1.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.24.ln_1.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.24.ln_2.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.24.ln_2.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.24.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.24.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.24.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.24.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.25.attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.25.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.25.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.25.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.25.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.25.attn.masked_bias": "model-00003-of-00003.safetensors",
+        "transformer.h.25.ln_1.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.25.ln_1.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.25.ln_2.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.25.ln_2.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.25.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.25.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.25.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.25.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.26.attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.26.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.26.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.26.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.26.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.26.attn.masked_bias": "model-00003-of-00003.safetensors",
+        "transformer.h.26.ln_1.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.26.ln_1.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.26.ln_2.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.26.ln_2.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.26.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.26.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.26.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.26.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.27.attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.27.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.27.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.27.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.27.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.27.attn.masked_bias": "model-00003-of-00003.safetensors",
+        "transformer.h.27.ln_1.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.27.ln_1.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.27.ln_2.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.27.ln_2.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.27.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.27.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.27.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.27.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.28.attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.28.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.28.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.28.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.28.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.28.attn.masked_bias": "model-00003-of-00003.safetensors",
+        "transformer.h.28.ln_1.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.28.ln_1.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.28.ln_2.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.28.ln_2.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.28.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.28.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.28.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.28.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.29.attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.29.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.29.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.29.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.29.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.29.attn.masked_bias": "model-00003-of-00003.safetensors",
+        "transformer.h.29.ln_1.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.29.ln_1.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.29.ln_2.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.29.ln_2.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.29.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.29.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.29.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.29.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.3.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.3.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.3.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.3.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.3.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.3.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.3.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.3.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.3.ln_2.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.3.ln_2.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.3.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.3.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.3.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.3.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.30.attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.30.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.30.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.30.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.30.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.30.attn.masked_bias": "model-00003-of-00003.safetensors",
+        "transformer.h.30.ln_1.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.30.ln_1.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.30.ln_2.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.30.ln_2.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.30.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.30.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.30.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.30.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.31.attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.31.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.31.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.31.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.31.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.31.attn.masked_bias": "model-00003-of-00003.safetensors",
+        "transformer.h.31.ln_1.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.31.ln_1.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.31.ln_2.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.31.ln_2.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.31.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.31.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.31.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+        "transformer.h.31.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+        "transformer.h.4.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.4.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.4.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.4.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.4.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.4.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.4.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.4.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.4.ln_2.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.4.ln_2.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.4.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.4.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.4.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.4.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.5.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.5.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.5.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.5.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.5.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.5.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.5.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.5.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.5.ln_2.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.5.ln_2.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.5.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.5.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.5.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.5.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.6.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.6.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.6.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.6.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.6.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.6.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.6.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.6.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.6.ln_2.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.6.ln_2.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.6.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.6.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.6.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.6.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.7.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.7.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.7.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.7.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.7.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.7.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.7.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.7.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.7.ln_2.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.7.ln_2.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.7.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.7.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.7.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.7.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.8.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.8.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.8.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.8.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.8.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.8.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.8.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.8.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.8.ln_2.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.8.ln_2.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.8.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.8.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.8.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.8.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.9.attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.9.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.9.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.9.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.9.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.9.attn.masked_bias": "model-00001-of-00003.safetensors",
+        "transformer.h.9.ln_1.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.9.ln_1.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.9.ln_2.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.9.ln_2.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.9.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.9.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+        "transformer.h.9.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+        "transformer.h.9.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+        "transformer.ln_f.bias": "model-00003-of-00003.safetensors",
+        "transformer.ln_f.weight": "model-00003-of-00003.safetensors",
+        "transformer.wpe.weight": "model-00001-of-00003.safetensors",
+        "transformer.wte.weight": "model-00001-of-00003.safetensors"
+    }
+}
\ No newline at end of file
diff --git a/pytorch_model-00001-of-00003.bin b/pytorch_model-00001-of-00003.bin
new file mode 100644
index 0000000..72d502d
--- /dev/null
+++ b/pytorch_model-00001-of-00003.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9b03084caa3ba332c933f1c8837dbfaab4c51f9d92dd0e37fd7511a06bddcb1
+size 9993261183
diff --git a/pytorch_model-00002-of-00003.bin b/pytorch_model-00002-of-00003.bin
new file mode 100644
index 0000000..bc7131e
--- /dev/null
+++ b/pytorch_model-00002-of-00003.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:beb141a499274a85d6d9c6a1c1ef0fdacb17ef34caa3484b1482b7edb1b1a093
+size 9985155825
diff --git a/pytorch_model-00003-of-00003.bin b/pytorch_model-00003-of-00003.bin
new file mode 100644
index 0000000..b2f6332
--- /dev/null
+++ b/pytorch_model-00003-of-00003.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9bf1c1365379ffb1a750d5d0d444cbfcfbdb58e0ee2d15ff56923bd915cd1cc
+size 8063310520
diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json
new file mode 100644
index 0000000..5a1ae90
--- /dev/null
+++ b/pytorch_model.bin.index.json
@@ -0,0 +1,460 @@
+{
+  "metadata": {
+    "total_size": 28041576576
+  },
+  "weight_map": {
+    "lm_head.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.0.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.0.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.1.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.10.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.11.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.11.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.11.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.11.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.11.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.11.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.11.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.11.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.11.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.11.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.11.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.11.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.11.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.11.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.12.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.13.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.14.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.15.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.16.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.17.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.18.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.19.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.2.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.2.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.20.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.20.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.21.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.22.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.23.attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.23.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.23.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.23.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.23.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.23.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.23.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.23.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.23.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.23.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+    "transformer.h.23.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.23.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.23.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.23.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.24.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.25.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.26.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.27.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.28.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.29.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.3.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.3.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.30.attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.30.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.31.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.h.4.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.4.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.5.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.6.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.7.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.8.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+    "transformer.h.9.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.ln_f.bias": "pytorch_model-00003-of-00003.bin",
+    "transformer.ln_f.weight": "pytorch_model-00003-of-00003.bin",
+    "transformer.wpe.weight": "pytorch_model-00001-of-00003.bin",
+    "transformer.wte.weight": "pytorch_model-00001-of-00003.bin"
+  }
+}
diff --git a/spiece.model b/spiece.model
new file mode 100644
index 0000000..9ec80b4
--- /dev/null
+++ b/spiece.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a76244a65ab35adda1b1cdb7b49be970d143bcc489d7b05d87551a12de78878
+size 1071963