初始化项目，由ModelHub XC社区提供模型

Model: Joaoffg/SHARE-4B-Base-2604 Source: Original Platform
2026-05-30 21:56:12 +08:00
commit 95fd0e8dfb
14 changed files with 249782 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
--- a/176
+++ b/176
@@ -0,0 +1,176 @@
+~~~
+Generated on: 2026-02-16 13:33:15.715000+00:00
+License ID: 02a257f4-5c16-41c6-9db1-d2b86954ea90
+License Template Version: e8502289197accc4ddd023f0fc234ca26062a9f1
+~~~
+
+### **Social-Humanities AI For Research and Education RAIL-MS**
+
+Licensed Artifact(s):
+
+   - Model
+
+   - Source Code
+
+
+**Section I: PREAMBLE**
+
+This RAIL License is generally applicable to the Artifact(s) identified above. 
+
+For valuable consideration, You and Licensor agree as follows:
+
+**1. Definitions**
+
+(a) “**Application**” refers to a sequence of instructions or statements written in machine code language, including object code (that is the product of a compiler), binary code (data using a two-symbol system) or an intermediate language (such as register transfer language).
+
+(b) “**Artifact**” refers to a software application (in either binary or source code format), Model, and/or Source Code, in accordance with what is specified above as the “Licensed Artifact”.
+
+(c) “**Contribution**” means any work, including any modifications or additions to an Artifact, that is intentionally submitted to Licensor for inclusion or incorporation in the Artifact directly or indirectly by the rights owner. For the purposes of this definition, “**submitted**” means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing, sharing and improving the Artifact, but excluding communication that is conspicuously marked or otherwise designated in writing by the contributor as "**Not a Contribution.**"
+
+(d) "**Contributor**" means Licensor or any other individual or legal entity that creates or owns a Contribution that is added to or incorporated into an Artifact or its Derivative.
+
+(e) **“Data”** means a collection of information and/or content extracted from the dataset used with a given Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.
+
+(f) **“Derivative**” means a work derived from or based upon an Artifact, and includes all modified versions of such Artifact.
+
+(g) **“Distribution”** means any transmission, reproduction, publication or other sharing of an Artifact or Derivative to a third party, including providing a hosted service incorporating the Artifact, which is made available by electronic or other remote means - e.g. API-based or web access.
+
+(h) “**Harm**” includes but is not limited to physical, mental, psychological, financial and reputational damage, pain, or loss.
+
+(i) "**License**" means the terms and conditions for use, reproduction, and Distribution as defined in this document.
+
+(j) “**Licensor**” means the rights owner (by virtue of creation or documented transfer of ownership) or entity authorized by the rights owner (e.g., exclusive licensee) that is granting the rights in this License.
+
+(k) “**Model**” means any machine-learning based assembly or assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Source Code.
+
+(l) **“Output”** means the results of operating a Model as embodied in informational content resulting therefrom.
+
+(m) “**Permitted Purpose**” means for non-commercial scientific research, development, and education only. "Non-commercial" means use explicitly not intended for or directed towards commercial advantage or monetary compensation.
+
+(n) “**Source Code**” means any collection of text written using human-readable programming language, including the code and scripts used to define, run, load, benchmark or evaluate a Model or any component thereof, and/or used to prepare data for training or evaluation, if any. Source Code includes any accompanying documentation, tutorials, examples, etc, if any. For clarity, the term “Source Code” as used in this License includes any and all Derivatives of such Source Code.
+
+(o) “**Third Parties**” means individuals or legal entities that are not under common control with Licensor or You.
+
+(p) **“Use”** includes accessing, using, copying, modifying, and/or distributing an Artifact; in connection with a Model as Artifact, Use also includes creating content, fine-tuning, updating, running, training, evaluating and/or re-parametrizing such Model.
+
+(q) "**You**" (or "**Your**") means an individual or legal entity receiving and exercising permissions granted by this License and/or making use of the Artifact for permitted purposes and in any permitted field of use, including usage of the Artifact in an end-use application - e.g. chatbot, translator, image generator, etc.
+
+**Section II: INTELLECTUAL PROPERTY RIGHTS**
+
+Both copyright and patent grants may apply to the Artifact. The Artifact is subject to additional terms as described in Section III below, which govern the use of the Artifact in the event that Section II is held unenforceable or inapplicable.
+
+**2. Grant of Copyright License**. Conditioned upon compliance with Section III below and subject to the terms and conditions of this License, each Contributor hereby grants to You, only in connection with the Permitted Purpose, a worldwide, non-exclusive, royalty-free copyright license to reproduce, use, publicly display, publicly perform, sublicense, and distribute the Artifact and Derivatives thereof.
+
+**3. Grant of Patent License**. Conditioned upon compliance with Section III below and subject to the terms and conditions of this License, and only where and as applicable, each Contributor hereby grants to You, only in connection with the Permitted Purpose, a worldwide, non-exclusive, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, and use the Artifact where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Artifact to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Artifact and/or a Contribution incorporated within the Artifact constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License in connection with the Artifact shall terminate as of the date such litigation is asserted or filed.
+
+Licensor and Contributor each have the right to grant the licenses above.
+
+**Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION**
+
+**4. Use-based restrictions.** The restrictions set forth in Attachment A are mandatory Use-based restrictions. Therefore You may not Use the Artifact in violation of such restrictions. You may Use the Artifact only subject to this License. You shall require all of Your users who use the Artifact or its Derivative to comply with the terms of this paragraph and only for the Permitted Purpose.
+
+**5. The Output You Generate with a Model (as Artifact).** Except as set forth herein, Licensor claims no ownership rights in the Output You generate. You are accountable for the Output You generate and its subsequent uses. However, Your use of the Output is strictly subject to the Use Restrictions in Attachment A. For the avoidance of doubt, You may not use the Output to contravene any provision stated in this License, including the prohibitions on model distillation and training data extraction.
+
+**6. Distribution and Redistribution**. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Artifact or its Derivatives in any medium, with or without modifications, provided that You meet the following conditions:
+
+1. Use-based restrictions in paragraph 4 MUST be included as a condition precedent to effect any type of legal agreement (e.g. a license) governing the use and/or distribution of the Artifact or its Derivatives, and You shall give such notice to any subsequent Third Party recipients;
+2. You shall give any Third Party recipients of the Artifact or its Derivatives a copy of this License;
+3. You shall cause any modified files to carry prominent notices stating that You changed the files;
+4. You shall retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Artifact or its Derivatives.
+5. You and any Third Party recipients of the Artifact or its Derivative shall adhere to the Permitted Purpose.
+
+You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions with respect to paragraph **6.1.,** to govern the use, reproduction, or Distribution of Your modifications, or for any Derivative, **provided that** Your use, reproduction, and Distribution of the Artifact or its Derivative otherwise complies with the conditions stated in this License. In other words, the Use-based restrictions in Attachment A form the minimum set of terms for You to license to Third Parties any Artifact or its Derivative, but You may add more restrictive terms if You deem it necessary.
+
+**Section IV: OTHER PROVISIONS**
+
+**7. Updates and Runtime Restrictions.** To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Artifact in violation of this License or update the Artifact through electronic means.
+
+**8. Trademarks and related.** Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.
+
+**9. Disclaimer of Warranty**. Unless required by applicable law or agreed to in writing, Licensor provides the Artifact (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using the Artifact, and assume any risks associated with Your exercise of permissions under this License.
+
+**10. Limitation of Liability**. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Artifact (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
+
+**11.** If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
+
+**12.** **Term and Termination.** The term of this License will commence upon the earlier of (a) Your acceptance of this License or (b) accessing the Artifact; and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Licensor may terminate this License if You are in breach of any term or condition of this Agreement. Upon termination of this Agreement, You shall delete and cease use of the Artifact. Section 10 shall survive the termination of this License.
+
+END OF TERMS AND CONDITIONS
+
+ 
+
+**Attachment A**
+
+### **USE RESTRICTIONS**
+
+You agree not to use the Artifact or its Derivatives in any of the following ways:
+
+
+1. Discrimination
+
+   (a) To discriminate or exploit individuals or groups based on legally protected characteristics and/or vulnerabilities.
+
+   (b) For purposes of administration of justice, law enforcement, immigration, or asylum processes, such as predicting that a natural person will commit a crime or the likelihood thereof.
+
+   (c) To engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, or other essential goods and services.
+
+
+2. Military
+
+   (a) For weaponry or warfare.
+
+
+3. Legal
+
+   (a) To engage or enable fully automated decision-making that adversely impacts a natural person's legal rights without expressly and intelligibly disclosing the impact to such natural person and providing an appeal process.
+
+   (b) To engage or enable fully automated decision-making that creates, modifies or terminates a binding, enforceable obligation between entities; whether these include natural persons or not.
+
+   (c) In any way that violates any applicable national, federal, state, local or international law or regulation.
+
+
+4. Disinformation
+
+   (a) To create, present or disseminate verifiably false or misleading information for economic gain or to intentionally deceive the public, including creating false impersonations of natural persons.
+
+   (b) To synthesize or modify a natural person's appearance, voice, or other individual characteristics, unless prior informed consent of said natural person is obtained.
+
+   (c) To autonomously interact with a natural person, in text or audio format, unless disclosure and consent is given prior to interaction that the system engaging in the interaction is not a natural person.
+
+   (d) To defame or harm a natural person's reputation, such as by generating, creating, promoting, or spreading defamatory content (statements, images, or other content).
+
+   (e) To generate or disseminate information (including - but not limited to - images, code, posts, articles), and place the information in any public context without expressly and intelligibly disclaiming that the information and/or content is machine generated.
+
+
+5. Privacy
+
+   (a) To utilize personal information to infer additional personal information about a natural person, including but not limited to legally protected characteristics, vulnerabilities or categories; unless informed consent from the data subject to collect said inferred personal information for a stated purpose and defined duration is received.
+
+   (b) To generate or disseminate personal identifiable information that can be used to harm an individual or to invade the personal privacy of an individual.
+
+
+6. Health
+
+   (a) To provide medical advice or make clinical decisions without necessary (external) accreditation of the system; unless the use is (i) in an internal research context with independent and accountable oversight and/or (ii) with medical professional oversight that is accompanied by any related compulsory certification and/or safety/quality standard for the implementation of the technology.
+
+
+7. Research
+
+   (a) In connection with any academic dishonesty, including submitting any informational content or output of a Model as Your own work in any academic setting.
+
+
+8. Malware
+
+   (a) To generate and/or disseminate malware (including - but not limited to - ransomware) or any other content to be used for the purpose of Harming electronic systems.
+
+
+9. General
+
+   (a) To intentionally deceive or mislead others, including failing to appropriately disclose to end users any known dangers of your system.
+
+
+10. Model Development and Data Integrity
+
+   (a) To use the Artifact, its Derivatives, or any Output to directly or indirectly train, pre-train, fine-tune, or evaluate any other machine learning model or artificial intelligence system (including, but not limited to, model distillation or the generation of synthetic data for training purposes).
+
+   (b) To intentionally interact with, query, or prompt the Model for the purpose of discovering, extracting, reconstructing, reverse-engineering, or reproducing the Data (as defined in Section 1(e)) or any specific texts or information used to train, pre-train, or evaluate the Model.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,204 @@
+---
+language:
+- en
+- nl
+license: other
+license_name: rail-share
+license_link: LICENSE
+metrics:
+- perplexity
+library_name: transformers
+pipeline_tag: text-generation
+---
+
+# Model Card for SHARE-4B
+
+SHARE-4B (Social-Humanities AI for Research and Education) is a 3.9-billion-parameter decoder-only causal language model pretrained exclusively on content relevant to the social sciences and humanities (SSH). It is intended as a domain-specific base model for SSH research and education, and is designed to be used through the MIRROR interface, which surfaces token-level surprisal rather than generating new text.
+
+This model was introduced in the paper [SHARE: Social-Humanities AI for Research and Education](https://huggingface.co/papers/2604.11152).
+
+**Note:** This is a fully trained base (pretrained-only) model with no SFT, DPO, or RLHF. Due to its smaller size, a quantized version of SHARE-4B can be deployed on local machines with only CPU compute (e.g., student laptops), making it significantly more accessible and carbon-efficient than larger comparable models. This base model is not suitable for chat applications.
+
+## Model Details
+
+### Model Description
+
+SHARE-4B is part of the first family of causal language models fully pretrained by and for the SSH disciplines. It mirrors the Phi-4-mini architecture but uses a custom 50,000-token BPE tokenizer trained on the SHARE corpus, and is pretrained exclusively on a curated SSH dataset drawn from Wikipedia, Project Gutenberg, PeS2o, and (for the larger SHARE-14B) CORE. On a custom SSH Cloze benchmark, SHARE-4B achieves 69.8% raw accuracy and 66.2% prior-corrected accuracy, marginally outperforming the comparable Pythia-3B (63.6% prior-corrected) despite having seen far fewer training tokens.
+
+- **Developed by:** João Gonçalves, Sonia de Jager, Petr Knoth, David Pride, Nick Jelicic
+- **Funded by:** NVIDIA Academic Grant; Dutch Research Council (NWO) VENI grant VI.Veni.221S.154
+- **Model type:** Decoder-only transformer causal language model (Phi-4-mini architecture)
+- **Language(s) (NLP):** Primarily English, with a smaller proportion of Dutch
+- **License:** Custom Responsible AI License (RAIL-SHARE) — non-commercial, no model distillation, restricted text generation use
+
+### Model Sources
+
+- **Repository:** https://github.com/Joaoffg/SHARE
+- **Paper:** [SHARE: Social-Humanities AI for Research and Education](https://huggingface.co/papers/2604.11152)
+- **Demo (MIRROR interface):** [Add link]
+- **Contact:** ferreiragoncalves@eshcc.eur.nl
+
+## Uses
+
+### Direct Use
+
+SHARE-4B is intended primarily as a base model deployed through the MIRROR interface for SSH researchers, educators, and students. Its smaller size makes it particularly suitable for local, low-resource deployments such as on student laptops. Through MIRROR, the model is used to compute token-level surprisal and entropy on user-written texts in order to:
+
+- Identify typos, stylistic anomalies, and possible factual mistakes in academic writing
+- Highlight innovative or unexpected contributions in scholarly texts
+- Surface disciplinary biases and norms encoded in SSH literature
+- Support reflective revision of student and scholarly writing in the SSH
+
+### Downstream Use
+
+Potential downstream uses include perplexity-based analyses of SSH texts, domain-specific text classification, and research on the structure and biases of SSH scholarly discourse. Downstream use is governed by the RAIL-SHARE license (non-commercial; no distillation).
+
+### Out-of-Scope Use
+
+- Commercial applications of any kind (forbidden by license)
+- Model distillation into other models (forbidden by license)
+- Unconstrained text generation, especially in academic contexts where it could enable student or faculty fraud
+- STEM, biomedical, mathematical, or coding tasks — the model was deliberately not trained on these domains
+- Use as a chat assistant — the model is base-pretrained only, with no SFT or alignment
+- Multilingual applications outside of English and (to a lesser extent) Dutch
+- Any safety-critical decision-making
+
+## Bias, Risks, and Limitations
+
+SHARE-4B inherits the systemic biases present in the open-access English-language SSH scholarship it was trained on. As illustrated in the paper, terms associated with non-Western scholarship (e.g. "African" in the context of locations of knowledge production) can register as unexpected, reflecting the field's existing imbalances rather than properties of the topics themselves.
+
+Other limitations and risks:
+
+- **Smaller parameter count:** SHARE-4B flags fewer nuanced stylistic and factual deviations than SHARE-14B. For example, in qualitative testing the 4B model was less confident than the 14B model in identifying incorrect author attributions and more subtle stylistic issues
+- **Smaller training corpus:** SHARE-4B was trained on the Wikipedia, Project Gutenberg, and PeS2o subsets but excluded the CORE dataset, resulting in a total of ~28 billion tokens across 2 epochs
+- **English-dominant data**, which is a meaningful constraint for SSH fields where multilingual scholarship matters
+- **Causal interpretation effect:** because surprisal is computed on preceding tokens, an early mistake in a text propagates and can mask later anomalies
+- **Use in text reading/reviewing** could be misused to shortcut careful reading of academic work
+- **No alignment or safety tuning** has been applied — the model is released as a base model
+- **Outperformed by smaller masked models on some tasks:** on the SSH Cloze benchmark, SHARE-4B ranks below the much smaller SSciBERT model, showing that masked language models with tightly aligned training corpora can still outperform larger causal models in Cloze tasks
+
+### Recommendations
+
+Users should treat MIRROR outputs as prompts for reflection rather than authoritative judgments. Surprisal does not equal correctness, and unexpectedness can signal innovation as readily as error. When using MIRROR for revision, work from the beginning of the text to mitigate the propagation of earlier surprisal into later tokens. For use cases requiring more nuanced detection of stylistic or factual deviations, users may prefer SHARE-14B; SHARE-4B is well-suited for local deployment and lower-resource contexts. Researchers should be aware of the model's biases toward dominant SSH discourses and read its outputs critically. Use of SHARE for direct text generation is discouraged.
+
+
+## Training Details
+
+### Training Data
+
+The training corpus for SHARE-4B combines three SSH-focused subsets (the CORE dataset was added only for SHARE-14B):
+
+- **Wikipedia** (English and Dutch): articles selected by traversing the category tree from SSH-relevant main topic classifications using PetScan and extracted with WikiExtractor
+- **Project Gutenberg:** books filtered by SSH-relevant Library of Congress Classes (B, C, D, G, H, J, K, L, M, N)
+- **Academic publications:** drawn from PeS2o, filtered using AllenAI's Field of Science (FoS) classifier to retain SSH disciplines (Art, Business, Economics, Geography, Education, History, Law, Linguistics, Philosophy, Political Science, Psychology, Sociology), plus additional materials provided through agreements with publishers including Open Humanities Press
+
+The SHARE-4B corpus totals approximately 14 billion tokens, covered across 2 training epochs. See the technical report for details on filtering and selection.
+
+### Training Procedure
+
+#### Preprocessing
+
+Raw data preprocessing was carried out exclusively on EU servers. A custom BPE tokenizer with a 50,000-token vocabulary was trained on the full SHARE corpus.
+
+#### Training Hyperparameters
+
+- **Training regime:** Mixed precision with FlashAttention-2
+- **Architecture:** Phi-4-mini (decoder-only transformer)
+- **Context length:** 4096 tokens
+- **Global batch size:** 64
+- **Warm-up steps:** 3000
+- **Learning rate:** 2e-4 with cosine learning rate scheduler
+- **Weight decay:** 0.01
+- **Epochs:** 2
+
+#### Speeds, Sizes, Times
+
+Training was conducted on Saturn Cloud using 8× NVIDIA A100 GPUs under data parallelism over a period of 656 hours to complete 2 epochs over approximately 28 billion tokens. Training loss, evaluation loss, and gradient norm values indicated a smooth training run, reaching an evaluation perplexity of 11.94.
+
+## Evaluation
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+- **Perplexity comparison:** Erasmus University Rotterdam research output abstracts from Q3–Q4 2025, out of distribution from the training data
+- **SSH Cloze benchmark:** 275 SSH abstracts published in Q1 2026 (25 per Web of Science field across 11 SSH disciplines), constructed by selecting sentences with equivalent-token decisions (e.g. positive/negative, higher/lower) where SSH knowledge is required to predict the correct token
+
+#### Factors
+
+- Scientific domain (FoS classifier categories)
+- Faculty affiliation of authors at Erasmus University Rotterdam (used as an ecological-validity check)
+
+#### Metrics
+
+- Log-perplexity difference relative to Phi-4-mini (lower means better SHARE fit)
+- Raw and prior-corrected accuracy on the SSH Cloze benchmark (prior correction accounts for models guessing the more frequent token)
+
+### Results
+
+On the SSH Cloze benchmark, SHARE-4B achieves 69.8% raw accuracy and 66.2% prior-corrected accuracy. This marginally outperforms the comparable Pythia-3B (65.8% / 63.6%) and Pythia-12B (67.3% / 61.5%) despite SHARE-4B being trained on substantially fewer tokens (~28B vs 300B). SHARE-4B underperforms Phi-4-mini (73.8% / 69.8%), which was trained on ~5 trillion tokens, but demonstrates meaningfully better compute efficiency. Notably, the much smaller SSciBERT-e2 (110M, masked LM) achieves a higher prior-corrected accuracy (67.6%), reflecting the strength of tightly domain-aligned masked models on Cloze tasks.
+
+Perplexity analyses show that the gap between SHARE-4B and Phi-4-mini is consistently smaller for SSH fields (Art, Education, Sociology) than for STEM fields (Biology, Engineering, Medicine), indicating the intended SSH specialization. At the faculty level, the same pattern holds: Erasmus MC (medical) shows the largest gap, while SSH-focused faculties show the smallest.
+
+#### Summary
+
+SHARE-4B is a fully trained, compact SSH-specialized model that achieves an evaluation perplexity of 11.94 and competitive Cloze performance relative to similarly sized general-purpose causal models. Its smaller footprint makes it suitable for local, CPU-based deployment in educational contexts.
+
+## Model Examination
+
+Early experiments with instruction-tuned variants suggest that, because the training data deliberately excludes domains such as cybersecurity, biological weapons, and CSAM, classical safety risks are limited; the model also tends to default to harm-reducing framings when prompted with SSH-relevant harmful queries. Memorization probes on the SHARE family indicate that the models do not reproduce copyrighted content, with the few instances of memorization corresponding only to disclaimers and standard headers.
+
+## Environmental Impact
+
+- **Hardware Type:** 8× NVIDIA A100 GPUs (Saturn Cloud)
+- **Hours used:** ~656 hours
+- **Cloud Provider:** Saturn Cloud
+- **Compute Region:** United States
+- **Carbon Emitted:** Estimated at approximately 1.2 metric tons of CO₂ equivalents, roughly equivalent to the emissions of an economy one-way flight from Amsterdam to New York
+
+The project applied Chinchilla scaling laws to budget compute and used efficiency techniques (mixed precision, FlashAttention-2, gradient checkpointing) to reduce energy use. A quantized version of SHARE-4B runs on CPU-only hardware such as student laptops, offering a significantly lower-carbon alternative to larger models for educational use.
+
+## Citation
+
+**BibTeX:**
+
+```bibtex
+@misc{goncalves2026sharesocialhumanitiesairesearch,
+      title={SHARE: Social-Humanities AI for Research and Education}, 
+      author={João Gonçalves and Sonia de Jager and Petr Knoth and David Pride and Nick Jelicic},
+      year={2026},
+      eprint={2604.11152},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2604.11152}, 
+}
+```
+
+**APA:**
+
+Gonçalves, J., de Jager, S., Knoth, P., Pride, D., & Jelicic, N. (2026). SHARE: Social-humanities AI for research and education. arXiv. https://arxiv.org/abs/2604.11152
+
+## Privacy statement
+
+Personal data, such as author names, may be included in the training documents for SHARE; we use legitimate interest as the legal basis for processing the data under the EU's GDPR. The full privacy statement can be consulted here: https://surfdrive.surf.nl/s/gFnxgL6f5jer8yy
+
+## Glossary
+
+- **SSH:** Social Sciences and Humanities
+- **MIRROR:** Model Interface for Reflective Research Output Revisions — the user interface that displays per-token surprisal from SHARE rather than generating text
+- **Surprisal:** Negative log probability of an observed token under the model
+- **Prior-corrected accuracy:** Cloze accuracy adjusted to discount correct guesses arising from token frequency priors
+- **FoS:** Field of Science (AllenAI classifier used for disciplinary labelling)
+- **RAIL:** Responsible AI License
+
+## More Information
+
+This model is released alongside SHARE-14B and the MIRROR interface as part of a technical report inviting feedback from the SSH and ML communities. SHARE-4B is particularly intended to enable local, low-resource deployment in educational settings.
+
+## Model Card Authors
+
+João Gonçalves
+
+## Model Card Contact
+
+ferreiragoncalves@eshcc.eur.nl
--- a/config.json
+++ b/config.json
@@ -0,0 +1,31 @@
+{
+  "architectures": [
+    "Phi3ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "embd_pdrop": 0.0,
+  "eos_token_id": 1,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 4096,
+  "model_type": "phi3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "original_max_position_embeddings": 4096,
+  "pad_token_id": 3,
+  "partial_rotary_factor": 1.0,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_cache": false,
+  "vocab_size": 50000
+}
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,8 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 3,
+  "transformers_version": "4.49.0",
+  "use_cache": false
+}
--- a/model-00001-of-00004.safetensors
+++ b/model-00001-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:835463d67d758d742dad4d819f62effa9c80abf941a5d970ec5de3add2404587
+size 4842486216
--- a/model-00002-of-00004.safetensors
+++ b/model-00002-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9404a29f5a6300809c4937432f84f7d7f3a84f6b9d1a85a65ec6d9e5656fafd
+size 4983111176
--- a/model-00003-of-00004.safetensors
+++ b/model-00003-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dab651b89eb18a93b304bb79c03564498cf296224946ef0283667addc9312fae
+size 4983111176
--- a/model-00004-of-00004.safetensors
+++ b/model-00004-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eec54f7dc9c7e0676d894632be7b5c28965d0977dc12797b4dffdee4c9d202e3
+size 916427424
--- a/model.safetensors.index.json
+++ b/model.safetensors.index.json
@@ -0,0 +1,202 @@
+{
+  "metadata": {
+    "total_size": 15725113344
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors"
+  }
+}
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,45 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "PreTrainedTokenizer",
+  "unk_token": "<unk>",
+  "use_cache": false
+}
--- a/training_args.bin
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:398c4c9789d07ea7b97217d7e86f93769afd2ab15179b043fd4158e45fa255c7
+size 5432