commit 6c739eadac485f466e3d5b6f11be3e7fd15a9ee9
Author: ModelHub XC <noreply@modelhub.org.cn>
Date:   Sun May 10 23:47:59 2026 +0800

    初始化项目，由ModelHub XC社区提供模型
    
    Model: BEE-spoke-data/smol_llama-81M-tied
    Source: Original Platform

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..a6344aa
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..92cfb3b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,95 @@
+---
+license: apache-2.0
+thumbnail: https://i.ibb.co/TvyMrRc/rsz-smol-llama-banner.png
+language:
+- en
+inference:
+  parameters:
+    max_new_tokens: 64
+    do_sample: true
+    temperature: 0.8
+    repetition_penalty: 1.15
+    no_repeat_ngram_size: 4
+    eta_cutoff: 0.0006
+    renormalize_logits: true
+widget:
+- text: My name is El Microondas the Wise and
+  example_title: El Microondas
+- text: Kennesaw State University is a public
+  example_title: Kennesaw State University
+- text: >-
+    Bungie Studios is an American video game developer. They are most famous for
+    developing the award winning Halo series of video games. They also made
+    Destiny. The studio was founded
+  example_title: Bungie
+- text: The Mona Lisa is a world-renowned painting created by
+  example_title: Mona Lisa
+- text: >-
+    The Harry Potter series, written by J.K. Rowling, begins with the book
+    titled
+  example_title: Harry Potter Series
+- text: >-
+    Question: I have cities, but no houses. I have mountains, but no trees. I
+    have water, but no fish. What am I?
+
+    Answer:
+  example_title: Riddle
+- text: The process of photosynthesis involves the conversion of
+  example_title: Photosynthesis
+- text: >-
+    Jane went to the store to buy some groceries. She picked up apples, oranges,
+    and a loaf of bread. When she got home, she realized she forgot
+  example_title: Story Continuation
+- text: >-
+    Problem 2: If a train leaves Station A at 9:00 AM and travels at 60 mph, and
+    another train leaves Station B at 10:00 AM and travels at 80 mph, when will
+    they meet if the distance between the stations is 300 miles?
+
+    To determine
+  example_title: Math Problem
+- text: In the context of computer programming, an algorithm is
+  example_title: Algorithm Definition
+pipeline_tag: text-generation
+tags:
+- smol_llama
+- llama2
+datasets:
+- JeanKaddour/minipile
+- pszemraj/simple_wikipedia_LM
+- BEE-spoke-data/wikipedia-20230901.en-deduped
+- mattymchen/refinedweb-3m
+---
+
+
+# smol_llama-81M-tied
+
+<img src="smol-llama-banner.png" alt="banner" style="max-width:80%; height:auto;">
+
+A small decoder model with 81M total parameters, made possible by tying the input and output embeddings. This is the first version of the model.
+
+- 768 hidden size, 6 layers
+- standard multi-head attention (24 heads), context length 1024
+- input/output embeddings **are tied**
+- trained from scratch
+
+## Notes
+
+**This checkpoint** is the 'raw' pre-trained model and has not been tuned to a more specific task. **It should be fine-tuned** before use in most cases.
+
+- A slightly larger, 101M-parameter pretrained version that uses GQA is available [here](https://huggingface.co/BEE-spoke-data/smol_llama-101M-GQA)
+- For the chat version of this model, please [see here](https://youtu.be/dQw4w9WgXcQ?si=3ePIqrY1dw94KMu4)
+
+---
+# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
+Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_BEE-spoke-data__smol_llama-81M-tied)
+
+| Metric              | Value |
+|---------------------|-------|
+| Avg.                | 24.52 |
+| ARC (25-shot)       | 22.18 |
+| HellaSwag (10-shot) | 29.33 |
+| MMLU (5-shot)       | 24.06 |
+| TruthfulQA (0-shot) | 43.97 |
+| Winogrande (5-shot) | 49.25 |
+| GSM8K (5-shot)      | 0.23  |
+| DROP (3-shot)       | 2.64  |
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..8a4dc5f
--- /dev/null
+++ b/config.json
@@ -0,0 +1,27 @@
+{
+  "_name_or_path": "BEE-spoke-data/smol_llama-tied-v9-KIx2",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 24,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.34.1",
+  "use_cache": true,
+  "vocab_size": 32128
+}
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000..3c6c66f
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.34.1"
+}
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000..08f2e9e
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83a6c4464b1617249fd99c32b11b4c236e7bc235b0e744c65fe6178ff2af885b
+size 325235784
diff --git a/smol-llama-banner.png b/smol-llama-banner.png
new file mode 100644
index 0000000..f076c2a
Binary files /dev/null and b/smol-llama-banner.png differ
diff --git a/smol_llama-81M-tied-evals/81m_tied.md b/smol_llama-81M-tied-evals/81m_tied.md
new file mode 100644
index 0000000..fb815cb
--- /dev/null
+++ b/smol_llama-81M-tied-evals/81m_tied.md
@@ -0,0 +1,150 @@
+hf-causal-experimental (pretrained=BEE-spoke-data/smol_llama-81M-tied,trust_remote_code=True,dtype=float), limit: None, provide_description: False, num_fewshot: 0, batch_size: 64
+|     Task     |Version| Metric | Value |   |Stderr|
+|--------------|------:|--------|------:|---|-----:|
+|arc_easy      |      0|acc     | 0.4162|±  |0.0101|
+|              |       |acc_norm| 0.3885|±  |0.0100|
+|boolq         |      1|acc     | 0.5832|±  |0.0086|
+|lambada_openai|      0|ppl     |79.4522|±  |3.1355|
+|              |       |acc     | 0.2523|±  |0.0061|
+|openbookqa    |      0|acc     | 0.1540|±  |0.0162|
+|              |       |acc_norm| 0.2780|±  |0.0201|
+|piqa          |      0|acc     | 0.6050|±  |0.0114|
+|              |       |acc_norm| 0.5898|±  |0.0115|
+|winogrande    |      0|acc     | 0.5272|±  |0.0140|
+
+hf-causal-experimental (pretrained=BEE-spoke-data/smol_llama-81M-tied,trust_remote_code=True,dtype=float), limit: None, provide_description: False, num_fewshot: 25, batch_size: 64
+|    Task     |Version| Metric |Value |   |Stderr|
+|-------------|------:|--------|-----:|---|-----:|
+|arc_challenge|      0|acc     |0.1672|±  |0.0109|
+|             |       |acc_norm|0.2218|±  |0.0121|
+
+hf-causal-experimental (pretrained=BEE-spoke-data/smol_llama-81M-tied,trust_remote_code=True,dtype=float), limit: None, provide_description: False, num_fewshot: 10, batch_size: 64
+|  Task   |Version| Metric |Value |   |Stderr|
+|---------|------:|--------|-----:|---|-----:|
+|hellaswag|      0|acc     |0.2769|±  |0.0045|
+|         |       |acc_norm|0.2923|±  |0.0045|
+
+hf-causal-experimental (pretrained=BEE-spoke-data/smol_llama-81M-tied,trust_remote_code=True,dtype=float), limit: None, provide_description: False, num_fewshot: 0, batch_size: 64
+|    Task     |Version|Metric|Value |   |Stderr|
+|-------------|------:|------|-----:|---|-----:|
+|truthfulqa_mc|      1|mc1   |0.2424|±  |0.0150|
+|             |       |mc2   |0.4353|±  |0.0152|
+
+hf-causal-experimental (pretrained=BEE-spoke-data/smol_llama-81M-tied,trust_remote_code=True,dtype=float), limit: None, provide_description: False, num_fewshot: 5, batch_size: 64
+|                      Task                       |Version| Metric |Value |   |Stderr|
+|-------------------------------------------------|------:|--------|-----:|---|-----:|
+|hendrycksTest-abstract_algebra                   |      1|acc     |0.2200|±  |0.0416|
+|                                                 |       |acc_norm|0.2200|±  |0.0416|
+|hendrycksTest-anatomy                            |      1|acc     |0.2741|±  |0.0385|
+|                                                 |       |acc_norm|0.2741|±  |0.0385|
+|hendrycksTest-astronomy                          |      1|acc     |0.1776|±  |0.0311|
+|                                                 |       |acc_norm|0.1776|±  |0.0311|
+|hendrycksTest-business_ethics                    |      1|acc     |0.2100|±  |0.0409|
+|                                                 |       |acc_norm|0.2100|±  |0.0409|
+|hendrycksTest-clinical_knowledge                 |      1|acc     |0.2264|±  |0.0258|
+|                                                 |       |acc_norm|0.2264|±  |0.0258|
+|hendrycksTest-college_biology                    |      1|acc     |0.2361|±  |0.0355|
+|                                                 |       |acc_norm|0.2361|±  |0.0355|
+|hendrycksTest-college_chemistry                  |      1|acc     |0.1900|±  |0.0394|
+|                                                 |       |acc_norm|0.1900|±  |0.0394|
+|hendrycksTest-college_computer_science           |      1|acc     |0.2100|±  |0.0409|
+|                                                 |       |acc_norm|0.2100|±  |0.0409|
+|hendrycksTest-college_mathematics                |      1|acc     |0.1800|±  |0.0386|
+|                                                 |       |acc_norm|0.1800|±  |0.0386|
+|hendrycksTest-college_medicine                   |      1|acc     |0.2023|±  |0.0306|
+|                                                 |       |acc_norm|0.2023|±  |0.0306|
+|hendrycksTest-college_physics                    |      1|acc     |0.2157|±  |0.0409|
+|                                                 |       |acc_norm|0.2157|±  |0.0409|
+|hendrycksTest-computer_security                  |      1|acc     |0.2400|±  |0.0429|
+|                                                 |       |acc_norm|0.2400|±  |0.0429|
+|hendrycksTest-conceptual_physics                 |      1|acc     |0.2596|±  |0.0287|
+|                                                 |       |acc_norm|0.2596|±  |0.0287|
+|hendrycksTest-econometrics                       |      1|acc     |0.2544|±  |0.0410|
+|                                                 |       |acc_norm|0.2544|±  |0.0410|
+|hendrycksTest-electrical_engineering             |      1|acc     |0.2207|±  |0.0346|
+|                                                 |       |acc_norm|0.2207|±  |0.0346|
+|hendrycksTest-elementary_mathematics             |      1|acc     |0.2169|±  |0.0212|
+|                                                 |       |acc_norm|0.2169|±  |0.0212|
+|hendrycksTest-formal_logic                       |      1|acc     |0.1587|±  |0.0327|
+|                                                 |       |acc_norm|0.1587|±  |0.0327|
+|hendrycksTest-global_facts                       |      1|acc     |0.1900|±  |0.0394|
+|                                                 |       |acc_norm|0.1900|±  |0.0394|
+|hendrycksTest-high_school_biology                |      1|acc     |0.3000|±  |0.0261|
+|                                                 |       |acc_norm|0.3000|±  |0.0261|
+|hendrycksTest-high_school_chemistry              |      1|acc     |0.2808|±  |0.0316|
+|                                                 |       |acc_norm|0.2808|±  |0.0316|
+|hendrycksTest-high_school_computer_science       |      1|acc     |0.2800|±  |0.0451|
+|                                                 |       |acc_norm|0.2800|±  |0.0451|
+|hendrycksTest-high_school_european_history       |      1|acc     |0.2424|±  |0.0335|
+|                                                 |       |acc_norm|0.2424|±  |0.0335|
+|hendrycksTest-high_school_geography              |      1|acc     |0.2576|±  |0.0312|
+|                                                 |       |acc_norm|0.2576|±  |0.0312|
+|hendrycksTest-high_school_government_and_politics|      1|acc     |0.2228|±  |0.0300|
+|                                                 |       |acc_norm|0.2228|±  |0.0300|
+|hendrycksTest-high_school_macroeconomics         |      1|acc     |0.2231|±  |0.0211|
+|                                                 |       |acc_norm|0.2231|±  |0.0211|
+|hendrycksTest-high_school_mathematics            |      1|acc     |0.2370|±  |0.0259|
+|                                                 |       |acc_norm|0.2370|±  |0.0259|
+|hendrycksTest-high_school_microeconomics         |      1|acc     |0.2227|±  |0.0270|
+|                                                 |       |acc_norm|0.2227|±  |0.0270|
+|hendrycksTest-high_school_physics                |      1|acc     |0.2053|±  |0.0330|
+|                                                 |       |acc_norm|0.2053|±  |0.0330|
+|hendrycksTest-high_school_psychology             |      1|acc     |0.2110|±  |0.0175|
+|                                                 |       |acc_norm|0.2110|±  |0.0175|
+|hendrycksTest-high_school_statistics             |      1|acc     |0.4120|±  |0.0336|
+|                                                 |       |acc_norm|0.4120|±  |0.0336|
+|hendrycksTest-high_school_us_history             |      1|acc     |0.2990|±  |0.0321|
+|                                                 |       |acc_norm|0.2990|±  |0.0321|
+|hendrycksTest-high_school_world_history          |      1|acc     |0.2658|±  |0.0288|
+|                                                 |       |acc_norm|0.2658|±  |0.0288|
+|hendrycksTest-human_aging                        |      1|acc     |0.2287|±  |0.0282|
+|                                                 |       |acc_norm|0.2287|±  |0.0282|
+|hendrycksTest-human_sexuality                    |      1|acc     |0.2595|±  |0.0384|
+|                                                 |       |acc_norm|0.2595|±  |0.0384|
+|hendrycksTest-international_law                  |      1|acc     |0.2975|±  |0.0417|
+|                                                 |       |acc_norm|0.2975|±  |0.0417|
+|hendrycksTest-jurisprudence                      |      1|acc     |0.2315|±  |0.0408|
+|                                                 |       |acc_norm|0.2315|±  |0.0408|
+|hendrycksTest-logical_fallacies                  |      1|acc     |0.2822|±  |0.0354|
+|                                                 |       |acc_norm|0.2822|±  |0.0354|
+|hendrycksTest-machine_learning                   |      1|acc     |0.2321|±  |0.0401|
+|                                                 |       |acc_norm|0.2321|±  |0.0401|
+|hendrycksTest-management                         |      1|acc     |0.1748|±  |0.0376|
+|                                                 |       |acc_norm|0.1748|±  |0.0376|
+|hendrycksTest-marketing                          |      1|acc     |0.2308|±  |0.0276|
+|                                                 |       |acc_norm|0.2308|±  |0.0276|
+|hendrycksTest-medical_genetics                   |      1|acc     |0.3000|±  |0.0461|
+|                                                 |       |acc_norm|0.3000|±  |0.0461|
+|hendrycksTest-miscellaneous                      |      1|acc     |0.2375|±  |0.0152|
+|                                                 |       |acc_norm|0.2375|±  |0.0152|
+|hendrycksTest-moral_disputes                     |      1|acc     |0.2486|±  |0.0233|
+|                                                 |       |acc_norm|0.2486|±  |0.0233|
+|hendrycksTest-moral_scenarios                    |      1|acc     |0.2425|±  |0.0143|
+|                                                 |       |acc_norm|0.2425|±  |0.0143|
+|hendrycksTest-nutrition                          |      1|acc     |0.2288|±  |0.0241|
+|                                                 |       |acc_norm|0.2288|±  |0.0241|
+|hendrycksTest-philosophy                         |      1|acc     |0.2090|±  |0.0231|
+|                                                 |       |acc_norm|0.2090|±  |0.0231|
+|hendrycksTest-prehistory                         |      1|acc     |0.2377|±  |0.0237|
+|                                                 |       |acc_norm|0.2377|±  |0.0237|
+|hendrycksTest-professional_accounting            |      1|acc     |0.2234|±  |0.0248|
+|                                                 |       |acc_norm|0.2234|±  |0.0248|
+|hendrycksTest-professional_law                   |      1|acc     |0.2471|±  |0.0110|
+|                                                 |       |acc_norm|0.2471|±  |0.0110|
+|hendrycksTest-professional_medicine              |      1|acc     |0.4081|±  |0.0299|
+|                                                 |       |acc_norm|0.4081|±  |0.0299|
+|hendrycksTest-professional_psychology            |      1|acc     |0.2565|±  |0.0177|
+|                                                 |       |acc_norm|0.2565|±  |0.0177|
+|hendrycksTest-public_relations                   |      1|acc     |0.2182|±  |0.0396|
+|                                                 |       |acc_norm|0.2182|±  |0.0396|
+|hendrycksTest-security_studies                   |      1|acc     |0.2408|±  |0.0274|
+|                                                 |       |acc_norm|0.2408|±  |0.0274|
+|hendrycksTest-sociology                          |      1|acc     |0.2338|±  |0.0299|
+|                                                 |       |acc_norm|0.2338|±  |0.0299|
+|hendrycksTest-us_foreign_policy                  |      1|acc     |0.2500|±  |0.0435|
+|                                                 |       |acc_norm|0.2500|±  |0.0435|
+|hendrycksTest-virology                           |      1|acc     |0.2892|±  |0.0353|
+|                                                 |       |acc_norm|0.2892|±  |0.0353|
+|hendrycksTest-world_religions                    |      1|acc     |0.2105|±  |0.0313|
+|                                                 |       |acc_norm|0.2105|±  |0.0313|
+
diff --git a/smol_llama-81M-tied-evals/json_object_1.json b/smol_llama-81M-tied-evals/json_object_1.json
new file mode 100644
index 0000000..22eedef
--- /dev/null
+++ b/smol_llama-81M-tied-evals/json_object_1.json
@@ -0,0 +1,56 @@
+{
+  "results": {
+    "arc_easy": {
+      "acc": 0.41624579124579125,
+      "acc_stderr": 0.010114819404500878,
+      "acc_norm": 0.38846801346801346,
+      "acc_norm_stderr": 0.01000127604448523
+    },
+    "boolq": {
+      "acc": 0.5831804281345566,
+      "acc_stderr": 0.00862319210884368
+    },
+    "lambada_openai": {
+      "ppl": 79.45218123817662,
+      "ppl_stderr": 3.1355336623454866,
+      "acc": 0.2522802251115855,
+      "acc_stderr": 0.006050943684570117
+    },
+    "openbookqa": {
+      "acc": 0.154,
+      "acc_stderr": 0.016158285192455334,
+      "acc_norm": 0.278,
+      "acc_norm_stderr": 0.02005583388807091
+    },
+    "piqa": {
+      "acc": 0.6050054406964092,
+      "acc_stderr": 0.011405665187969021,
+      "acc_norm": 0.5897714907508161,
+      "acc_norm_stderr": 0.011476256036359109
+    },
+    "winogrande": {
+      "acc": 0.5272296764009471,
+      "acc_stderr": 0.014031631629827708
+    }
+  },
+  "versions": {
+    "arc_easy": 0,
+    "boolq": 1,
+    "lambada_openai": 0,
+    "openbookqa": 0,
+    "piqa": 0,
+    "winogrande": 0
+  },
+  "config": {
+    "model": "hf-causal-experimental",
+    "model_args": "pretrained=BEE-spoke-data/smol_llama-81M-tied,revision=main,trust_remote_code=True,dtype='float'",
+    "num_fewshot": 0,
+    "batch_size": "16",
+    "batch_sizes": [],
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/smol_llama-81M-tied-evals/json_object_2.json b/smol_llama-81M-tied-evals/json_object_2.json
new file mode 100644
index 0000000..2b93869
--- /dev/null
+++ b/smol_llama-81M-tied-evals/json_object_2.json
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "arc_challenge": {
+      "acc": 0.16723549488054607,
+      "acc_stderr": 0.01090553272460121,
+      "acc_norm": 0.22184300341296928,
+      "acc_norm_stderr": 0.012141659068147884
+    }
+  },
+  "versions": {
+    "arc_challenge": 0
+  },
+  "config": {
+    "model": "hf-causal-experimental",
+    "model_args": "pretrained=BEE-spoke-data/smol_llama-81M-tied,revision=main,trust_remote_code=True,dtype='float'",
+    "num_fewshot": 25,
+    "batch_size": "16",
+    "batch_sizes": [],
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/smol_llama-81M-tied-evals/json_object_3.json b/smol_llama-81M-tied-evals/json_object_3.json
new file mode 100644
index 0000000..f6ebf7d
--- /dev/null
+++ b/smol_llama-81M-tied-evals/json_object_3.json
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "hellaswag": {
+      "acc": 0.27450199203187253,
+      "acc_stderr": 0.008909237404005179,
+      "acc_norm": 0.28884462151394424,
+      "acc_norm_stderr": 0.009048238955347484
+    }
+  },
+  "versions": {
+    "hellaswag": 0
+  },
+  "config": {
+    "model": "hf-causal-experimental",
+    "model_args": "pretrained=BEE-spoke-data/smol_llama-81M-tied,revision=main,trust_remote_code=True,dtype='float'",
+    "num_fewshot": 10,
+    "batch_size": "16",
+    "batch_sizes": [],
+    "device": "cuda",
+    "no_cache": false,
+    "limit": 0.25,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/smol_llama-81M-tied-evals/json_object_4.json b/smol_llama-81M-tied-evals/json_object_4.json
new file mode 100644
index 0000000..5185676
--- /dev/null
+++ b/smol_llama-81M-tied-evals/json_object_4.json
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "truthfulqa_mc": {
+      "mc1": 0.2423500611995104,
+      "mc1_stderr": 0.01500067437357034,
+      "mc2": 0.4352666140818066,
+      "mc2_stderr": 0.015240603531006328
+    }
+  },
+  "versions": {
+    "truthfulqa_mc": 1
+  },
+  "config": {
+    "model": "hf-causal-experimental",
+    "model_args": "pretrained=BEE-spoke-data/smol_llama-81M-tied,revision=main,trust_remote_code=True,dtype='float'",
+    "num_fewshot": 0,
+    "batch_size": "16",
+    "batch_sizes": [],
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/smol_llama-81M-tied-evals/json_object_5.json b/smol_llama-81M-tied-evals/json_object_5.json
new file mode 100644
index 0000000..b0863dd
--- /dev/null
+++ b/smol_llama-81M-tied-evals/json_object_5.json
@@ -0,0 +1,417 @@
+{
+  "results": {
+    "hendrycksTest-abstract_algebra": {
+      "acc": 0.16,
+      "acc_stderr": 0.07483314773547882,
+      "acc_norm": 0.16,
+      "acc_norm_stderr": 0.07483314773547882
+    },
+    "hendrycksTest-anatomy": {
+      "acc": 0.12,
+      "acc_stderr": 0.066332495807108,
+      "acc_norm": 0.12,
+      "acc_norm_stderr": 0.066332495807108
+    },
+    "hendrycksTest-astronomy": {
+      "acc": 0.28,
+      "acc_stderr": 0.0916515138991168,
+      "acc_norm": 0.28,
+      "acc_norm_stderr": 0.0916515138991168
+    },
+    "hendrycksTest-business_ethics": {
+      "acc": 0.32,
+      "acc_stderr": 0.09521904571390466,
+      "acc_norm": 0.32,
+      "acc_norm_stderr": 0.09521904571390466
+    },
+    "hendrycksTest-clinical_knowledge": {
+      "acc": 0.24,
+      "acc_stderr": 0.08717797887081347,
+      "acc_norm": 0.24,
+      "acc_norm_stderr": 0.08717797887081347
+    },
+    "hendrycksTest-college_biology": {
+      "acc": 0.36,
+      "acc_stderr": 0.09797958971132711,
+      "acc_norm": 0.36,
+      "acc_norm_stderr": 0.09797958971132711
+    },
+    "hendrycksTest-college_chemistry": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277261,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277261
+    },
+    "hendrycksTest-college_computer_science": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277261,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277261
+    },
+    "hendrycksTest-college_mathematics": {
+      "acc": 0.16,
+      "acc_stderr": 0.0748331477354788,
+      "acc_norm": 0.16,
+      "acc_norm_stderr": 0.0748331477354788
+    },
+    "hendrycksTest-college_medicine": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277261,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277261
+    },
+    "hendrycksTest-college_physics": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277261,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277261
+    },
+    "hendrycksTest-computer_security": {
+      "acc": 0.44,
+      "acc_stderr": 0.10132456102380442,
+      "acc_norm": 0.44,
+      "acc_norm_stderr": 0.10132456102380442
+    },
+    "hendrycksTest-conceptual_physics": {
+      "acc": 0.24,
+      "acc_stderr": 0.08717797887081345,
+      "acc_norm": 0.24,
+      "acc_norm_stderr": 0.08717797887081345
+    },
+    "hendrycksTest-econometrics": {
+      "acc": 0.28,
+      "acc_stderr": 0.0916515138991168,
+      "acc_norm": 0.28,
+      "acc_norm_stderr": 0.0916515138991168
+    },
+    "hendrycksTest-electrical_engineering": {
+      "acc": 0.24,
+      "acc_stderr": 0.08717797887081347,
+      "acc_norm": 0.24,
+      "acc_norm_stderr": 0.08717797887081347
+    },
+    "hendrycksTest-elementary_mathematics": {
+      "acc": 0.12,
+      "acc_stderr": 0.066332495807108,
+      "acc_norm": 0.12,
+      "acc_norm_stderr": 0.066332495807108
+    },
+    "hendrycksTest-formal_logic": {
+      "acc": 0.16,
+      "acc_stderr": 0.07483314773547882,
+      "acc_norm": 0.16,
+      "acc_norm_stderr": 0.07483314773547882
+    },
+    "hendrycksTest-global_facts": {
+      "acc": 0.12,
+      "acc_stderr": 0.066332495807108,
+      "acc_norm": 0.12,
+      "acc_norm_stderr": 0.066332495807108
+    },
+    "hendrycksTest-high_school_biology": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277261,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277261
+    },
+    "hendrycksTest-high_school_chemistry": {
+      "acc": 0.32,
+      "acc_stderr": 0.09521904571390466,
+      "acc_norm": 0.32,
+      "acc_norm_stderr": 0.09521904571390466
+    },
+    "hendrycksTest-high_school_computer_science": {
+      "acc": 0.24,
+      "acc_stderr": 0.08717797887081345,
+      "acc_norm": 0.24,
+      "acc_norm_stderr": 0.08717797887081345
+    },
+    "hendrycksTest-high_school_european_history": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277261,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277261
+    },
+    "hendrycksTest-high_school_geography": {
+      "acc": 0.28,
+      "acc_stderr": 0.09165151389911678,
+      "acc_norm": 0.28,
+      "acc_norm_stderr": 0.09165151389911678
+    },
+    "hendrycksTest-high_school_government_and_politics": {
+      "acc": 0.16,
+      "acc_stderr": 0.0748331477354788,
+      "acc_norm": 0.16,
+      "acc_norm_stderr": 0.0748331477354788
+    },
+    "hendrycksTest-high_school_macroeconomics": {
+      "acc": 0.32,
+      "acc_stderr": 0.09521904571390466,
+      "acc_norm": 0.32,
+      "acc_norm_stderr": 0.09521904571390466
+    },
+    "hendrycksTest-high_school_mathematics": {
+      "acc": 0.24,
+      "acc_stderr": 0.08717797887081345,
+      "acc_norm": 0.24,
+      "acc_norm_stderr": 0.08717797887081345
+    },
+    "hendrycksTest-high_school_microeconomics": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277261,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277261
+    },
+    "hendrycksTest-high_school_physics": {
+      "acc": 0.24,
+      "acc_stderr": 0.08717797887081345,
+      "acc_norm": 0.24,
+      "acc_norm_stderr": 0.08717797887081345
+    },
+    "hendrycksTest-high_school_psychology": {
+      "acc": 0.16,
+      "acc_stderr": 0.07483314773547879,
+      "acc_norm": 0.16,
+      "acc_norm_stderr": 0.07483314773547879
+    },
+    "hendrycksTest-high_school_statistics": {
+      "acc": 0.36,
+      "acc_stderr": 0.09797958971132711,
+      "acc_norm": 0.36,
+      "acc_norm_stderr": 0.09797958971132711
+    },
+    "hendrycksTest-high_school_us_history": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277261,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277261
+    },
+    "hendrycksTest-high_school_world_history": {
+      "acc": 0.36,
+      "acc_stderr": 0.09797958971132713,
+      "acc_norm": 0.36,
+      "acc_norm_stderr": 0.09797958971132713
+    },
+    "hendrycksTest-human_aging": {
+      "acc": 0.28,
+      "acc_stderr": 0.09165151389911678,
+      "acc_norm": 0.28,
+      "acc_norm_stderr": 0.09165151389911678
+    },
+    "hendrycksTest-human_sexuality": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277262,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277262
+    },
+    "hendrycksTest-international_law": {
+      "acc": 0.24,
+      "acc_stderr": 0.08717797887081345,
+      "acc_norm": 0.24,
+      "acc_norm_stderr": 0.08717797887081345
+    },
+    "hendrycksTest-jurisprudence": {
+      "acc": 0.36,
+      "acc_stderr": 0.09797958971132711,
+      "acc_norm": 0.36,
+      "acc_norm_stderr": 0.09797958971132711
+    },
+    "hendrycksTest-logical_fallacies": {
+      "acc": 0.36,
+      "acc_stderr": 0.09797958971132711,
+      "acc_norm": 0.36,
+      "acc_norm_stderr": 0.09797958971132711
+    },
+    "hendrycksTest-machine_learning": {
+      "acc": 0.36,
+      "acc_stderr": 0.09797958971132713,
+      "acc_norm": 0.36,
+      "acc_norm_stderr": 0.09797958971132713
+    },
+    "hendrycksTest-management": {
+      "acc": 0.28,
+      "acc_stderr": 0.09165151389911677,
+      "acc_norm": 0.28,
+      "acc_norm_stderr": 0.09165151389911677
+    },
+    "hendrycksTest-marketing": {
+      "acc": 0.28,
+      "acc_stderr": 0.09165151389911678,
+      "acc_norm": 0.28,
+      "acc_norm_stderr": 0.09165151389911678
+    },
+    "hendrycksTest-medical_genetics": {
+      "acc": 0.36,
+      "acc_stderr": 0.09797958971132711,
+      "acc_norm": 0.36,
+      "acc_norm_stderr": 0.09797958971132711
+    },
+    "hendrycksTest-miscellaneous": {
+      "acc": 0.32,
+      "acc_stderr": 0.09521904571390466,
+      "acc_norm": 0.32,
+      "acc_norm_stderr": 0.09521904571390466
+    },
+    "hendrycksTest-moral_disputes": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277261,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277261
+    },
+    "hendrycksTest-moral_scenarios": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277261,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277261
+    },
+    "hendrycksTest-nutrition": {
+      "acc": 0.2,
+      "acc_stderr": 0.08164965809277261,
+      "acc_norm": 0.2,
+      "acc_norm_stderr": 0.08164965809277261
+    },
+    "hendrycksTest-philosophy": {
+      "acc": 0.24,
+      "acc_stderr": 0.08717797887081345,
+      "acc_norm": 0.24,
+      "acc_norm_stderr": 0.08717797887081345
+    },
+    "hendrycksTest-prehistory": {
+      "acc": 0.16,
+      "acc_stderr": 0.07483314773547882,
+      "acc_norm": 0.16,
+      "acc_norm_stderr": 0.07483314773547882
+    },
+    "hendrycksTest-professional_accounting": {
+      "acc": 0.12,
+      "acc_stderr": 0.066332495807108,
+      "acc_norm": 0.12,
+      "acc_norm_stderr": 0.066332495807108
+    },
+    "hendrycksTest-professional_law": {
+      "acc": 0.24,
+      "acc_stderr": 0.08717797887081347,
+      "acc_norm": 0.24,
+      "acc_norm_stderr": 0.08717797887081347
+    },
+    "hendrycksTest-professional_medicine": {
+      "acc": 0.28,
+      "acc_stderr": 0.09165151389911678,
+      "acc_norm": 0.28,
+      "acc_norm_stderr": 0.09165151389911678
+    },
+    "hendrycksTest-professional_psychology": {
+      "acc": 0.28,
+      "acc_stderr": 0.09165151389911678,
+      "acc_norm": 0.28,
+      "acc_norm_stderr": 0.09165151389911678
+    },
+    "hendrycksTest-public_relations": {
+      "acc": 0.16,
+      "acc_stderr": 0.0748331477354788,
+      "acc_norm": 0.16,
+      "acc_norm_stderr": 0.0748331477354788
+    },
+    "hendrycksTest-security_studies": {
+      "acc": 0.28,
+      "acc_stderr": 0.09165151389911678,
+      "acc_norm": 0.28,
+      "acc_norm_stderr": 0.09165151389911678
+    },
+    "hendrycksTest-sociology": {
+      "acc": 0.32,
+      "acc_stderr": 0.09521904571390467,
+      "acc_norm": 0.32,
+      "acc_norm_stderr": 0.09521904571390467
+    },
+    "hendrycksTest-us_foreign_policy": {
+      "acc": 0.4,
+      "acc_stderr": 0.10000000000000002,
+      "acc_norm": 0.4,
+      "acc_norm_stderr": 0.10000000000000002
+    },
+    "hendrycksTest-virology": {
+      "acc": 0.28,
+      "acc_stderr": 0.0916515138991168,
+      "acc_norm": 0.28,
+      "acc_norm_stderr": 0.0916515138991168
+    },
+    "hendrycksTest-world_religions": {
+      "acc": 0.28,
+      "acc_stderr": 0.0916515138991168,
+      "acc_norm": 0.28,
+      "acc_norm_stderr": 0.0916515138991168
+    }
+  },
+  "versions": {
+    "hendrycksTest-abstract_algebra": 1,
+    "hendrycksTest-anatomy": 1,
+    "hendrycksTest-astronomy": 1,
+    "hendrycksTest-business_ethics": 1,
+    "hendrycksTest-clinical_knowledge": 1,
+    "hendrycksTest-college_biology": 1,
+    "hendrycksTest-college_chemistry": 1,
+    "hendrycksTest-college_computer_science": 1,
+    "hendrycksTest-college_mathematics": 1,
+    "hendrycksTest-college_medicine": 1,
+    "hendrycksTest-college_physics": 1,
+    "hendrycksTest-computer_security": 1,
+    "hendrycksTest-conceptual_physics": 1,
+    "hendrycksTest-econometrics": 1,
+    "hendrycksTest-electrical_engineering": 1,
+    "hendrycksTest-elementary_mathematics": 1,
+    "hendrycksTest-formal_logic": 1,
+    "hendrycksTest-global_facts": 1,
+    "hendrycksTest-high_school_biology": 1,
+    "hendrycksTest-high_school_chemistry": 1,
+    "hendrycksTest-high_school_computer_science": 1,
+    "hendrycksTest-high_school_european_history": 1,
+    "hendrycksTest-high_school_geography": 1,
+    "hendrycksTest-high_school_government_and_politics": 1,
+    "hendrycksTest-high_school_macroeconomics": 1,
+    "hendrycksTest-high_school_mathematics": 1,
+    "hendrycksTest-high_school_microeconomics": 1,
+    "hendrycksTest-high_school_physics": 1,
+    "hendrycksTest-high_school_psychology": 1,
+    "hendrycksTest-high_school_statistics": 1,
+    "hendrycksTest-high_school_us_history": 1,
+    "hendrycksTest-high_school_world_history": 1,
+    "hendrycksTest-human_aging": 1,
+    "hendrycksTest-human_sexuality": 1,
+    "hendrycksTest-international_law": 1,
+    "hendrycksTest-jurisprudence": 1,
+    "hendrycksTest-logical_fallacies": 1,
+    "hendrycksTest-machine_learning": 1,
+    "hendrycksTest-management": 1,
+    "hendrycksTest-marketing": 1,
+    "hendrycksTest-medical_genetics": 1,
+    "hendrycksTest-miscellaneous": 1,
+    "hendrycksTest-moral_disputes": 1,
+    "hendrycksTest-moral_scenarios": 1,
+    "hendrycksTest-nutrition": 1,
+    "hendrycksTest-philosophy": 1,
+    "hendrycksTest-prehistory": 1,
+    "hendrycksTest-professional_accounting": 1,
+    "hendrycksTest-professional_law": 1,
+    "hendrycksTest-professional_medicine": 1,
+    "hendrycksTest-professional_psychology": 1,
+    "hendrycksTest-public_relations": 1,
+    "hendrycksTest-security_studies": 1,
+    "hendrycksTest-sociology": 1,
+    "hendrycksTest-us_foreign_policy": 1,
+    "hendrycksTest-virology": 1,
+    "hendrycksTest-world_religions": 1
+  },
+  "config": {
+    "model": "hf-causal-experimental",
+    "model_args": "pretrained=BEE-spoke-data/smol_llama-81M-tied,revision=main,trust_remote_code=True,dtype='float'",
+    "num_fewshot": 5,
+    "batch_size": "16",
+    "batch_sizes": [],
+    "device": "cuda",
+    "no_cache": false,
+    "limit": 0.25,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000..451134b
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/tokenizer.model b/tokenizer.model
new file mode 100644
index 0000000..6c00c74
--- /dev/null
+++ b/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..ae0d58b
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,42 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": true
+}