From e3b4cb1fe1a10b56361dc40b20905fba463c8c3b Mon Sep 17 00:00:00 2001
From: ModelHub XC <noreply@modelhub.org.cn>
Date: Fri, 24 Apr 2026 21:13:03 +0800
Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?=
 =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?=
 =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Model: codingmonster1234/chess-sft-modelv2
Source: Original Platform
---
 .gitattributes         |   36 +
 README.md              |   58 ++
 chat_template.jinja    |   61 ++
 config.json            |   71 ++
 generation_config.json |   12 +
 model.safetensors      |    3 +
 optimizer.pt           |    3 +
 rng_state.pth          |    3 +
 scheduler.pt           |    3 +
 tokenizer.json         |    3 +
 tokenizer_config.json  |  239 ++++++
 trainer_state.json     | 1738 ++++++++++++++++++++++++++++++++++++++++
 training_args.bin      |    3 +
 13 files changed, 2233 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 README.md
 create mode 100644 chat_template.jinja
 create mode 100644 config.json
 create mode 100644 generation_config.json
 create mode 100644 model.safetensors
 create mode 100644 optimizer.pt
 create mode 100644 rng_state.pth
 create mode 100644 scheduler.pt
 create mode 100644 tokenizer.json
 create mode 100644 tokenizer_config.json
 create mode 100644 trainer_state.json
 create mode 100644 training_args.bin

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..52373fe
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..35d801f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,58 @@
+---
+base_model: Qwen/Qwen3-4B-Instruct-2507
+library_name: transformers
+model_name: output-Qwen3-4B-Instruct-2507
+tags:
+- generated_from_trainer
+- sft
+- trl
+licence: license
+---
+
+# Model Card for output-Qwen3-4B-Instruct-2507
+
+This model is a fine-tuned version of [Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="codingmonster1234/chess-sft-modelv2", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/easwar-chess-none/chess-reasoning-v1/runs/aknx54sd) 
+
+
+
+This model was trained with SFT.
+
+### Framework versions
+
+- TRL: 0.29.1
+- Transformers: 5.4.0
+- Pytorch: 2.11.0
+- Datasets: 4.8.4
+- Tokenizers: 0.22.2
+
+## Citations
+
+
+
+Cite TRL as:
+    
+```bibtex
+@software{vonwerra2020trl,
+  title   = {{TRL: Transformers Reinforcement Learning}},
+  author  = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+  license = {Apache-2.0},
+  url     = {https://github.com/huggingface/trl},
+  year    = {2020}
+}
+```
\ No newline at end of file
diff --git a/chat_template.jinja b/chat_template.jinja
new file mode 100644
index 0000000..70adff8
--- /dev/null
+++ b/chat_template.jinja
@@ -0,0 +1,61 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..94b1c92
--- /dev/null
+++ b/config.json
@@ -0,0 +1,71 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 262144,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 5000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.4.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000..adfee01
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,12 @@
+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.4.0"
+}
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000..ceb43c5
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db62c16fc7e72b520780f950405f0befffc05eb21dea3b0429c54aaa0fac5f63
+size 8044982080
diff --git a/optimizer.pt b/optimizer.pt
new file mode 100644
index 0000000..62a14e2
--- /dev/null
+++ b/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:188874149b517f101f308cac71f71874db6f1c5e40d45d31f15d3a3ad0c7a240
+size 16090225449
diff --git a/rng_state.pth b/rng_state.pth
new file mode 100644
index 0000000..e92cf8f
--- /dev/null
+++ b/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:718a0f3db00824213036a2c0441849791319b7d9cf189065873bb26a7020738e
+size 14645
diff --git a/scheduler.pt b/scheduler.pt
new file mode 100644
index 0000000..7759360
--- /dev/null
+++ b/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc24cc580d6b93c8a95bc57b42299118656280a2fcbfb6854e41b57414837d71
+size 1465
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000..c7afbed
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..51c1be0
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,239 @@
+{
+    "add_prefix_space": false,
+    "added_tokens_decoder": {
+        "151643": {
+            "content": "<|endoftext|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151644": {
+            "content": "<|im_start|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151645": {
+            "content": "<|im_end|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151646": {
+            "content": "<|object_ref_start|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151647": {
+            "content": "<|object_ref_end|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151648": {
+            "content": "<|box_start|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151649": {
+            "content": "<|box_end|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151650": {
+            "content": "<|quad_start|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151651": {
+            "content": "<|quad_end|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151652": {
+            "content": "<|vision_start|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151653": {
+            "content": "<|vision_end|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151654": {
+            "content": "<|vision_pad|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151655": {
+            "content": "<|image_pad|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151656": {
+            "content": "<|video_pad|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "151657": {
+            "content": "<tool_call>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        },
+        "151658": {
+            "content": "</tool_call>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        },
+        "151659": {
+            "content": "<|fim_prefix|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        },
+        "151660": {
+            "content": "<|fim_middle|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        },
+        "151661": {
+            "content": "<|fim_suffix|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        },
+        "151662": {
+            "content": "<|fim_pad|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        },
+        "151663": {
+            "content": "<|repo_name|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        },
+        "151664": {
+            "content": "<|file_sep|>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        },
+        "151665": {
+            "content": "<tool_response>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        },
+        "151666": {
+            "content": "</tool_response>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        },
+        "151667": {
+            "content": "<think>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        },
+        "151668": {
+            "content": "</think>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": false
+        }
+    },
+    "additional_special_tokens": [
+        "<|im_start|>",
+        "<|im_end|>",
+        "<|object_ref_start|>",
+        "<|object_ref_end|>",
+        "<|box_start|>",
+        "<|box_end|>",
+        "<|quad_start|>",
+        "<|quad_end|>",
+        "<|vision_start|>",
+        "<|vision_end|>",
+        "<|vision_pad|>",
+        "<|image_pad|>",
+        "<|video_pad|>"
+    ],
+    "bos_token": null,
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {{- messages[0].content + '\\n\\n' }}\n    {%- endif %}\n    {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message.content is string %}\n        {%- set content = message.content %}\n    {%- else %}\n        {%- set content = '' %}\n    {%- endif %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role + '\\n' + content }}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and content) or (not loop.first) %}\n                    {{- '\\n' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<tool_call>\\n{\"name\": \"' }}\n                {{- tool_call.name }}\n                {{- '\", \"arguments\": ' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- '}\\n</tool_call>' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}",
+    "clean_up_tokenization_spaces": false,
+    "eos_token": "<|im_end|>",
+    "errors": "replace",
+    "model_max_length": 1010000,
+    "pad_token": "<|endoftext|>",
+    "split_special_tokens": false,
+    "tokenizer_class": "Qwen2Tokenizer",
+    "unk_token": null,
+    "add_bos_token": false
+}
\ No newline at end of file
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000..fdf340b
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,1738 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 168,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.0061692222952843,
+      "epoch": 0.011904761904761904,
+      "grad_norm": 3.390625,
+      "learning_rate": 2e-05,
+      "loss": 2.293125867843628,
+      "mean_token_accuracy": 0.5738132819533348,
+      "num_tokens": 29832.0,
+      "step": 1
+    },
+    {
+      "entropy": 1.385195016860962,
+      "epoch": 0.023809523809523808,
+      "grad_norm": 0.78125,
+      "learning_rate": 1.999922292480975e-05,
+      "loss": 1.5697591304779053,
+      "mean_token_accuracy": 0.6427712365984917,
+      "num_tokens": 58835.0,
+      "step": 2
+    },
+    {
+      "entropy": 1.5784537345170975,
+      "epoch": 0.03571428571428571,
+      "grad_norm": 0.62890625,
+      "learning_rate": 1.9996891820008165e-05,
+      "loss": 1.5061622858047485,
+      "mean_token_accuracy": 0.654805600643158,
+      "num_tokens": 88089.0,
+      "step": 3
+    },
+    {
+      "entropy": 1.5019408017396927,
+      "epoch": 0.047619047619047616,
+      "grad_norm": 0.43359375,
+      "learning_rate": 1.9993007047883988e-05,
+      "loss": 1.3531173467636108,
+      "mean_token_accuracy": 0.6810621172189713,
+      "num_tokens": 116996.0,
+      "step": 4
+    },
+    {
+      "entropy": 1.442432388663292,
+      "epoch": 0.05952380952380952,
+      "grad_norm": 0.369140625,
+      "learning_rate": 1.9987569212189224e-05,
+      "loss": 1.2870382070541382,
+      "mean_token_accuracy": 0.6946646422147751,
+      "num_tokens": 146502.0,
+      "step": 5
+    },
+    {
+      "entropy": 1.383298322558403,
+      "epoch": 0.07142857142857142,
+      "grad_norm": 0.318359375,
+      "learning_rate": 1.9980579158045322e-05,
+      "loss": 1.2606914043426514,
+      "mean_token_accuracy": 0.6914810612797737,
+      "num_tokens": 175000.0,
+      "step": 6
+    },
+    {
+      "entropy": 1.3554321229457855,
+      "epoch": 0.08333333333333333,
+      "grad_norm": 0.359375,
+      "learning_rate": 1.9972037971811802e-05,
+      "loss": 1.2325180768966675,
+      "mean_token_accuracy": 0.6992553323507309,
+      "num_tokens": 203581.0,
+      "step": 7
+    },
+    {
+      "entropy": 1.301919937133789,
+      "epoch": 0.09523809523809523,
+      "grad_norm": 0.3046875,
+      "learning_rate": 1.9961946980917457e-05,
+      "loss": 1.1691060066223145,
+      "mean_token_accuracy": 0.714451938867569,
+      "num_tokens": 233225.0,
+      "step": 8
+    },
+    {
+      "entropy": 1.3274528235197067,
+      "epoch": 0.10714285714285714,
+      "grad_norm": 0.296875,
+      "learning_rate": 1.9950307753654016e-05,
+      "loss": 1.22238290309906,
+      "mean_token_accuracy": 0.6991388499736786,
+      "num_tokens": 261557.0,
+      "step": 9
+    },
+    {
+      "entropy": 1.3020492941141129,
+      "epoch": 0.11904761904761904,
+      "grad_norm": 0.279296875,
+      "learning_rate": 1.9937122098932428e-05,
+      "loss": 1.1407413482666016,
+      "mean_token_accuracy": 0.7115657702088356,
+      "num_tokens": 290843.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.2911820262670517,
+      "epoch": 0.13095238095238096,
+      "grad_norm": 0.263671875,
+      "learning_rate": 1.9922392066001724e-05,
+      "loss": 1.1007871627807617,
+      "mean_token_accuracy": 0.7251745313405991,
+      "num_tokens": 320963.0,
+      "step": 11
+    },
+    {
+      "entropy": 1.305821493268013,
+      "epoch": 0.14285714285714285,
+      "grad_norm": 0.2890625,
+      "learning_rate": 1.9906119944130527e-05,
+      "loss": 1.0885382890701294,
+      "mean_token_accuracy": 0.7273061871528625,
+      "num_tokens": 350648.0,
+      "step": 12
+    },
+    {
+      "entropy": 1.3162220120429993,
+      "epoch": 0.15476190476190477,
+      "grad_norm": 0.265625,
+      "learning_rate": 1.9888308262251286e-05,
+      "loss": 1.0963213443756104,
+      "mean_token_accuracy": 0.7211973443627357,
+      "num_tokens": 380096.0,
+      "step": 13
+    },
+    {
+      "entropy": 1.3141592741012573,
+      "epoch": 0.16666666666666666,
+      "grad_norm": 0.2734375,
+      "learning_rate": 1.9868959788567213e-05,
+      "loss": 1.0897754430770874,
+      "mean_token_accuracy": 0.7258400693535805,
+      "num_tokens": 407435.0,
+      "step": 14
+    },
+    {
+      "entropy": 1.3073242455720901,
+      "epoch": 0.17857142857142858,
+      "grad_norm": 0.2578125,
+      "learning_rate": 1.9848077530122083e-05,
+      "loss": 1.0494160652160645,
+      "mean_token_accuracy": 0.7293207123875618,
+      "num_tokens": 435734.0,
+      "step": 15
+    },
+    {
+      "entropy": 1.3367096036672592,
+      "epoch": 0.19047619047619047,
+      "grad_norm": 0.255859375,
+      "learning_rate": 1.9825664732332886e-05,
+      "loss": 1.1211317777633667,
+      "mean_token_accuracy": 0.7143202275037766,
+      "num_tokens": 464973.0,
+      "step": 16
+    },
+    {
+      "entropy": 1.3097643703222275,
+      "epoch": 0.20238095238095238,
+      "grad_norm": 0.25,
+      "learning_rate": 1.9801724878485438e-05,
+      "loss": 1.0753662586212158,
+      "mean_token_accuracy": 0.7259641215205193,
+      "num_tokens": 493135.0,
+      "step": 17
+    },
+    {
+      "entropy": 1.2622702419757843,
+      "epoch": 0.21428571428571427,
+      "grad_norm": 0.232421875,
+      "learning_rate": 1.977626168919305e-05,
+      "loss": 1.007223129272461,
+      "mean_token_accuracy": 0.744126707315445,
+      "num_tokens": 522656.0,
+      "step": 18
+    },
+    {
+      "entropy": 1.2859665155410767,
+      "epoch": 0.2261904761904762,
+      "grad_norm": 0.23046875,
+      "learning_rate": 1.9749279121818235e-05,
+      "loss": 1.0457340478897095,
+      "mean_token_accuracy": 0.7328037023544312,
+      "num_tokens": 551875.0,
+      "step": 19
+    },
+    {
+      "entropy": 1.275212675333023,
+      "epoch": 0.23809523809523808,
+      "grad_norm": 0.251953125,
+      "learning_rate": 1.9720781369857747e-05,
+      "loss": 1.0395888090133667,
+      "mean_token_accuracy": 0.7307759299874306,
+      "num_tokens": 580523.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.3000101447105408,
+      "epoch": 0.25,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 1.969077286229078e-05,
+      "loss": 1.0626932382583618,
+      "mean_token_accuracy": 0.7271415144205093,
+      "num_tokens": 609771.0,
+      "step": 21
+    },
+    {
+      "entropy": 1.242678239941597,
+      "epoch": 0.2619047619047619,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 1.9659258262890683e-05,
+      "loss": 0.9827122092247009,
+      "mean_token_accuracy": 0.7448626458644867,
+      "num_tokens": 639104.0,
+      "step": 22
+    },
+    {
+      "entropy": 1.2583424746990204,
+      "epoch": 0.27380952380952384,
+      "grad_norm": 0.228515625,
+      "learning_rate": 1.962624246950012e-05,
+      "loss": 1.0062870979309082,
+      "mean_token_accuracy": 0.7375933676958084,
+      "num_tokens": 667792.0,
+      "step": 23
+    },
+    {
+      "entropy": 1.2531014680862427,
+      "epoch": 0.2857142857142857,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 1.9591730613269878e-05,
+      "loss": 1.0229589939117432,
+      "mean_token_accuracy": 0.7366377785801888,
+      "num_tokens": 696742.0,
+      "step": 24
+    },
+    {
+      "entropy": 1.2342166602611542,
+      "epoch": 0.2976190476190476,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 1.955572805786141e-05,
+      "loss": 0.9788997769355774,
+      "mean_token_accuracy": 0.7421486154198647,
+      "num_tokens": 725968.0,
+      "step": 25
+    },
+    {
+      "entropy": 1.2210585623979568,
+      "epoch": 0.30952380952380953,
+      "grad_norm": 0.2578125,
+      "learning_rate": 1.9518240398613226e-05,
+      "loss": 0.987277090549469,
+      "mean_token_accuracy": 0.7420973554253578,
+      "num_tokens": 755689.0,
+      "step": 26
+    },
+    {
+      "entropy": 1.24309404194355,
+      "epoch": 0.32142857142857145,
+      "grad_norm": 0.2421875,
+      "learning_rate": 1.947927346167132e-05,
+      "loss": 1.0301053524017334,
+      "mean_token_accuracy": 0.7300752699375153,
+      "num_tokens": 784977.0,
+      "step": 27
+    },
+    {
+      "entropy": 1.2028213143348694,
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 1.9438833303083677e-05,
+      "loss": 0.9393562078475952,
+      "mean_token_accuracy": 0.7491495907306671,
+      "num_tokens": 814048.0,
+      "step": 28
+    },
+    {
+      "entropy": 1.2287103980779648,
+      "epoch": 0.34523809523809523,
+      "grad_norm": 0.228515625,
+      "learning_rate": 1.9396926207859085e-05,
+      "loss": 1.0168366432189941,
+      "mean_token_accuracy": 0.7329602986574173,
+      "num_tokens": 843602.0,
+      "step": 29
+    },
+    {
+      "entropy": 1.2081626951694489,
+      "epoch": 0.35714285714285715,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 1.935355868899034e-05,
+      "loss": 0.958310604095459,
+      "mean_token_accuracy": 0.7456908002495766,
+      "num_tokens": 871915.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.2221457809209824,
+      "epoch": 0.36904761904761907,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 1.9308737486442045e-05,
+      "loss": 0.9946644902229309,
+      "mean_token_accuracy": 0.7383344992995262,
+      "num_tokens": 900851.0,
+      "step": 31
+    },
+    {
+      "entropy": 1.1801428943872452,
+      "epoch": 0.38095238095238093,
+      "grad_norm": 0.21484375,
+      "learning_rate": 1.926246956610309e-05,
+      "loss": 0.9103766083717346,
+      "mean_token_accuracy": 0.7624464929103851,
+      "num_tokens": 929498.0,
+      "step": 32
+    },
+    {
+      "entropy": 1.2152698189020157,
+      "epoch": 0.39285714285714285,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 1.921476211870408e-05,
+      "loss": 0.9737407565116882,
+      "mean_token_accuracy": 0.7427262291312218,
+      "num_tokens": 958933.0,
+      "step": 33
+    },
+    {
+      "entropy": 1.2030568569898605,
+      "epoch": 0.40476190476190477,
+      "grad_norm": 0.22265625,
+      "learning_rate": 1.9165622558699763e-05,
+      "loss": 0.9593278169631958,
+      "mean_token_accuracy": 0.7506603300571442,
+      "num_tokens": 987731.0,
+      "step": 34
+    },
+    {
+      "entropy": 1.1957021951675415,
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 1.9115058523116734e-05,
+      "loss": 0.9239043593406677,
+      "mean_token_accuracy": 0.7555749863386154,
+      "num_tokens": 1017002.0,
+      "step": 35
+    },
+    {
+      "entropy": 1.2133885324001312,
+      "epoch": 0.42857142857142855,
+      "grad_norm": 0.216796875,
+      "learning_rate": 1.9063077870366504e-05,
+      "loss": 0.9809866547584534,
+      "mean_token_accuracy": 0.7437998279929161,
+      "num_tokens": 1046678.0,
+      "step": 36
+    },
+    {
+      "entropy": 1.2098581492900848,
+      "epoch": 0.44047619047619047,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 1.900968867902419e-05,
+      "loss": 0.938984215259552,
+      "mean_token_accuracy": 0.7494841367006302,
+      "num_tokens": 1074445.0,
+      "step": 37
+    },
+    {
+      "entropy": 1.1815967112779617,
+      "epoch": 0.4523809523809524,
+      "grad_norm": 0.236328125,
+      "learning_rate": 1.895489924657301e-05,
+      "loss": 0.8934326767921448,
+      "mean_token_accuracy": 0.7595476359128952,
+      "num_tokens": 1103620.0,
+      "step": 38
+    },
+    {
+      "entropy": 1.2028009444475174,
+      "epoch": 0.4642857142857143,
+      "grad_norm": 0.2265625,
+      "learning_rate": 1.8898718088114688e-05,
+      "loss": 0.922984778881073,
+      "mean_token_accuracy": 0.7540801167488098,
+      "num_tokens": 1132637.0,
+      "step": 39
+    },
+    {
+      "entropy": 1.2034422308206558,
+      "epoch": 0.47619047619047616,
+      "grad_norm": 1.171875,
+      "learning_rate": 1.8841153935046098e-05,
+      "loss": 0.9033240675926208,
+      "mean_token_accuracy": 0.7560576424002647,
+      "num_tokens": 1161527.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.1716476827859879,
+      "epoch": 0.4880952380952381,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 1.8782215733702286e-05,
+      "loss": 0.8880018591880798,
+      "mean_token_accuracy": 0.7613470479846001,
+      "num_tokens": 1190701.0,
+      "step": 41
+    },
+    {
+      "entropy": 1.2157341986894608,
+      "epoch": 0.5,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 1.8721912643966055e-05,
+      "loss": 0.9609653949737549,
+      "mean_token_accuracy": 0.7453824803233147,
+      "num_tokens": 1218835.0,
+      "step": 42
+    },
+    {
+      "entropy": 1.197568565607071,
+      "epoch": 0.5119047619047619,
+      "grad_norm": 0.216796875,
+      "learning_rate": 1.866025403784439e-05,
+      "loss": 0.9219189882278442,
+      "mean_token_accuracy": 0.7547592371702194,
+      "num_tokens": 1248679.0,
+      "step": 43
+    },
+    {
+      "entropy": 1.1708803623914719,
+      "epoch": 0.5238095238095238,
+      "grad_norm": 0.373046875,
+      "learning_rate": 1.8597249498011906e-05,
+      "loss": 0.8802202343940735,
+      "mean_token_accuracy": 0.7667126134037971,
+      "num_tokens": 1277106.0,
+      "step": 44
+    },
+    {
+      "entropy": 1.191767856478691,
+      "epoch": 0.5357142857142857,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 1.8532908816321557e-05,
+      "loss": 0.9313769936561584,
+      "mean_token_accuracy": 0.7529165670275688,
+      "num_tokens": 1305983.0,
+      "step": 45
+    },
+    {
+      "entropy": 1.2066084146499634,
+      "epoch": 0.5476190476190477,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 1.8467241992282842e-05,
+      "loss": 0.9347527027130127,
+      "mean_token_accuracy": 0.7446473762392998,
+      "num_tokens": 1334578.0,
+      "step": 46
+    },
+    {
+      "entropy": 1.177584484219551,
+      "epoch": 0.5595238095238095,
+      "grad_norm": 0.25,
+      "learning_rate": 1.8400259231507716e-05,
+      "loss": 0.8884726166725159,
+      "mean_token_accuracy": 0.7611024901270866,
+      "num_tokens": 1362873.0,
+      "step": 47
+    },
+    {
+      "entropy": 1.1629594564437866,
+      "epoch": 0.5714285714285714,
+      "grad_norm": 0.2265625,
+      "learning_rate": 1.833197094412449e-05,
+      "loss": 0.8861435651779175,
+      "mean_token_accuracy": 0.76307063549757,
+      "num_tokens": 1391315.0,
+      "step": 48
+    },
+    {
+      "entropy": 1.168922871351242,
+      "epoch": 0.5833333333333334,
+      "grad_norm": 0.23046875,
+      "learning_rate": 1.826238774315995e-05,
+      "loss": 0.8765286207199097,
+      "mean_token_accuracy": 0.76119015365839,
+      "num_tokens": 1419829.0,
+      "step": 49
+    },
+    {
+      "entropy": 1.1843004375696182,
+      "epoch": 0.5952380952380952,
+      "grad_norm": 0.234375,
+      "learning_rate": 1.819152044288992e-05,
+      "loss": 0.9242440462112427,
+      "mean_token_accuracy": 0.7494527697563171,
+      "num_tokens": 1447790.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.1673331260681152,
+      "epoch": 0.6071428571428571,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 1.811938005715857e-05,
+      "loss": 0.8822228312492371,
+      "mean_token_accuracy": 0.7585421577095985,
+      "num_tokens": 1476278.0,
+      "step": 51
+    },
+    {
+      "entropy": 1.2116869688034058,
+      "epoch": 0.6190476190476191,
+      "grad_norm": 0.2421875,
+      "learning_rate": 1.8045977797666685e-05,
+      "loss": 0.9784308671951294,
+      "mean_token_accuracy": 0.7404012456536293,
+      "num_tokens": 1503947.0,
+      "step": 52
+    },
+    {
+      "entropy": 1.162365809082985,
+      "epoch": 0.6309523809523809,
+      "grad_norm": 0.2265625,
+      "learning_rate": 1.7971325072229227e-05,
+      "loss": 0.9283543825149536,
+      "mean_token_accuracy": 0.7499738857150078,
+      "num_tokens": 1533531.0,
+      "step": 53
+    },
+    {
+      "entropy": 1.1863622218370438,
+      "epoch": 0.6428571428571429,
+      "grad_norm": 0.2421875,
+      "learning_rate": 1.7895433483002356e-05,
+      "loss": 0.9471738934516907,
+      "mean_token_accuracy": 0.7532860413193703,
+      "num_tokens": 1561412.0,
+      "step": 54
+    },
+    {
+      "entropy": 1.1698070168495178,
+      "epoch": 0.6547619047619048,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 1.78183148246803e-05,
+      "loss": 0.9019606709480286,
+      "mean_token_accuracy": 0.7543124184012413,
+      "num_tokens": 1590336.0,
+      "step": 55
+    },
+    {
+      "entropy": 1.1683688312768936,
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.208984375,
+      "learning_rate": 1.7739981082662275e-05,
+      "loss": 0.9020405411720276,
+      "mean_token_accuracy": 0.7580606490373611,
+      "num_tokens": 1620442.0,
+      "step": 56
+    },
+    {
+      "entropy": 1.1867523938417435,
+      "epoch": 0.6785714285714286,
+      "grad_norm": 0.216796875,
+      "learning_rate": 1.766044443118978e-05,
+      "loss": 0.917300283908844,
+      "mean_token_accuracy": 0.7553394213318825,
+      "num_tokens": 1648762.0,
+      "step": 57
+    },
+    {
+      "entropy": 1.1505564451217651,
+      "epoch": 0.6904761904761905,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 1.757971723145453e-05,
+      "loss": 0.8627029061317444,
+      "mean_token_accuracy": 0.7657916098833084,
+      "num_tokens": 1677464.0,
+      "step": 58
+    },
+    {
+      "entropy": 1.1766629666090012,
+      "epoch": 0.7023809523809523,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 1.7497812029677344e-05,
+      "loss": 0.8795939087867737,
+      "mean_token_accuracy": 0.7613174989819527,
+      "num_tokens": 1704994.0,
+      "step": 59
+    },
+    {
+      "entropy": 1.1731744706630707,
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 1.741474155515827e-05,
+      "loss": 0.8988810777664185,
+      "mean_token_accuracy": 0.7579676881432533,
+      "num_tokens": 1734202.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.1697156727313995,
+      "epoch": 0.7261904761904762,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 1.7330518718298263e-05,
+      "loss": 0.9070097804069519,
+      "mean_token_accuracy": 0.7564781159162521,
+      "num_tokens": 1763541.0,
+      "step": 61
+    },
+    {
+      "entropy": 1.1686383485794067,
+      "epoch": 0.7380952380952381,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 1.7245156608592727e-05,
+      "loss": 0.8804867267608643,
+      "mean_token_accuracy": 0.7639917582273483,
+      "num_tokens": 1793196.0,
+      "step": 62
+    },
+    {
+      "entropy": 1.195967510342598,
+      "epoch": 0.75,
+      "grad_norm": 0.25390625,
+      "learning_rate": 1.7158668492597186e-05,
+      "loss": 0.9389015436172485,
+      "mean_token_accuracy": 0.747251845896244,
+      "num_tokens": 1821023.0,
+      "step": 63
+    },
+    {
+      "entropy": 1.1664810329675674,
+      "epoch": 0.7619047619047619,
+      "grad_norm": 0.21875,
+      "learning_rate": 1.7071067811865477e-05,
+      "loss": 0.9056146740913391,
+      "mean_token_accuracy": 0.7550350353121758,
+      "num_tokens": 1849586.0,
+      "step": 64
+    },
+    {
+      "entropy": 1.171183928847313,
+      "epoch": 0.7738095238095238,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 1.698236818086073e-05,
+      "loss": 0.929341197013855,
+      "mean_token_accuracy": 0.7491638883948326,
+      "num_tokens": 1878622.0,
+      "step": 65
+    },
+    {
+      "entropy": 1.1465008854866028,
+      "epoch": 0.7857142857142857,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 1.689258338483947e-05,
+      "loss": 0.8692110776901245,
+      "mean_token_accuracy": 0.765314869582653,
+      "num_tokens": 1907725.0,
+      "step": 66
+    },
+    {
+      "entropy": 1.1706128865480423,
+      "epoch": 0.7976190476190477,
+      "grad_norm": 0.216796875,
+      "learning_rate": 1.6801727377709195e-05,
+      "loss": 0.886278510093689,
+      "mean_token_accuracy": 0.7576193287968636,
+      "num_tokens": 1936209.0,
+      "step": 67
+    },
+    {
+      "entropy": 1.1479064524173737,
+      "epoch": 0.8095238095238095,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 1.67098142798597e-05,
+      "loss": 0.8587610125541687,
+      "mean_token_accuracy": 0.7682890966534615,
+      "num_tokens": 1964915.0,
+      "step": 68
+    },
+    {
+      "entropy": 1.1495172083377838,
+      "epoch": 0.8214285714285714,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 1.6616858375968596e-05,
+      "loss": 0.8885282874107361,
+      "mean_token_accuracy": 0.7598370909690857,
+      "num_tokens": 1993606.0,
+      "step": 69
+    },
+    {
+      "entropy": 1.1534761786460876,
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 1.6522874112781213e-05,
+      "loss": 0.8863916993141174,
+      "mean_token_accuracy": 0.7640347108244896,
+      "num_tokens": 2022472.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.14171202480793,
+      "epoch": 0.8452380952380952,
+      "grad_norm": 0.220703125,
+      "learning_rate": 1.6427876096865394e-05,
+      "loss": 0.8785849809646606,
+      "mean_token_accuracy": 0.7604316994547844,
+      "num_tokens": 2052746.0,
+      "step": 71
+    },
+    {
+      "entropy": 1.1478676050901413,
+      "epoch": 0.8571428571428571,
+      "grad_norm": 0.212890625,
+      "learning_rate": 1.6331879092341402e-05,
+      "loss": 0.8796285390853882,
+      "mean_token_accuracy": 0.7586944848299026,
+      "num_tokens": 2081889.0,
+      "step": 72
+    },
+    {
+      "entropy": 1.1222540885210037,
+      "epoch": 0.8690476190476191,
+      "grad_norm": 0.193359375,
+      "learning_rate": 1.6234898018587336e-05,
+      "loss": 0.8146858811378479,
+      "mean_token_accuracy": 0.7756616845726967,
+      "num_tokens": 2111616.0,
+      "step": 73
+    },
+    {
+      "entropy": 1.153001144528389,
+      "epoch": 0.8809523809523809,
+      "grad_norm": 0.224609375,
+      "learning_rate": 1.6136947947920477e-05,
+      "loss": 0.8884707689285278,
+      "mean_token_accuracy": 0.7565625682473183,
+      "num_tokens": 2140433.0,
+      "step": 74
+    },
+    {
+      "entropy": 1.1275182217359543,
+      "epoch": 0.8928571428571429,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 1.6038044103254775e-05,
+      "loss": 0.8272450566291809,
+      "mean_token_accuracy": 0.7704622000455856,
+      "num_tokens": 2170414.0,
+      "step": 75
+    },
+    {
+      "entropy": 1.1576026529073715,
+      "epoch": 0.9047619047619048,
+      "grad_norm": 0.22265625,
+      "learning_rate": 1.5938201855735017e-05,
+      "loss": 0.9035623669624329,
+      "mean_token_accuracy": 0.7542874589562416,
+      "num_tokens": 2198868.0,
+      "step": 76
+    },
+    {
+      "entropy": 1.1199318170547485,
+      "epoch": 0.9166666666666666,
+      "grad_norm": 0.20703125,
+      "learning_rate": 1.5837436722347902e-05,
+      "loss": 0.8039325475692749,
+      "mean_token_accuracy": 0.783287987112999,
+      "num_tokens": 2228134.0,
+      "step": 77
+    },
+    {
+      "entropy": 1.1484037339687347,
+      "epoch": 0.9285714285714286,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 1.573576436351046e-05,
+      "loss": 0.8699290752410889,
+      "mean_token_accuracy": 0.7641323357820511,
+      "num_tokens": 2257447.0,
+      "step": 78
+    },
+    {
+      "entropy": 1.1295416802167892,
+      "epoch": 0.9404761904761905,
+      "grad_norm": 0.205078125,
+      "learning_rate": 1.563320058063622e-05,
+      "loss": 0.8303874731063843,
+      "mean_token_accuracy": 0.7720286920666695,
+      "num_tokens": 2286749.0,
+      "step": 79
+    },
+    {
+      "entropy": 1.1563286185264587,
+      "epoch": 0.9523809523809523,
+      "grad_norm": 0.21875,
+      "learning_rate": 1.5529761313679396e-05,
+      "loss": 0.8524646759033203,
+      "mean_token_accuracy": 0.7633371129631996,
+      "num_tokens": 2315039.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.1543449014425278,
+      "epoch": 0.9642857142857143,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 1.5425462638657597e-05,
+      "loss": 0.9120794534683228,
+      "mean_token_accuracy": 0.756316527724266,
+      "num_tokens": 2344737.0,
+      "step": 81
+    },
+    {
+      "entropy": 1.13828843832016,
+      "epoch": 0.9761904761904762,
+      "grad_norm": 0.2265625,
+      "learning_rate": 1.5320320765153367e-05,
+      "loss": 0.824118971824646,
+      "mean_token_accuracy": 0.7736462280154228,
+      "num_tokens": 2373710.0,
+      "step": 82
+    },
+    {
+      "entropy": 1.145560473203659,
+      "epoch": 0.9880952380952381,
+      "grad_norm": 0.220703125,
+      "learning_rate": 1.5214352033794981e-05,
+      "loss": 0.8729808926582336,
+      "mean_token_accuracy": 0.7629412487149239,
+      "num_tokens": 2402610.0,
+      "step": 83
+    },
+    {
+      "entropy": 1.1476428806781769,
+      "epoch": 1.0,
+      "grad_norm": 0.22265625,
+      "learning_rate": 1.5107572913716859e-05,
+      "loss": 0.8972144722938538,
+      "mean_token_accuracy": 0.757901057600975,
+      "num_tokens": 2430019.0,
+      "step": 84
+    },
+    {
+      "epoch": 1.0,
+      "eval_entropy": 1.1429666471481323,
+      "eval_loss": 0.8658801317214966,
+      "eval_mean_token_accuracy": 0.7630383356412251,
+      "eval_model_preparation_time": 0.0051,
+      "eval_num_tokens": 2430019.0,
+      "eval_runtime": 19.169,
+      "eval_samples_per_second": 7.825,
+      "eval_steps_per_second": 7.825,
+      "step": 84
+    },
+    {
+      "entropy": 1.1193113178014755,
+      "epoch": 1.0119047619047619,
+      "grad_norm": 0.205078125,
+      "learning_rate": 1.5000000000000002e-05,
+      "loss": 0.8069751858711243,
+      "mean_token_accuracy": 0.7752386555075645,
+      "num_tokens": 2459653.0,
+      "step": 85
+    },
+    {
+      "entropy": 1.14054836332798,
+      "epoch": 1.0238095238095237,
+      "grad_norm": 0.2109375,
+      "learning_rate": 1.4891650011092896e-05,
+      "loss": 0.8288445472717285,
+      "mean_token_accuracy": 0.7729767188429832,
+      "num_tokens": 2488217.0,
+      "step": 86
+    },
+    {
+      "entropy": 1.1414664089679718,
+      "epoch": 1.0357142857142858,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 1.4782539786213184e-05,
+      "loss": 0.8254880905151367,
+      "mean_token_accuracy": 0.7727913111448288,
+      "num_tokens": 2517578.0,
+      "step": 87
+    },
+    {
+      "entropy": 1.1179616451263428,
+      "epoch": 1.0476190476190477,
+      "grad_norm": 0.205078125,
+      "learning_rate": 1.4672686282730622e-05,
+      "loss": 0.8098872303962708,
+      "mean_token_accuracy": 0.7769448384642601,
+      "num_tokens": 2546116.0,
+      "step": 88
+    },
+    {
+      "entropy": 1.1239117681980133,
+      "epoch": 1.0595238095238095,
+      "grad_norm": 0.318359375,
+      "learning_rate": 1.4562106573531632e-05,
+      "loss": 0.8263017535209656,
+      "mean_token_accuracy": 0.7758133932948112,
+      "num_tokens": 2574681.0,
+      "step": 89
+    },
+    {
+      "entropy": 1.1026111543178558,
+      "epoch": 1.0714285714285714,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 1.4450817844365924e-05,
+      "loss": 0.8099116086959839,
+      "mean_token_accuracy": 0.7731629684567451,
+      "num_tokens": 2603807.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.1024491339921951,
+      "epoch": 1.0833333333333333,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 1.4338837391175582e-05,
+      "loss": 0.8093633055686951,
+      "mean_token_accuracy": 0.7739714533090591,
+      "num_tokens": 2632614.0,
+      "step": 91
+    },
+    {
+      "entropy": 1.1085499972105026,
+      "epoch": 1.0952380952380953,
+      "grad_norm": 0.216796875,
+      "learning_rate": 1.4226182617406996e-05,
+      "loss": 0.8473532199859619,
+      "mean_token_accuracy": 0.7683232203125954,
+      "num_tokens": 2661538.0,
+      "step": 92
+    },
+    {
+      "entropy": 1.0892803370952606,
+      "epoch": 1.1071428571428572,
+      "grad_norm": 0.220703125,
+      "learning_rate": 1.4112871031306118e-05,
+      "loss": 0.8294469118118286,
+      "mean_token_accuracy": 0.7713945508003235,
+      "num_tokens": 2690777.0,
+      "step": 93
+    },
+    {
+      "entropy": 1.1031535863876343,
+      "epoch": 1.119047619047619,
+      "grad_norm": 0.224609375,
+      "learning_rate": 1.3998920243197408e-05,
+      "loss": 0.8391809463500977,
+      "mean_token_accuracy": 0.7676805257797241,
+      "num_tokens": 2719730.0,
+      "step": 94
+    },
+    {
+      "entropy": 1.0815589874982834,
+      "epoch": 1.130952380952381,
+      "grad_norm": 0.21484375,
+      "learning_rate": 1.3884347962746949e-05,
+      "loss": 0.7862935066223145,
+      "mean_token_accuracy": 0.7806214541196823,
+      "num_tokens": 2749156.0,
+      "step": 95
+    },
+    {
+      "entropy": 1.084671527147293,
+      "epoch": 1.1428571428571428,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 1.3769171996210053e-05,
+      "loss": 0.840523898601532,
+      "mean_token_accuracy": 0.7695459797978401,
+      "num_tokens": 2778531.0,
+      "step": 96
+    },
+    {
+      "entropy": 1.0894652903079987,
+      "epoch": 1.1547619047619047,
+      "grad_norm": 0.216796875,
+      "learning_rate": 1.3653410243663953e-05,
+      "loss": 0.7974240779876709,
+      "mean_token_accuracy": 0.7744667157530785,
+      "num_tokens": 2806462.0,
+      "step": 97
+    },
+    {
+      "entropy": 1.0971969813108444,
+      "epoch": 1.1666666666666667,
+      "grad_norm": 0.22265625,
+      "learning_rate": 1.3537080696225815e-05,
+      "loss": 0.8246796131134033,
+      "mean_token_accuracy": 0.7684177905321121,
+      "num_tokens": 2835497.0,
+      "step": 98
+    },
+    {
+      "entropy": 1.1123791635036469,
+      "epoch": 1.1785714285714286,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 1.342020143325669e-05,
+      "loss": 0.8859103322029114,
+      "mean_token_accuracy": 0.7534352988004684,
+      "num_tokens": 2865231.0,
+      "step": 99
+    },
+    {
+      "entropy": 1.075607344508171,
+      "epoch": 1.1904761904761905,
+      "grad_norm": 0.2265625,
+      "learning_rate": 1.3302790619551673e-05,
+      "loss": 0.7980949878692627,
+      "mean_token_accuracy": 0.7762870118021965,
+      "num_tokens": 2894329.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.1072215735912323,
+      "epoch": 1.2023809523809523,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 1.3184866502516846e-05,
+      "loss": 0.8650733232498169,
+      "mean_token_accuracy": 0.764843761920929,
+      "num_tokens": 2923660.0,
+      "step": 101
+    },
+    {
+      "entropy": 1.0887151509523392,
+      "epoch": 1.2142857142857142,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 1.3066447409333345e-05,
+      "loss": 0.790311336517334,
+      "mean_token_accuracy": 0.7792445793747902,
+      "num_tokens": 2952054.0,
+      "step": 102
+    },
+    {
+      "entropy": 1.1025346666574478,
+      "epoch": 1.2261904761904763,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 1.2947551744109044e-05,
+      "loss": 0.8180376887321472,
+      "mean_token_accuracy": 0.7729773372411728,
+      "num_tokens": 2981426.0,
+      "step": 103
+    },
+    {
+      "entropy": 1.0916212499141693,
+      "epoch": 1.2380952380952381,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 1.2828197985018276e-05,
+      "loss": 0.7971659898757935,
+      "mean_token_accuracy": 0.7799450904130936,
+      "num_tokens": 3009579.0,
+      "step": 104
+    },
+    {
+      "entropy": 1.1104163080453873,
+      "epoch": 1.25,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 1.2708404681430054e-05,
+      "loss": 0.8455361127853394,
+      "mean_token_accuracy": 0.7681760489940643,
+      "num_tokens": 3038292.0,
+      "step": 105
+    },
+    {
+      "entropy": 1.1180581152439117,
+      "epoch": 1.2619047619047619,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 1.2588190451025209e-05,
+      "loss": 0.8946309685707092,
+      "mean_token_accuracy": 0.755538322031498,
+      "num_tokens": 3068231.0,
+      "step": 106
+    },
+    {
+      "entropy": 1.0994994044303894,
+      "epoch": 1.2738095238095237,
+      "grad_norm": 0.265625,
+      "learning_rate": 1.2467573976902936e-05,
+      "loss": 0.7855837345123291,
+      "mean_token_accuracy": 0.7798345908522606,
+      "num_tokens": 3096640.0,
+      "step": 107
+    },
+    {
+      "entropy": 1.0958448350429535,
+      "epoch": 1.2857142857142856,
+      "grad_norm": 0.22265625,
+      "learning_rate": 1.2346574004677154e-05,
+      "loss": 0.8080664277076721,
+      "mean_token_accuracy": 0.775592751801014,
+      "num_tokens": 3125619.0,
+      "step": 108
+    },
+    {
+      "entropy": 1.1057351678609848,
+      "epoch": 1.2976190476190477,
+      "grad_norm": 0.375,
+      "learning_rate": 1.2225209339563144e-05,
+      "loss": 0.8222600817680359,
+      "mean_token_accuracy": 0.7683183401823044,
+      "num_tokens": 3155256.0,
+      "step": 109
+    },
+    {
+      "entropy": 1.1132191121578217,
+      "epoch": 1.3095238095238095,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 1.210349884345496e-05,
+      "loss": 0.8248376250267029,
+      "mean_token_accuracy": 0.7687205746769905,
+      "num_tokens": 3183948.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.0987165123224258,
+      "epoch": 1.3214285714285714,
+      "grad_norm": 0.31640625,
+      "learning_rate": 1.1981461431993978e-05,
+      "loss": 0.8191619515419006,
+      "mean_token_accuracy": 0.772399052977562,
+      "num_tokens": 3212463.0,
+      "step": 111
+    },
+    {
+      "entropy": 1.1073571592569351,
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.232421875,
+      "learning_rate": 1.1859116071629148e-05,
+      "loss": 0.8318334221839905,
+      "mean_token_accuracy": 0.7649757117033005,
+      "num_tokens": 3241487.0,
+      "step": 112
+    },
+    {
+      "entropy": 1.102282091975212,
+      "epoch": 1.3452380952380953,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 1.1736481776669307e-05,
+      "loss": 0.8375995755195618,
+      "mean_token_accuracy": 0.7682436108589172,
+      "num_tokens": 3270436.0,
+      "step": 113
+    },
+    {
+      "entropy": 1.0837299078702927,
+      "epoch": 1.3571428571428572,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 1.1613577606328068e-05,
+      "loss": 0.7833430767059326,
+      "mean_token_accuracy": 0.7823601812124252,
+      "num_tokens": 3299814.0,
+      "step": 114
+    },
+    {
+      "entropy": 1.0879952907562256,
+      "epoch": 1.369047619047619,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 1.1490422661761744e-05,
+      "loss": 0.7993915677070618,
+      "mean_token_accuracy": 0.7771986275911331,
+      "num_tokens": 3328509.0,
+      "step": 115
+    },
+    {
+      "entropy": 1.112231805920601,
+      "epoch": 1.380952380952381,
+      "grad_norm": 0.224609375,
+      "learning_rate": 1.1367036083100735e-05,
+      "loss": 0.8307598233222961,
+      "mean_token_accuracy": 0.7695401236414909,
+      "num_tokens": 3356953.0,
+      "step": 116
+    },
+    {
+      "entropy": 1.0973141938447952,
+      "epoch": 1.3928571428571428,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 1.1243437046474854e-05,
+      "loss": 0.8001049757003784,
+      "mean_token_accuracy": 0.7750882878899574,
+      "num_tokens": 3385659.0,
+      "step": 117
+    },
+    {
+      "entropy": 1.1106764674186707,
+      "epoch": 1.4047619047619047,
+      "grad_norm": 0.228515625,
+      "learning_rate": 1.1119644761033079e-05,
+      "loss": 0.820791482925415,
+      "mean_token_accuracy": 0.7748995646834373,
+      "num_tokens": 3414046.0,
+      "step": 118
+    },
+    {
+      "entropy": 1.0989094227552414,
+      "epoch": 1.4166666666666667,
+      "grad_norm": 0.26171875,
+      "learning_rate": 1.0995678465958168e-05,
+      "loss": 0.8132579326629639,
+      "mean_token_accuracy": 0.7685609012842178,
+      "num_tokens": 3442153.0,
+      "step": 119
+    },
+    {
+      "entropy": 1.123728185892105,
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.2265625,
+      "learning_rate": 1.0871557427476585e-05,
+      "loss": 0.8655298948287964,
+      "mean_token_accuracy": 0.7622044086456299,
+      "num_tokens": 3471630.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.1011102497577667,
+      "epoch": 1.4404761904761905,
+      "grad_norm": 0.21875,
+      "learning_rate": 1.0747300935864245e-05,
+      "loss": 0.8160438537597656,
+      "mean_token_accuracy": 0.7715617045760155,
+      "num_tokens": 3500341.0,
+      "step": 121
+    },
+    {
+      "entropy": 1.0802496522665024,
+      "epoch": 1.4523809523809523,
+      "grad_norm": 0.22265625,
+      "learning_rate": 1.0622928302448523e-05,
+      "loss": 0.795846700668335,
+      "mean_token_accuracy": 0.7745838463306427,
+      "num_tokens": 3530737.0,
+      "step": 122
+    },
+    {
+      "entropy": 1.111521065235138,
+      "epoch": 1.4642857142857144,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 1.0498458856606972e-05,
+      "loss": 0.8259180188179016,
+      "mean_token_accuracy": 0.7704022750258446,
+      "num_tokens": 3559280.0,
+      "step": 123
+    },
+    {
+      "entropy": 1.1052347421646118,
+      "epoch": 1.4761904761904763,
+      "grad_norm": 0.240234375,
+      "learning_rate": 1.037391194276326e-05,
+      "loss": 0.8490574359893799,
+      "mean_token_accuracy": 0.7630502879619598,
+      "num_tokens": 3588223.0,
+      "step": 124
+    },
+    {
+      "entropy": 1.1114005744457245,
+      "epoch": 1.4880952380952381,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 1.0249306917380731e-05,
+      "loss": 0.8460506796836853,
+      "mean_token_accuracy": 0.766345664858818,
+      "num_tokens": 3617347.0,
+      "step": 125
+    },
+    {
+      "entropy": 1.115243524312973,
+      "epoch": 1.5,
+      "grad_norm": 0.21484375,
+      "learning_rate": 1.0124663145954152e-05,
+      "loss": 0.8421509265899658,
+      "mean_token_accuracy": 0.7646084725856781,
+      "num_tokens": 3646452.0,
+      "step": 126
+    },
+    {
+      "entropy": 1.1102195531129837,
+      "epoch": 1.5119047619047619,
+      "grad_norm": 0.68359375,
+      "learning_rate": 1e-05,
+      "loss": 0.8465963006019592,
+      "mean_token_accuracy": 0.7651297971606255,
+      "num_tokens": 3674684.0,
+      "step": 127
+    },
+    {
+      "entropy": 1.1138557642698288,
+      "epoch": 1.5238095238095237,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 9.87533685404585e-06,
+      "loss": 0.8462578058242798,
+      "mean_token_accuracy": 0.7656397670507431,
+      "num_tokens": 3701972.0,
+      "step": 128
+    },
+    {
+      "entropy": 1.0684314519166946,
+      "epoch": 1.5357142857142856,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 9.750693082619274e-06,
+      "loss": 0.7849152684211731,
+      "mean_token_accuracy": 0.7857328802347183,
+      "num_tokens": 3731223.0,
+      "step": 129
+    },
+    {
+      "entropy": 1.0986532717943192,
+      "epoch": 1.5476190476190477,
+      "grad_norm": 0.234375,
+      "learning_rate": 9.626088057236745e-06,
+      "loss": 0.8162216544151306,
+      "mean_token_accuracy": 0.7728657871484756,
+      "num_tokens": 3759466.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.1011187136173248,
+      "epoch": 1.5595238095238095,
+      "grad_norm": 0.220703125,
+      "learning_rate": 9.501541143393028e-06,
+      "loss": 0.8209044933319092,
+      "mean_token_accuracy": 0.7711444199085236,
+      "num_tokens": 3788276.0,
+      "step": 131
+    },
+    {
+      "entropy": 1.0799630433321,
+      "epoch": 1.5714285714285714,
+      "grad_norm": 0.21875,
+      "learning_rate": 9.377071697551479e-06,
+      "loss": 0.7802744507789612,
+      "mean_token_accuracy": 0.7825465202331543,
+      "num_tokens": 3817834.0,
+      "step": 132
+    },
+    {
+      "entropy": 1.1000354290008545,
+      "epoch": 1.5833333333333335,
+      "grad_norm": 0.21875,
+      "learning_rate": 9.252699064135759e-06,
+      "loss": 0.8035217523574829,
+      "mean_token_accuracy": 0.7786530405282974,
+      "num_tokens": 3846803.0,
+      "step": 133
+    },
+    {
+      "entropy": 1.106198564171791,
+      "epoch": 1.5952380952380953,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 9.128442572523418e-06,
+      "loss": 0.8161381483078003,
+      "mean_token_accuracy": 0.7741018161177635,
+      "num_tokens": 3875363.0,
+      "step": 134
+    },
+    {
+      "entropy": 1.094715103507042,
+      "epoch": 1.6071428571428572,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 9.004321534041836e-06,
+      "loss": 0.797153115272522,
+      "mean_token_accuracy": 0.7743495553731918,
+      "num_tokens": 3904020.0,
+      "step": 135
+    },
+    {
+      "entropy": 1.0934260189533234,
+      "epoch": 1.619047619047619,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 8.880355238966923e-06,
+      "loss": 0.7957767248153687,
+      "mean_token_accuracy": 0.7722097188234329,
+      "num_tokens": 3932554.0,
+      "step": 136
+    },
+    {
+      "entropy": 1.0932885110378265,
+      "epoch": 1.630952380952381,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 8.756562953525151e-06,
+      "loss": 0.8285123109817505,
+      "mean_token_accuracy": 0.7748213410377502,
+      "num_tokens": 3963124.0,
+      "step": 137
+    },
+    {
+      "entropy": 1.0973111540079117,
+      "epoch": 1.6428571428571428,
+      "grad_norm": 0.220703125,
+      "learning_rate": 8.632963916899268e-06,
+      "loss": 0.8037251234054565,
+      "mean_token_accuracy": 0.7732022255659103,
+      "num_tokens": 3991485.0,
+      "step": 138
+    },
+    {
+      "entropy": 1.1007558554410934,
+      "epoch": 1.6547619047619047,
+      "grad_norm": 0.22265625,
+      "learning_rate": 8.509577338238255e-06,
+      "loss": 0.8211590051651001,
+      "mean_token_accuracy": 0.770406000316143,
+      "num_tokens": 4020583.0,
+      "step": 139
+    },
+    {
+      "entropy": 1.0826598927378654,
+      "epoch": 1.6666666666666665,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 8.386422393671934e-06,
+      "loss": 0.7706205248832703,
+      "mean_token_accuracy": 0.7830442562699318,
+      "num_tokens": 4049853.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.1024836301803589,
+      "epoch": 1.6785714285714286,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 8.263518223330698e-06,
+      "loss": 0.8168199062347412,
+      "mean_token_accuracy": 0.7706331759691238,
+      "num_tokens": 4079030.0,
+      "step": 141
+    },
+    {
+      "entropy": 1.1091957688331604,
+      "epoch": 1.6904761904761905,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 8.140883928370855e-06,
+      "loss": 0.8526521325111389,
+      "mean_token_accuracy": 0.7632784247398376,
+      "num_tokens": 4108702.0,
+      "step": 142
+    },
+    {
+      "entropy": 1.0926142483949661,
+      "epoch": 1.7023809523809523,
+      "grad_norm": 0.251953125,
+      "learning_rate": 8.018538568006027e-06,
+      "loss": 0.800937294960022,
+      "mean_token_accuracy": 0.7739113718271255,
+      "num_tokens": 4138456.0,
+      "step": 143
+    },
+    {
+      "entropy": 1.085595116019249,
+      "epoch": 1.7142857142857144,
+      "grad_norm": 0.21875,
+      "learning_rate": 7.896501156545044e-06,
+      "loss": 0.7860180735588074,
+      "mean_token_accuracy": 0.7786939144134521,
+      "num_tokens": 4168706.0,
+      "step": 144
+    },
+    {
+      "entropy": 1.072287455201149,
+      "epoch": 1.7261904761904763,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 7.774790660436857e-06,
+      "loss": 0.7674008011817932,
+      "mean_token_accuracy": 0.7878812775015831,
+      "num_tokens": 4197229.0,
+      "step": 145
+    },
+    {
+      "entropy": 1.1147316098213196,
+      "epoch": 1.7380952380952381,
+      "grad_norm": 0.21484375,
+      "learning_rate": 7.653425995322852e-06,
+      "loss": 0.8494656682014465,
+      "mean_token_accuracy": 0.7613426074385643,
+      "num_tokens": 4226241.0,
+      "step": 146
+    },
+    {
+      "entropy": 1.0884745866060257,
+      "epoch": 1.75,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 7.532426023097063e-06,
+      "loss": 0.7670794129371643,
+      "mean_token_accuracy": 0.7854177579283714,
+      "num_tokens": 4254275.0,
+      "step": 147
+    },
+    {
+      "entropy": 1.0930557996034622,
+      "epoch": 1.7619047619047619,
+      "grad_norm": 0.208984375,
+      "learning_rate": 7.411809548974792e-06,
+      "loss": 0.8160566091537476,
+      "mean_token_accuracy": 0.7741377875208855,
+      "num_tokens": 4283150.0,
+      "step": 148
+    },
+    {
+      "entropy": 1.0953784435987473,
+      "epoch": 1.7738095238095237,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 7.291595318569951e-06,
+      "loss": 0.8185824751853943,
+      "mean_token_accuracy": 0.7722392901778221,
+      "num_tokens": 4312410.0,
+      "step": 149
+    },
+    {
+      "entropy": 1.0579064786434174,
+      "epoch": 1.7857142857142856,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 7.171802014981726e-06,
+      "loss": 0.748650848865509,
+      "mean_token_accuracy": 0.79205472022295,
+      "num_tokens": 4341254.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.0844353437423706,
+      "epoch": 1.7976190476190477,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 7.052448255890958e-06,
+      "loss": 0.7923084497451782,
+      "mean_token_accuracy": 0.7777365446090698,
+      "num_tokens": 4369573.0,
+      "step": 151
+    },
+    {
+      "entropy": 1.098150685429573,
+      "epoch": 1.8095238095238095,
+      "grad_norm": 0.23046875,
+      "learning_rate": 6.933552590666659e-06,
+      "loss": 0.8330479860305786,
+      "mean_token_accuracy": 0.7675687223672867,
+      "num_tokens": 4397753.0,
+      "step": 152
+    },
+    {
+      "entropy": 1.0822398364543915,
+      "epoch": 1.8214285714285714,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 6.815133497483157e-06,
+      "loss": 0.7889379262924194,
+      "mean_token_accuracy": 0.7777184247970581,
+      "num_tokens": 4427257.0,
+      "step": 153
+    },
+    {
+      "entropy": 1.0754519402980804,
+      "epoch": 1.8333333333333335,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 6.697209380448333e-06,
+      "loss": 0.784767746925354,
+      "mean_token_accuracy": 0.7786002233624458,
+      "num_tokens": 4456709.0,
+      "step": 154
+    },
+    {
+      "entropy": 1.084225744009018,
+      "epoch": 1.8452380952380953,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 6.579798566743314e-06,
+      "loss": 0.8074082732200623,
+      "mean_token_accuracy": 0.7725819870829582,
+      "num_tokens": 4485653.0,
+      "step": 155
+    },
+    {
+      "entropy": 1.0734328627586365,
+      "epoch": 1.8571428571428572,
+      "grad_norm": 0.203125,
+      "learning_rate": 6.462919303774186e-06,
+      "loss": 0.7693166136741638,
+      "mean_token_accuracy": 0.7844012156128883,
+      "num_tokens": 4515131.0,
+      "step": 156
+    },
+    {
+      "entropy": 1.0944669842720032,
+      "epoch": 1.869047619047619,
+      "grad_norm": 0.224609375,
+      "learning_rate": 6.34658975633605e-06,
+      "loss": 0.8283172249794006,
+      "mean_token_accuracy": 0.7699304968118668,
+      "num_tokens": 4544554.0,
+      "step": 157
+    },
+    {
+      "entropy": 1.0710095912218094,
+      "epoch": 1.880952380952381,
+      "grad_norm": 0.20703125,
+      "learning_rate": 6.230828003789949e-06,
+      "loss": 0.7723422050476074,
+      "mean_token_accuracy": 0.785270169377327,
+      "num_tokens": 4574526.0,
+      "step": 158
+    },
+    {
+      "entropy": 1.1019357591867447,
+      "epoch": 1.8928571428571428,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 6.115652037253054e-06,
+      "loss": 0.842171847820282,
+      "mean_token_accuracy": 0.7660646587610245,
+      "num_tokens": 4603221.0,
+      "step": 159
+    },
+    {
+      "entropy": 1.0798636227846146,
+      "epoch": 1.9047619047619047,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 6.001079756802592e-06,
+      "loss": 0.7799994945526123,
+      "mean_token_accuracy": 0.7830873727798462,
+      "num_tokens": 4632086.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.0717933773994446,
+      "epoch": 1.9166666666666665,
+      "grad_norm": 0.21875,
+      "learning_rate": 5.887128968693887e-06,
+      "loss": 0.7654195427894592,
+      "mean_token_accuracy": 0.7824560701847076,
+      "num_tokens": 4660316.0,
+      "step": 161
+    },
+    {
+      "entropy": 1.069181576371193,
+      "epoch": 1.9285714285714286,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 5.773817382593008e-06,
+      "loss": 0.7898048162460327,
+      "mean_token_accuracy": 0.7769228145480156,
+      "num_tokens": 4689587.0,
+      "step": 162
+    },
+    {
+      "entropy": 1.0788903683423996,
+      "epoch": 1.9404761904761905,
+      "grad_norm": 0.212890625,
+      "learning_rate": 5.66116260882442e-06,
+      "loss": 0.7974780797958374,
+      "mean_token_accuracy": 0.7752274572849274,
+      "num_tokens": 4719335.0,
+      "step": 163
+    },
+    {
+      "entropy": 1.1007077991962433,
+      "epoch": 1.9523809523809523,
+      "grad_norm": 0.23046875,
+      "learning_rate": 5.549182155634076e-06,
+      "loss": 0.7892836332321167,
+      "mean_token_accuracy": 0.7779370620846748,
+      "num_tokens": 4746463.0,
+      "step": 164
+    },
+    {
+      "entropy": 1.0850374549627304,
+      "epoch": 1.9642857142857144,
+      "grad_norm": 0.2109375,
+      "learning_rate": 5.43789342646837e-06,
+      "loss": 0.7919931411743164,
+      "mean_token_accuracy": 0.7770635932683945,
+      "num_tokens": 4775141.0,
+      "step": 165
+    },
+    {
+      "entropy": 1.081614837050438,
+      "epoch": 1.9761904761904763,
+      "grad_norm": 0.21875,
+      "learning_rate": 5.32731371726938e-06,
+      "loss": 0.7762281894683838,
+      "mean_token_accuracy": 0.7814824879169464,
+      "num_tokens": 4803229.0,
+      "step": 166
+    },
+    {
+      "entropy": 1.086833968758583,
+      "epoch": 1.9880952380952381,
+      "grad_norm": 0.216796875,
+      "learning_rate": 5.217460213786822e-06,
+      "loss": 0.8244621157646179,
+      "mean_token_accuracy": 0.7730955481529236,
+      "num_tokens": 4832506.0,
+      "step": 167
+    },
+    {
+      "entropy": 1.0700944513082504,
+      "epoch": 2.0,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 5.108349988907111e-06,
+      "loss": 0.7783507704734802,
+      "mean_token_accuracy": 0.7822717130184174,
+      "num_tokens": 4860038.0,
+      "step": 168
+    },
+    {
+      "epoch": 2.0,
+      "eval_entropy": 1.0883367625872293,
+      "eval_loss": 0.8387430906295776,
+      "eval_mean_token_accuracy": 0.7681301248073578,
+      "eval_model_preparation_time": 0.0051,
+      "eval_num_tokens": 4860038.0,
+      "eval_runtime": 19.5881,
+      "eval_samples_per_second": 7.658,
+      "eval_steps_per_second": 7.658,
+      "step": 168
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 252,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1131690390237286e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/training_args.bin b/training_args.bin
new file mode 100644
index 0000000..a65364f
--- /dev/null
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df04f2387ceaaf0af4f50c3c27439b4b3b5bb4a366490e82fbcb5ddc98d615ef
+size 5649