commit 8d4dae588688508683003d649dc312da9d3cc13d
Author: ModelHub XC <noreply@modelhub.org.cn>
Date:   Sat Jun 20 17:43:19 2026 +0800

    初始化项目，由ModelHub XC社区提供模型
    
    Model: guangyangnlp/Qwen3-1.7B-SFT-medical-2e-5
    Source: Original Platform

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..52373fe
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2baf9d8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,70 @@
+---
+library_name: transformers
+license: other
+base_model: Qwen/Qwen3-1.7B
+tags:
+- llama-factory
+- full
+- generated_from_trainer
+model-index:
+- name: medical-o1-sft-full
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# medical-o1-sft-full
+
+This model is a fine-tuned version of [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B) on the medical_o1_train dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.4089 (best checkpoint, reached at step 250, epoch 1.7043; see the training results table below)
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 2e-05
+- train_batch_size: 16
+- eval_batch_size: 8
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 128
+- optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
+- lr_scheduler_type: cosine
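+- lr_scheduler_warmup_steps: 0.05 (a fraction of the total training steps, not an absolute step count; the logged learning rate ramps linearly to its 2e-05 peak at step 24 of 441)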
+- num_epochs: 3.0
+
+### Training results
+
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.4962        | 0.3419 | 50   | 1.4686          |
+| 1.4215        | 0.6838 | 100  | 1.4337          |
+| 1.3304        | 1.0205 | 150  | 1.4194          |
+| 1.3097        | 1.3624 | 200  | 1.4159          |
+| 1.3175        | 1.7043 | 250  | 1.4089          |
+| 1.2195        | 2.0410 | 300  | 1.4176          |
+| 1.2726        | 2.3829 | 350  | 1.4229          |
+| 1.1895        | 2.7248 | 400  | 1.4216          |
+
+
+### Framework versions
+
+- Transformers 5.0.0
+- Pytorch 2.10.0+cu128
+- Datasets 4.0.0
+- Tokenizers 0.22.2
diff --git a/all_results.json b/all_results.json
new file mode 100644
index 0000000..ce41b8c
--- /dev/null
+++ b/all_results.json
@@ -0,0 +1,12 @@
+{
+    "epoch": 3.0,
+    "eval_loss": 1.4088929891586304,
+    "eval_runtime": 13.9279,
+    "eval_samples_per_second": 70.793,
+    "eval_steps_per_second": 8.903,
+    "total_flos": 5.3379040973665075e+17,
+    "train_loss": 1.3470534840408637,
+    "train_runtime": 3006.8003,
+    "train_samples_per_second": 18.676,
+    "train_steps_per_second": 0.147
+}
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..64481b1
--- /dev/null
+++ b/config.json
@@ -0,0 +1,63 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
diff --git a/eval_results.json b/eval_results.json
new file mode 100644
index 0000000..b6faabf
--- /dev/null
+++ b/eval_results.json
@@ -0,0 +1,7 @@
+{
+    "epoch": 3.0,
+    "eval_loss": 1.4088929891586304,
+    "eval_runtime": 13.9279,
+    "eval_samples_per_second": 70.793,
+    "eval_steps_per_second": 8.903
+}
\ No newline at end of file
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000..c33fb76
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,12 @@
+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.0.0"
+}
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000..2cff034
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b98f6d75daac7cb177c54105480062a353474e167a59ddbeae8d9df10294546
+size 8126995136
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000..c7afbed
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..145e2c7
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,30 @@
+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
diff --git a/train_results.json b/train_results.json
new file mode 100644
index 0000000..a4ff596
--- /dev/null
+++ b/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.0,
+    "total_flos": 5.3379040973665075e+17,
+    "train_loss": 1.3470534840408637,
+    "train_runtime": 3006.8003,
+    "train_samples_per_second": 18.676,
+    "train_steps_per_second": 0.147
+}
\ No newline at end of file
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000..129af96
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,3194 @@
+{
+  "best_global_step": 250,
+  "best_metric": 1.4088929891586304,
+  "best_model_checkpoint": "saves/qwen3-1.7B/medical-o1-sft-full/checkpoint-250",
+  "epoch": 3.0,
+  "eval_steps": 50,
+  "global_step": 441,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006837606837606838,
+      "grad_norm": 83.15293884277344,
+      "learning_rate": 0.0,
+      "loss": 2.8199405670166016,
+      "step": 1
+    },
+    {
+      "epoch": 0.013675213675213675,
+      "grad_norm": 81.79350280761719,
+      "learning_rate": 8.695652173913044e-07,
+      "loss": 2.7888758182525635,
+      "step": 2
+    },
+    {
+      "epoch": 0.020512820512820513,
+      "grad_norm": 83.25151824951172,
+      "learning_rate": 1.7391304347826088e-06,
+      "loss": 2.820769786834717,
+      "step": 3
+    },
+    {
+      "epoch": 0.02735042735042735,
+      "grad_norm": 75.52108001708984,
+      "learning_rate": 2.6086956521739132e-06,
+      "loss": 2.734041690826416,
+      "step": 4
+    },
+    {
+      "epoch": 0.03418803418803419,
+      "grad_norm": 72.11664581298828,
+      "learning_rate": 3.4782608695652175e-06,
+      "loss": 2.7135212421417236,
+      "step": 5
+    },
+    {
+      "epoch": 0.041025641025641026,
+      "grad_norm": 55.534324645996094,
+      "learning_rate": 4.347826086956522e-06,
+      "loss": 2.4443650245666504,
+      "step": 6
+    },
+    {
+      "epoch": 0.04786324786324787,
+      "grad_norm": 48.14010238647461,
+      "learning_rate": 5.2173913043478265e-06,
+      "loss": 2.3162710666656494,
+      "step": 7
+    },
+    {
+      "epoch": 0.0547008547008547,
+      "grad_norm": 20.861207962036133,
+      "learning_rate": 6.086956521739132e-06,
+      "loss": 2.0038950443267822,
+      "step": 8
+    },
+    {
+      "epoch": 0.06153846153846154,
+      "grad_norm": 15.49008846282959,
+      "learning_rate": 6.956521739130435e-06,
+      "loss": 1.8993940353393555,
+      "step": 9
+    },
+    {
+      "epoch": 0.06837606837606838,
+      "grad_norm": 5.190984725952148,
+      "learning_rate": 7.82608695652174e-06,
+      "loss": 1.7324286699295044,
+      "step": 10
+    },
+    {
+      "epoch": 0.07521367521367521,
+      "grad_norm": 4.630637168884277,
+      "learning_rate": 8.695652173913044e-06,
+      "loss": 1.654750943183899,
+      "step": 11
+    },
+    {
+      "epoch": 0.08205128205128205,
+      "grad_norm": 3.784055233001709,
+      "learning_rate": 9.565217391304349e-06,
+      "loss": 1.7394911050796509,
+      "step": 12
+    },
+    {
+      "epoch": 0.08888888888888889,
+      "grad_norm": 3.4299561977386475,
+      "learning_rate": 1.0434782608695653e-05,
+      "loss": 1.6633565425872803,
+      "step": 13
+    },
+    {
+      "epoch": 0.09572649572649573,
+      "grad_norm": 4.693484306335449,
+      "learning_rate": 1.1304347826086957e-05,
+      "loss": 1.670560359954834,
+      "step": 14
+    },
+    {
+      "epoch": 0.10256410256410256,
+      "grad_norm": 5.14279317855835,
+      "learning_rate": 1.2173913043478263e-05,
+      "loss": 1.647332787513733,
+      "step": 15
+    },
+    {
+      "epoch": 0.1094017094017094,
+      "grad_norm": 3.8385608196258545,
+      "learning_rate": 1.3043478260869566e-05,
+      "loss": 1.6399732828140259,
+      "step": 16
+    },
+    {
+      "epoch": 0.11623931623931624,
+      "grad_norm": 2.6695456504821777,
+      "learning_rate": 1.391304347826087e-05,
+      "loss": 1.5681482553482056,
+      "step": 17
+    },
+    {
+      "epoch": 0.12307692307692308,
+      "grad_norm": 2.117490291595459,
+      "learning_rate": 1.4782608695652174e-05,
+      "loss": 1.6053783893585205,
+      "step": 18
+    },
+    {
+      "epoch": 0.12991452991452992,
+      "grad_norm": 1.9541882276535034,
+      "learning_rate": 1.565217391304348e-05,
+      "loss": 1.5954205989837646,
+      "step": 19
+    },
+    {
+      "epoch": 0.13675213675213677,
+      "grad_norm": 2.011003255844116,
+      "learning_rate": 1.6521739130434785e-05,
+      "loss": 1.5820363759994507,
+      "step": 20
+    },
+    {
+      "epoch": 0.14358974358974358,
+      "grad_norm": 1.9789162874221802,
+      "learning_rate": 1.739130434782609e-05,
+      "loss": 1.532997727394104,
+      "step": 21
+    },
+    {
+      "epoch": 0.15042735042735042,
+      "grad_norm": 1.8961035013198853,
+      "learning_rate": 1.8260869565217393e-05,
+      "loss": 1.5475587844848633,
+      "step": 22
+    },
+    {
+      "epoch": 0.15726495726495726,
+      "grad_norm": 1.5811997652053833,
+      "learning_rate": 1.9130434782608697e-05,
+      "loss": 1.580260992050171,
+      "step": 23
+    },
+    {
+      "epoch": 0.1641025641025641,
+      "grad_norm": 1.4591213464736938,
+      "learning_rate": 2e-05,
+      "loss": 1.5463660955429077,
+      "step": 24
+    },
+    {
+      "epoch": 0.17094017094017094,
+      "grad_norm": 1.4459729194641113,
+      "learning_rate": 1.999971756719333e-05,
+      "loss": 1.5187675952911377,
+      "step": 25
+    },
+    {
+      "epoch": 0.17777777777777778,
+      "grad_norm": 1.4411983489990234,
+      "learning_rate": 1.9998870284726968e-05,
+      "loss": 1.529025673866272,
+      "step": 26
+    },
+    {
+      "epoch": 0.18461538461538463,
+      "grad_norm": 1.3215960264205933,
+      "learning_rate": 1.9997458200460994e-05,
+      "loss": 1.513730525970459,
+      "step": 27
+    },
+    {
+      "epoch": 0.19145299145299147,
+      "grad_norm": 1.324648141860962,
+      "learning_rate": 1.999548139415919e-05,
+      "loss": 1.5576432943344116,
+      "step": 28
+    },
+    {
+      "epoch": 0.19829059829059828,
+      "grad_norm": 1.1139763593673706,
+      "learning_rate": 1.999293997748454e-05,
+      "loss": 1.5223976373672485,
+      "step": 29
+    },
+    {
+      "epoch": 0.20512820512820512,
+      "grad_norm": 1.175620675086975,
+      "learning_rate": 1.9989834093992945e-05,
+      "loss": 1.529496431350708,
+      "step": 30
+    },
+    {
+      "epoch": 0.21196581196581196,
+      "grad_norm": 1.2628631591796875,
+      "learning_rate": 1.9986163919125077e-05,
+      "loss": 1.5556331872940063,
+      "step": 31
+    },
+    {
+      "epoch": 0.2188034188034188,
+      "grad_norm": 1.121780276298523,
+      "learning_rate": 1.9981929660196492e-05,
+      "loss": 1.522382140159607,
+      "step": 32
+    },
+    {
+      "epoch": 0.22564102564102564,
+      "grad_norm": 1.057112693786621,
+      "learning_rate": 1.997713155638592e-05,
+      "loss": 1.5269778966903687,
+      "step": 33
+    },
+    {
+      "epoch": 0.23247863247863249,
+      "grad_norm": 1.1212079524993896,
+      "learning_rate": 1.9971769878721747e-05,
+      "loss": 1.5179802179336548,
+      "step": 34
+    },
+    {
+      "epoch": 0.23931623931623933,
+      "grad_norm": 1.1053107976913452,
+      "learning_rate": 1.99658449300667e-05,
+      "loss": 1.4600404500961304,
+      "step": 35
+    },
+    {
+      "epoch": 0.24615384615384617,
+      "grad_norm": 1.0344611406326294,
+      "learning_rate": 1.9959357045100764e-05,
+      "loss": 1.4895355701446533,
+      "step": 36
+    },
+    {
+      "epoch": 0.252991452991453,
+      "grad_norm": 1.0998711585998535,
+      "learning_rate": 1.9952306590302247e-05,
+      "loss": 1.498748779296875,
+      "step": 37
+    },
+    {
+      "epoch": 0.25982905982905985,
+      "grad_norm": 1.0810974836349487,
+      "learning_rate": 1.9944693963927092e-05,
+      "loss": 1.4847540855407715,
+      "step": 38
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 1.0349794626235962,
+      "learning_rate": 1.9936519595986395e-05,
+      "loss": 1.4850821495056152,
+      "step": 39
+    },
+    {
+      "epoch": 0.27350427350427353,
+      "grad_norm": 0.9509456157684326,
+      "learning_rate": 1.9927783948222084e-05,
+      "loss": 1.4879685640335083,
+      "step": 40
+    },
+    {
+      "epoch": 0.28034188034188035,
+      "grad_norm": 0.9873176217079163,
+      "learning_rate": 1.9918487514080867e-05,
+      "loss": 1.5055975914001465,
+      "step": 41
+    },
+    {
+      "epoch": 0.28717948717948716,
+      "grad_norm": 0.9554620385169983,
+      "learning_rate": 1.990863081868634e-05,
+      "loss": 1.4576541185379028,
+      "step": 42
+    },
+    {
+      "epoch": 0.294017094017094,
+      "grad_norm": 0.915795087814331,
+      "learning_rate": 1.989821441880933e-05,
+      "loss": 1.469474196434021,
+      "step": 43
+    },
+    {
+      "epoch": 0.30085470085470084,
+      "grad_norm": 1.006457805633545,
+      "learning_rate": 1.988723890283645e-05,
+      "loss": 1.5073033571243286,
+      "step": 44
+    },
+    {
+      "epoch": 0.3076923076923077,
+      "grad_norm": 0.9496122598648071,
+      "learning_rate": 1.9875704890736853e-05,
+      "loss": 1.496271014213562,
+      "step": 45
+    },
+    {
+      "epoch": 0.3145299145299145,
+      "grad_norm": 0.9319558143615723,
+      "learning_rate": 1.9863613034027224e-05,
+      "loss": 1.4825000762939453,
+      "step": 46
+    },
+    {
+      "epoch": 0.3213675213675214,
+      "grad_norm": 0.9389411807060242,
+      "learning_rate": 1.985096401573497e-05,
+      "loss": 1.4443243741989136,
+      "step": 47
+    },
+    {
+      "epoch": 0.3282051282051282,
+      "grad_norm": 0.9735950827598572,
+      "learning_rate": 1.9837758550359637e-05,
+      "loss": 1.4762128591537476,
+      "step": 48
+    },
+    {
+      "epoch": 0.335042735042735,
+      "grad_norm": 0.9494331479072571,
+      "learning_rate": 1.982399738383255e-05,
+      "loss": 1.5045385360717773,
+      "step": 49
+    },
+    {
+      "epoch": 0.3418803418803419,
+      "grad_norm": 0.9520753026008606,
+      "learning_rate": 1.9809681293474693e-05,
+      "loss": 1.496164321899414,
+      "step": 50
+    },
+    {
+      "epoch": 0.3418803418803419,
+      "eval_loss": 1.4685521125793457,
+      "eval_runtime": 14.1604,
+      "eval_samples_per_second": 69.631,
+      "eval_steps_per_second": 8.757,
+      "step": 50
+    },
+    {
+      "epoch": 0.3487179487179487,
+      "grad_norm": 0.9688102602958679,
+      "learning_rate": 1.979481108795278e-05,
+      "loss": 1.4734501838684082,
+      "step": 51
+    },
+    {
+      "epoch": 0.35555555555555557,
+      "grad_norm": 0.9477071166038513,
+      "learning_rate": 1.9779387607233587e-05,
+      "loss": 1.4600017070770264,
+      "step": 52
+    },
+    {
+      "epoch": 0.3623931623931624,
+      "grad_norm": 0.9507799744606018,
+      "learning_rate": 1.9763411722536503e-05,
+      "loss": 1.455001711845398,
+      "step": 53
+    },
+    {
+      "epoch": 0.36923076923076925,
+      "grad_norm": 0.9292111992835999,
+      "learning_rate": 1.9746884336284316e-05,
+      "loss": 1.4742114543914795,
+      "step": 54
+    },
+    {
+      "epoch": 0.37606837606837606,
+      "grad_norm": 0.9916467666625977,
+      "learning_rate": 1.972980638205225e-05,
+      "loss": 1.5147836208343506,
+      "step": 55
+    },
+    {
+      "epoch": 0.38290598290598293,
+      "grad_norm": 0.9744175672531128,
+      "learning_rate": 1.971217882451521e-05,
+      "loss": 1.4713977575302124,
+      "step": 56
+    },
+    {
+      "epoch": 0.38974358974358975,
+      "grad_norm": 1.0033540725708008,
+      "learning_rate": 1.9694002659393306e-05,
+      "loss": 1.4538943767547607,
+      "step": 57
+    },
+    {
+      "epoch": 0.39658119658119656,
+      "grad_norm": 0.946854293346405,
+      "learning_rate": 1.9675278913395605e-05,
+      "loss": 1.4287432432174683,
+      "step": 58
+    },
+    {
+      "epoch": 0.40341880341880343,
+      "grad_norm": 1.0013198852539062,
+      "learning_rate": 1.9656008644162134e-05,
+      "loss": 1.4492701292037964,
+      "step": 59
+    },
+    {
+      "epoch": 0.41025641025641024,
+      "grad_norm": 1.0438623428344727,
+      "learning_rate": 1.9636192940204134e-05,
+      "loss": 1.4924561977386475,
+      "step": 60
+    },
+    {
+      "epoch": 0.4170940170940171,
+      "grad_norm": 0.9705636501312256,
+      "learning_rate": 1.961583292084259e-05,
+      "loss": 1.4596234560012817,
+      "step": 61
+    },
+    {
+      "epoch": 0.4239316239316239,
+      "grad_norm": 0.9079157114028931,
+      "learning_rate": 1.9594929736144978e-05,
+      "loss": 1.44952392578125,
+      "step": 62
+    },
+    {
+      "epoch": 0.4307692307692308,
+      "grad_norm": 0.9640805125236511,
+      "learning_rate": 1.957348456686032e-05,
+      "loss": 1.4430960416793823,
+      "step": 63
+    },
+    {
+      "epoch": 0.4376068376068376,
+      "grad_norm": 0.9475866556167603,
+      "learning_rate": 1.9551498624352497e-05,
+      "loss": 1.446009635925293,
+      "step": 64
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 0.948258638381958,
+      "learning_rate": 1.9528973150531787e-05,
+      "loss": 1.4411481618881226,
+      "step": 65
+    },
+    {
+      "epoch": 0.4512820512820513,
+      "grad_norm": 0.9805014133453369,
+      "learning_rate": 1.9505909417784758e-05,
+      "loss": 1.4417314529418945,
+      "step": 66
+    },
+    {
+      "epoch": 0.4581196581196581,
+      "grad_norm": 0.9225365519523621,
+      "learning_rate": 1.9482308728902358e-05,
+      "loss": 1.480376958847046,
+      "step": 67
+    },
+    {
+      "epoch": 0.46495726495726497,
+      "grad_norm": 0.9221044182777405,
+      "learning_rate": 1.9458172417006347e-05,
+      "loss": 1.4625794887542725,
+      "step": 68
+    },
+    {
+      "epoch": 0.4717948717948718,
+      "grad_norm": 0.9901456832885742,
+      "learning_rate": 1.9433501845473996e-05,
+      "loss": 1.4856598377227783,
+      "step": 69
+    },
+    {
+      "epoch": 0.47863247863247865,
+      "grad_norm": 0.9551020860671997,
+      "learning_rate": 1.9408298407861045e-05,
+      "loss": 1.4896745681762695,
+      "step": 70
+    },
+    {
+      "epoch": 0.48547008547008547,
+      "grad_norm": 0.9381822943687439,
+      "learning_rate": 1.9382563527823026e-05,
+      "loss": 1.4343875646591187,
+      "step": 71
+    },
+    {
+      "epoch": 0.49230769230769234,
+      "grad_norm": 0.8770731091499329,
+      "learning_rate": 1.935629865903482e-05,
+      "loss": 1.4482182264328003,
+      "step": 72
+    },
+    {
+      "epoch": 0.49914529914529915,
+      "grad_norm": 0.934929609298706,
+      "learning_rate": 1.9329505285108544e-05,
+      "loss": 1.4524080753326416,
+      "step": 73
+    },
+    {
+      "epoch": 0.505982905982906,
+      "grad_norm": 0.9203254580497742,
+      "learning_rate": 1.9302184919509758e-05,
+      "loss": 1.4096636772155762,
+      "step": 74
+    },
+    {
+      "epoch": 0.5128205128205128,
+      "grad_norm": 0.9084986448287964,
+      "learning_rate": 1.927433910547197e-05,
+      "loss": 1.423622488975525,
+      "step": 75
+    },
+    {
+      "epoch": 0.5196581196581197,
+      "grad_norm": 0.8734993934631348,
+      "learning_rate": 1.9245969415909464e-05,
+      "loss": 1.4265828132629395,
+      "step": 76
+    },
+    {
+      "epoch": 0.5264957264957265,
+      "grad_norm": 0.8964496850967407,
+      "learning_rate": 1.921707745332845e-05,
+      "loss": 1.4725595712661743,
+      "step": 77
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.9096109867095947,
+      "learning_rate": 1.9187664849736542e-05,
+      "loss": 1.457470417022705,
+      "step": 78
+    },
+    {
+      "epoch": 0.5401709401709401,
+      "grad_norm": 0.8932516574859619,
+      "learning_rate": 1.9157733266550577e-05,
+      "loss": 1.454951286315918,
+      "step": 79
+    },
+    {
+      "epoch": 0.5470085470085471,
+      "grad_norm": 0.8940214514732361,
+      "learning_rate": 1.9127284394502765e-05,
+      "loss": 1.4776511192321777,
+      "step": 80
+    },
+    {
+      "epoch": 0.5538461538461539,
+      "grad_norm": 0.8789263963699341,
+      "learning_rate": 1.9096319953545186e-05,
+      "loss": 1.4376585483551025,
+      "step": 81
+    },
+    {
+      "epoch": 0.5606837606837607,
+      "grad_norm": 0.9395255446434021,
+      "learning_rate": 1.906484169275263e-05,
+      "loss": 1.4360781908035278,
+      "step": 82
+    },
+    {
+      "epoch": 0.5675213675213675,
+      "grad_norm": 0.8618428707122803,
+      "learning_rate": 1.903285139022381e-05,
+      "loss": 1.4329712390899658,
+      "step": 83
+    },
+    {
+      "epoch": 0.5743589743589743,
+      "grad_norm": 0.9313262104988098,
+      "learning_rate": 1.900035085298091e-05,
+      "loss": 1.446253776550293,
+      "step": 84
+    },
+    {
+      "epoch": 0.5811965811965812,
+      "grad_norm": 0.8763355016708374,
+      "learning_rate": 1.896734191686752e-05,
+      "loss": 1.4160209894180298,
+      "step": 85
+    },
+    {
+      "epoch": 0.588034188034188,
+      "grad_norm": 0.8777135610580444,
+      "learning_rate": 1.8933826446444933e-05,
+      "loss": 1.449493408203125,
+      "step": 86
+    },
+    {
+      "epoch": 0.5948717948717949,
+      "grad_norm": 0.8737928867340088,
+      "learning_rate": 1.889980633488683e-05,
+      "loss": 1.377128005027771,
+      "step": 87
+    },
+    {
+      "epoch": 0.6017094017094017,
+      "grad_norm": 0.923620343208313,
+      "learning_rate": 1.8865283503872325e-05,
+      "loss": 1.422142505645752,
+      "step": 88
+    },
+    {
+      "epoch": 0.6085470085470085,
+      "grad_norm": 0.9419258832931519,
+      "learning_rate": 1.8830259903477427e-05,
+      "loss": 1.4897931814193726,
+      "step": 89
+    },
+    {
+      "epoch": 0.6153846153846154,
+      "grad_norm": 0.9292656779289246,
+      "learning_rate": 1.879473751206489e-05,
+      "loss": 1.4244943857192993,
+      "step": 90
+    },
+    {
+      "epoch": 0.6222222222222222,
+      "grad_norm": 0.9174057841300964,
+      "learning_rate": 1.8758718336172462e-05,
+      "loss": 1.432208776473999,
+      "step": 91
+    },
+    {
+      "epoch": 0.629059829059829,
+      "grad_norm": 0.9447773694992065,
+      "learning_rate": 1.8722204410399524e-05,
+      "loss": 1.4501725435256958,
+      "step": 92
+    },
+    {
+      "epoch": 0.6358974358974359,
+      "grad_norm": 0.8907484412193298,
+      "learning_rate": 1.868519779729218e-05,
+      "loss": 1.4563168287277222,
+      "step": 93
+    },
+    {
+      "epoch": 0.6427350427350428,
+      "grad_norm": 0.8975157141685486,
+      "learning_rate": 1.864770058722676e-05,
+      "loss": 1.4320740699768066,
+      "step": 94
+    },
+    {
+      "epoch": 0.6495726495726496,
+      "grad_norm": 0.9034259915351868,
+      "learning_rate": 1.8609714898291716e-05,
+      "loss": 1.4002689123153687,
+      "step": 95
+    },
+    {
+      "epoch": 0.6564102564102564,
+      "grad_norm": 0.9356617331504822,
+      "learning_rate": 1.8571242876167995e-05,
+      "loss": 1.4669139385223389,
+      "step": 96
+    },
+    {
+      "epoch": 0.6632478632478632,
+      "grad_norm": 0.9355176091194153,
+      "learning_rate": 1.853228669400784e-05,
+      "loss": 1.4444191455841064,
+      "step": 97
+    },
+    {
+      "epoch": 0.67008547008547,
+      "grad_norm": 0.8931655883789062,
+      "learning_rate": 1.8492848552312016e-05,
+      "loss": 1.4415756464004517,
+      "step": 98
+    },
+    {
+      "epoch": 0.676923076923077,
+      "grad_norm": 0.8951373100280762,
+      "learning_rate": 1.8452930678805536e-05,
+      "loss": 1.4061449766159058,
+      "step": 99
+    },
+    {
+      "epoch": 0.6837606837606838,
+      "grad_norm": 0.9179074168205261,
+      "learning_rate": 1.8412535328311813e-05,
+      "loss": 1.4215387105941772,
+      "step": 100
+    },
+    {
+      "epoch": 0.6837606837606838,
+      "eval_loss": 1.4336893558502197,
+      "eval_runtime": 13.7947,
+      "eval_samples_per_second": 71.477,
+      "eval_steps_per_second": 8.989,
+      "step": 100
+    },
+    {
+      "epoch": 0.6905982905982906,
+      "grad_norm": 0.977781355381012,
+      "learning_rate": 1.8371664782625287e-05,
+      "loss": 1.4540152549743652,
+      "step": 101
+    },
+    {
+      "epoch": 0.6974358974358974,
+      "grad_norm": 0.9076094627380371,
+      "learning_rate": 1.8330321350382545e-05,
+      "loss": 1.415886640548706,
+      "step": 102
+    },
+    {
+      "epoch": 0.7042735042735043,
+      "grad_norm": 0.8912188410758972,
+      "learning_rate": 1.8288507366931907e-05,
+      "loss": 1.4277691841125488,
+      "step": 103
+    },
+    {
+      "epoch": 0.7111111111111111,
+      "grad_norm": 0.8660780787467957,
+      "learning_rate": 1.8246225194201517e-05,
+      "loss": 1.39166259765625,
+      "step": 104
+    },
+    {
+      "epoch": 0.717948717948718,
+      "grad_norm": 0.9204691648483276,
+      "learning_rate": 1.8203477220565912e-05,
+      "loss": 1.4161370992660522,
+      "step": 105
+    },
+    {
+      "epoch": 0.7247863247863248,
+      "grad_norm": 0.9661011695861816,
+      "learning_rate": 1.8160265860711134e-05,
+      "loss": 1.4492610692977905,
+      "step": 106
+    },
+    {
+      "epoch": 0.7316239316239316,
+      "grad_norm": 0.9005808234214783,
+      "learning_rate": 1.8116593555498308e-05,
+      "loss": 1.4389468431472778,
+      "step": 107
+    },
+    {
+      "epoch": 0.7384615384615385,
+      "grad_norm": 0.9088156223297119,
+      "learning_rate": 1.807246277182578e-05,
+      "loss": 1.4940838813781738,
+      "step": 108
+    },
+    {
+      "epoch": 0.7452991452991453,
+      "grad_norm": 0.9402887225151062,
+      "learning_rate": 1.802787600248977e-05,
+      "loss": 1.4154539108276367,
+      "step": 109
+    },
+    {
+      "epoch": 0.7521367521367521,
+      "grad_norm": 0.9380722045898438,
+      "learning_rate": 1.798283576604356e-05,
+      "loss": 1.4318289756774902,
+      "step": 110
+    },
+    {
+      "epoch": 0.7589743589743589,
+      "grad_norm": 0.9319474101066589,
+      "learning_rate": 1.7937344606655228e-05,
+      "loss": 1.4192531108856201,
+      "step": 111
+    },
+    {
+      "epoch": 0.7658119658119659,
+      "grad_norm": 0.9068304896354675,
+      "learning_rate": 1.789140509396394e-05,
+      "loss": 1.4170390367507935,
+      "step": 112
+    },
+    {
+      "epoch": 0.7726495726495727,
+      "grad_norm": 0.8808281421661377,
+      "learning_rate": 1.784501982293479e-05,
+      "loss": 1.432860016822815,
+      "step": 113
+    },
+    {
+      "epoch": 0.7794871794871795,
+      "grad_norm": 0.8805544376373291,
+      "learning_rate": 1.7798191413712244e-05,
+      "loss": 1.4037058353424072,
+      "step": 114
+    },
+    {
+      "epoch": 0.7863247863247863,
+      "grad_norm": 0.8959332704544067,
+      "learning_rate": 1.775092251147211e-05,
+      "loss": 1.4175316095352173,
+      "step": 115
+    },
+    {
+      "epoch": 0.7931623931623931,
+      "grad_norm": 0.8379173278808594,
+      "learning_rate": 1.770321578627213e-05,
+      "loss": 1.404625654220581,
+      "step": 116
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.8591132164001465,
+      "learning_rate": 1.765507393290117e-05,
+      "loss": 1.4534145593643188,
+      "step": 117
+    },
+    {
+      "epoch": 0.8068376068376069,
+      "grad_norm": 0.8517522215843201,
+      "learning_rate": 1.7606499670726972e-05,
+      "loss": 1.4170221090316772,
+      "step": 118
+    },
+    {
+      "epoch": 0.8136752136752137,
+      "grad_norm": 0.8700085282325745,
+      "learning_rate": 1.7557495743542586e-05,
+      "loss": 1.4001213312149048,
+      "step": 119
+    },
+    {
+      "epoch": 0.8205128205128205,
+      "grad_norm": 0.8774170875549316,
+      "learning_rate": 1.7508064919411344e-05,
+      "loss": 1.418135643005371,
+      "step": 120
+    },
+    {
+      "epoch": 0.8273504273504273,
+      "grad_norm": 0.8984478116035461,
+      "learning_rate": 1.745820999051053e-05,
+      "loss": 1.4195680618286133,
+      "step": 121
+    },
+    {
+      "epoch": 0.8341880341880342,
+      "grad_norm": 0.8648718595504761,
+      "learning_rate": 1.7407933772973638e-05,
+      "loss": 1.383607029914856,
+      "step": 122
+    },
+    {
+      "epoch": 0.841025641025641,
+      "grad_norm": 0.9336929321289062,
+      "learning_rate": 1.735723910673132e-05,
+      "loss": 1.4406161308288574,
+      "step": 123
+    },
+    {
+      "epoch": 0.8478632478632478,
+      "grad_norm": 0.8780763149261475,
+      "learning_rate": 1.730612885535094e-05,
+      "loss": 1.4191570281982422,
+      "step": 124
+    },
+    {
+      "epoch": 0.8547008547008547,
+      "grad_norm": 0.8674494624137878,
+      "learning_rate": 1.7254605905874862e-05,
+      "loss": 1.437395691871643,
+      "step": 125
+    },
+    {
+      "epoch": 0.8615384615384616,
+      "grad_norm": 0.9440014958381653,
+      "learning_rate": 1.7202673168657318e-05,
+      "loss": 1.4250893592834473,
+      "step": 126
+    },
+    {
+      "epoch": 0.8683760683760684,
+      "grad_norm": 0.9403019547462463,
+      "learning_rate": 1.7150333577200062e-05,
+      "loss": 1.435499906539917,
+      "step": 127
+    },
+    {
+      "epoch": 0.8752136752136752,
+      "grad_norm": 0.863822877407074,
+      "learning_rate": 1.709759008798663e-05,
+      "loss": 1.409804105758667,
+      "step": 128
+    },
+    {
+      "epoch": 0.882051282051282,
+      "grad_norm": 0.9274973273277283,
+      "learning_rate": 1.7044445680315374e-05,
+      "loss": 1.433601975440979,
+      "step": 129
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 0.9369088411331177,
+      "learning_rate": 1.6990903356131125e-05,
+      "loss": 1.4320355653762817,
+      "step": 130
+    },
+    {
+      "epoch": 0.8957264957264958,
+      "grad_norm": 0.8703179955482483,
+      "learning_rate": 1.6936966139855664e-05,
+      "loss": 1.4167561531066895,
+      "step": 131
+    },
+    {
+      "epoch": 0.9025641025641026,
+      "grad_norm": 0.9144904017448425,
+      "learning_rate": 1.6882637078216867e-05,
+      "loss": 1.4223415851593018,
+      "step": 132
+    },
+    {
+      "epoch": 0.9094017094017094,
+      "grad_norm": 0.9126601219177246,
+      "learning_rate": 1.6827919240076612e-05,
+      "loss": 1.4480727910995483,
+      "step": 133
+    },
+    {
+      "epoch": 0.9162393162393162,
+      "grad_norm": 0.8591611981391907,
+      "learning_rate": 1.6772815716257414e-05,
+      "loss": 1.40584135055542,
+      "step": 134
+    },
+    {
+      "epoch": 0.9230769230769231,
+      "grad_norm": 0.8316404223442078,
+      "learning_rate": 1.671732961936785e-05,
+      "loss": 1.449837565422058,
+      "step": 135
+    },
+    {
+      "epoch": 0.9299145299145299,
+      "grad_norm": 0.8785284757614136,
+      "learning_rate": 1.6661464083626734e-05,
+      "loss": 1.440337061882019,
+      "step": 136
+    },
+    {
+      "epoch": 0.9367521367521368,
+      "grad_norm": 0.8786150813102722,
+      "learning_rate": 1.6605222264686085e-05,
+      "loss": 1.440657138824463,
+      "step": 137
+    },
+    {
+      "epoch": 0.9435897435897436,
+      "grad_norm": 0.8501399159431458,
+      "learning_rate": 1.6548607339452853e-05,
+      "loss": 1.397615671157837,
+      "step": 138
+    },
+    {
+      "epoch": 0.9504273504273504,
+      "grad_norm": 0.8737369775772095,
+      "learning_rate": 1.6491622505909483e-05,
+      "loss": 1.4285824298858643,
+      "step": 139
+    },
+    {
+      "epoch": 0.9572649572649573,
+      "grad_norm": 0.8369284868240356,
+      "learning_rate": 1.6434270982933272e-05,
+      "loss": 1.3992527723312378,
+      "step": 140
+    },
+    {
+      "epoch": 0.9641025641025641,
+      "grad_norm": 0.8740672469139099,
+      "learning_rate": 1.637655601011454e-05,
+      "loss": 1.4451634883880615,
+      "step": 141
+    },
+    {
+      "epoch": 0.9709401709401709,
+      "grad_norm": 0.873289942741394,
+      "learning_rate": 1.631848084757364e-05,
+      "loss": 1.3965365886688232,
+      "step": 142
+    },
+    {
+      "epoch": 0.9777777777777777,
+      "grad_norm": 0.9107730984687805,
+      "learning_rate": 1.6260048775776804e-05,
+      "loss": 1.4110256433486938,
+      "step": 143
+    },
+    {
+      "epoch": 0.9846153846153847,
+      "grad_norm": 0.8785021305084229,
+      "learning_rate": 1.6201263095350833e-05,
+      "loss": 1.4294975996017456,
+      "step": 144
+    },
+    {
+      "epoch": 0.9914529914529915,
+      "grad_norm": 0.8321818113327026,
+      "learning_rate": 1.6142127126896682e-05,
+      "loss": 1.4016475677490234,
+      "step": 145
+    },
+    {
+      "epoch": 0.9982905982905983,
+      "grad_norm": 0.8866358399391174,
+      "learning_rate": 1.6082644210801846e-05,
+      "loss": 1.3802778720855713,
+      "step": 146
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.623956561088562,
+      "learning_rate": 1.602281770705172e-05,
+      "loss": 1.4806468486785889,
+      "step": 147
+    },
+    {
+      "epoch": 1.0068376068376068,
+      "grad_norm": 1.1759995222091675,
+      "learning_rate": 1.5962650995039783e-05,
+      "loss": 1.3020893335342407,
+      "step": 148
+    },
+    {
+      "epoch": 1.0136752136752136,
+      "grad_norm": 1.0619325637817383,
+      "learning_rate": 1.5902147473376695e-05,
+      "loss": 1.2844979763031006,
+      "step": 149
+    },
+    {
+      "epoch": 1.0205128205128204,
+      "grad_norm": 0.9689248204231262,
+      "learning_rate": 1.5841310559698346e-05,
+      "loss": 1.3303570747375488,
+      "step": 150
+    },
+    {
+      "epoch": 1.0205128205128204,
+      "eval_loss": 1.4194111824035645,
+      "eval_runtime": 13.7873,
+      "eval_samples_per_second": 71.515,
+      "eval_steps_per_second": 8.994,
+      "step": 150
+    },
+    {
+      "epoch": 1.0273504273504273,
+      "grad_norm": 0.9153519868850708,
+      "learning_rate": 1.578014369047279e-05,
+      "loss": 1.3417026996612549,
+      "step": 151
+    },
+    {
+      "epoch": 1.0341880341880343,
+      "grad_norm": 0.9799442887306213,
+      "learning_rate": 1.5718650320806145e-05,
+      "loss": 1.293771743774414,
+      "step": 152
+    },
+    {
+      "epoch": 1.041025641025641,
+      "grad_norm": 1.0599641799926758,
+      "learning_rate": 1.56568339242474e-05,
+      "loss": 1.3117493391036987,
+      "step": 153
+    },
+    {
+      "epoch": 1.047863247863248,
+      "grad_norm": 0.9470742344856262,
+      "learning_rate": 1.5594697992592232e-05,
+      "loss": 1.2798222303390503,
+      "step": 154
+    },
+    {
+      "epoch": 1.0547008547008547,
+      "grad_norm": 0.9936373829841614,
+      "learning_rate": 1.5532246035685755e-05,
+      "loss": 1.3070576190948486,
+      "step": 155
+    },
+    {
+      "epoch": 1.0615384615384615,
+      "grad_norm": 0.9454049468040466,
+      "learning_rate": 1.5469481581224274e-05,
+      "loss": 1.3386294841766357,
+      "step": 156
+    },
+    {
+      "epoch": 1.0683760683760684,
+      "grad_norm": 0.9544969797134399,
+      "learning_rate": 1.5406408174555978e-05,
+      "loss": 1.302185297012329,
+      "step": 157
+    },
+    {
+      "epoch": 1.0752136752136752,
+      "grad_norm": 0.9065172076225281,
+      "learning_rate": 1.5343029378480733e-05,
+      "loss": 1.3039960861206055,
+      "step": 158
+    },
+    {
+      "epoch": 1.082051282051282,
+      "grad_norm": 0.867220938205719,
+      "learning_rate": 1.527934877304879e-05,
+      "loss": 1.3006991147994995,
+      "step": 159
+    },
+    {
+      "epoch": 1.0888888888888888,
+      "grad_norm": 0.9097728133201599,
+      "learning_rate": 1.5215369955358568e-05,
+      "loss": 1.2785807847976685,
+      "step": 160
+    },
+    {
+      "epoch": 1.0957264957264958,
+      "grad_norm": 0.9294711351394653,
+      "learning_rate": 1.5151096539353481e-05,
+      "loss": 1.3051520586013794,
+      "step": 161
+    },
+    {
+      "epoch": 1.1025641025641026,
+      "grad_norm": 0.9427935481071472,
+      "learning_rate": 1.5086532155617785e-05,
+      "loss": 1.3146125078201294,
+      "step": 162
+    },
+    {
+      "epoch": 1.1094017094017095,
+      "grad_norm": 0.9104812741279602,
+      "learning_rate": 1.5021680451171499e-05,
+      "loss": 1.2878390550613403,
+      "step": 163
+    },
+    {
+      "epoch": 1.1162393162393163,
+      "grad_norm": 0.8972042202949524,
+      "learning_rate": 1.4956545089264408e-05,
+      "loss": 1.3068175315856934,
+      "step": 164
+    },
+    {
+      "epoch": 1.123076923076923,
+      "grad_norm": 0.9040313959121704,
+      "learning_rate": 1.489112974916912e-05,
+      "loss": 1.2897545099258423,
+      "step": 165
+    },
+    {
+      "epoch": 1.12991452991453,
+      "grad_norm": 0.9337772727012634,
+      "learning_rate": 1.4825438125973263e-05,
+      "loss": 1.301710844039917,
+      "step": 166
+    },
+    {
+      "epoch": 1.1367521367521367,
+      "grad_norm": 0.8870652914047241,
+      "learning_rate": 1.4759473930370738e-05,
+      "loss": 1.3163543939590454,
+      "step": 167
+    },
+    {
+      "epoch": 1.1435897435897435,
+      "grad_norm": 0.8637550473213196,
+      "learning_rate": 1.4693240888452121e-05,
+      "loss": 1.3200492858886719,
+      "step": 168
+    },
+    {
+      "epoch": 1.1504273504273503,
+      "grad_norm": 0.8388293981552124,
+      "learning_rate": 1.4626742741494207e-05,
+      "loss": 1.307487964630127,
+      "step": 169
+    },
+    {
+      "epoch": 1.1572649572649572,
+      "grad_norm": 0.9050071835517883,
+      "learning_rate": 1.4559983245748639e-05,
+      "loss": 1.2808455228805542,
+      "step": 170
+    },
+    {
+      "epoch": 1.1641025641025642,
+      "grad_norm": 0.965691089630127,
+      "learning_rate": 1.449296617222978e-05,
+      "loss": 1.332348346710205,
+      "step": 171
+    },
+    {
+      "epoch": 1.170940170940171,
+      "grad_norm": 0.8704518675804138,
+      "learning_rate": 1.4425695306501656e-05,
+      "loss": 1.306895136833191,
+      "step": 172
+    },
+    {
+      "epoch": 1.1777777777777778,
+      "grad_norm": 0.8741139769554138,
+      "learning_rate": 1.4358174448464155e-05,
+      "loss": 1.2980892658233643,
+      "step": 173
+    },
+    {
+      "epoch": 1.1846153846153846,
+      "grad_norm": 0.9941467642784119,
+      "learning_rate": 1.4290407412138365e-05,
+      "loss": 1.2821602821350098,
+      "step": 174
+    },
+    {
+      "epoch": 1.1914529914529914,
+      "grad_norm": 0.9268296957015991,
+      "learning_rate": 1.4222398025451137e-05,
+      "loss": 1.302233338356018,
+      "step": 175
+    },
+    {
+      "epoch": 1.1982905982905983,
+      "grad_norm": 0.8978403806686401,
+      "learning_rate": 1.4154150130018867e-05,
+      "loss": 1.265356421470642,
+      "step": 176
+    },
+    {
+      "epoch": 1.205128205128205,
+      "grad_norm": 0.9328585267066956,
+      "learning_rate": 1.4085667580930482e-05,
+      "loss": 1.320369005203247,
+      "step": 177
+    },
+    {
+      "epoch": 1.2119658119658119,
+      "grad_norm": 0.9113616943359375,
+      "learning_rate": 1.4016954246529697e-05,
+      "loss": 1.2897846698760986,
+      "step": 178
+    },
+    {
+      "epoch": 1.218803418803419,
+      "grad_norm": 0.9257543087005615,
+      "learning_rate": 1.3948014008196486e-05,
+      "loss": 1.3368397951126099,
+      "step": 179
+    },
+    {
+      "epoch": 1.2256410256410257,
+      "grad_norm": 0.8960409164428711,
+      "learning_rate": 1.3878850760127848e-05,
+      "loss": 1.3266628980636597,
+      "step": 180
+    },
+    {
+      "epoch": 1.2324786324786325,
+      "grad_norm": 0.9111725687980652,
+      "learning_rate": 1.3809468409117845e-05,
+      "loss": 1.2674126625061035,
+      "step": 181
+    },
+    {
+      "epoch": 1.2393162393162394,
+      "grad_norm": 0.9564438462257385,
+      "learning_rate": 1.3739870874336898e-05,
+      "loss": 1.2953293323516846,
+      "step": 182
+    },
+    {
+      "epoch": 1.2461538461538462,
+      "grad_norm": 1.0268452167510986,
+      "learning_rate": 1.3670062087110423e-05,
+      "loss": 1.3054559230804443,
+      "step": 183
+    },
+    {
+      "epoch": 1.252991452991453,
+      "grad_norm": 0.8995468020439148,
+      "learning_rate": 1.3600045990696762e-05,
+      "loss": 1.3053619861602783,
+      "step": 184
+    },
+    {
+      "epoch": 1.2598290598290598,
+      "grad_norm": 0.8805936574935913,
+      "learning_rate": 1.352982654006444e-05,
+      "loss": 1.3140225410461426,
+      "step": 185
+    },
+    {
+      "epoch": 1.2666666666666666,
+      "grad_norm": 0.9060247540473938,
+      "learning_rate": 1.3459407701668762e-05,
+      "loss": 1.3046287298202515,
+      "step": 186
+    },
+    {
+      "epoch": 1.2735042735042734,
+      "grad_norm": 0.8805747628211975,
+      "learning_rate": 1.3388793453227766e-05,
+      "loss": 1.3128578662872314,
+      "step": 187
+    },
+    {
+      "epoch": 1.2803418803418802,
+      "grad_norm": 0.8997815847396851,
+      "learning_rate": 1.331798778349752e-05,
+      "loss": 1.3107125759124756,
+      "step": 188
+    },
+    {
+      "epoch": 1.287179487179487,
+      "grad_norm": 0.9592490792274475,
+      "learning_rate": 1.3246994692046837e-05,
+      "loss": 1.3269885778427124,
+      "step": 189
+    },
+    {
+      "epoch": 1.294017094017094,
+      "grad_norm": 0.9726372957229614,
+      "learning_rate": 1.3175818189031326e-05,
+      "loss": 1.337971806526184,
+      "step": 190
+    },
+    {
+      "epoch": 1.300854700854701,
+      "grad_norm": 0.9480524659156799,
+      "learning_rate": 1.3104462294966895e-05,
+      "loss": 1.287239670753479,
+      "step": 191
+    },
+    {
+      "epoch": 1.3076923076923077,
+      "grad_norm": 0.9071521162986755,
+      "learning_rate": 1.3032931040502627e-05,
+      "loss": 1.2962584495544434,
+      "step": 192
+    },
+    {
+      "epoch": 1.3145299145299145,
+      "grad_norm": 0.9058794379234314,
+      "learning_rate": 1.2961228466193116e-05,
+      "loss": 1.280348300933838,
+      "step": 193
+    },
+    {
+      "epoch": 1.3213675213675213,
+      "grad_norm": 0.9048560261726379,
+      "learning_rate": 1.2889358622270225e-05,
+      "loss": 1.3330844640731812,
+      "step": 194
+    },
+    {
+      "epoch": 1.3282051282051281,
+      "grad_norm": 0.945749819278717,
+      "learning_rate": 1.2817325568414299e-05,
+      "loss": 1.3170994520187378,
+      "step": 195
+    },
+    {
+      "epoch": 1.335042735042735,
+      "grad_norm": 0.9457980394363403,
+      "learning_rate": 1.2745133373524855e-05,
+      "loss": 1.3166072368621826,
+      "step": 196
+    },
+    {
+      "epoch": 1.341880341880342,
+      "grad_norm": 0.9297810196876526,
+      "learning_rate": 1.267278611549073e-05,
+      "loss": 1.3273459672927856,
+      "step": 197
+    },
+    {
+      "epoch": 1.3487179487179488,
+      "grad_norm": 0.9370136260986328,
+      "learning_rate": 1.2600287880959762e-05,
+      "loss": 1.3432742357254028,
+      "step": 198
+    },
+    {
+      "epoch": 1.3555555555555556,
+      "grad_norm": 0.904547393321991,
+      "learning_rate": 1.2527642765107919e-05,
+      "loss": 1.3275690078735352,
+      "step": 199
+    },
+    {
+      "epoch": 1.3623931623931624,
+      "grad_norm": 0.9034311175346375,
+      "learning_rate": 1.2454854871407993e-05,
+      "loss": 1.3097259998321533,
+      "step": 200
+    },
+    {
+      "epoch": 1.3623931623931624,
+      "eval_loss": 1.4159187078475952,
+      "eval_runtime": 13.7977,
+      "eval_samples_per_second": 71.461,
+      "eval_steps_per_second": 8.987,
+      "step": 200
+    },
+    {
+      "epoch": 1.3692307692307693,
+      "grad_norm": 0.8713945150375366,
+      "learning_rate": 1.2381928311397806e-05,
+      "loss": 1.2865114212036133,
+      "step": 201
+    },
+    {
+      "epoch": 1.376068376068376,
+      "grad_norm": 0.8947977423667908,
+      "learning_rate": 1.2308867204447958e-05,
+      "loss": 1.277376651763916,
+      "step": 202
+    },
+    {
+      "epoch": 1.3829059829059829,
+      "grad_norm": 0.9047794342041016,
+      "learning_rate": 1.2235675677529158e-05,
+      "loss": 1.288478970527649,
+      "step": 203
+    },
+    {
+      "epoch": 1.3897435897435897,
+      "grad_norm": 0.8953425884246826,
+      "learning_rate": 1.2162357864979073e-05,
+      "loss": 1.2861666679382324,
+      "step": 204
+    },
+    {
+      "epoch": 1.3965811965811965,
+      "grad_norm": 0.9369704723358154,
+      "learning_rate": 1.2088917908268822e-05,
+      "loss": 1.2857511043548584,
+      "step": 205
+    },
+    {
+      "epoch": 1.4034188034188033,
+      "grad_norm": 0.887296736240387,
+      "learning_rate": 1.2015359955769021e-05,
+      "loss": 1.2925364971160889,
+      "step": 206
+    },
+    {
+      "epoch": 1.4102564102564101,
+      "grad_norm": 0.875452995300293,
+      "learning_rate": 1.1941688162515468e-05,
+      "loss": 1.3017300367355347,
+      "step": 207
+    },
+    {
+      "epoch": 1.4170940170940172,
+      "grad_norm": 0.8836603760719299,
+      "learning_rate": 1.186790668997443e-05,
+      "loss": 1.2731754779815674,
+      "step": 208
+    },
+    {
+      "epoch": 1.423931623931624,
+      "grad_norm": 0.8866926431655884,
+      "learning_rate": 1.1794019705807584e-05,
+      "loss": 1.3009804487228394,
+      "step": 209
+    },
+    {
+      "epoch": 1.4307692307692308,
+      "grad_norm": 0.8414238095283508,
+      "learning_rate": 1.1720031383636585e-05,
+      "loss": 1.3082433938980103,
+      "step": 210
+    },
+    {
+      "epoch": 1.4376068376068376,
+      "grad_norm": 0.8662127256393433,
+      "learning_rate": 1.164594590280734e-05,
+      "loss": 1.2641851902008057,
+      "step": 211
+    },
+    {
+      "epoch": 1.4444444444444444,
+      "grad_norm": 0.9151703119277954,
+      "learning_rate": 1.15717674481539e-05,
+      "loss": 1.3064939975738525,
+      "step": 212
+    },
+    {
+      "epoch": 1.4512820512820512,
+      "grad_norm": 0.9086518883705139,
+      "learning_rate": 1.1497500209762102e-05,
+      "loss": 1.3118016719818115,
+      "step": 213
+    },
+    {
+      "epoch": 1.458119658119658,
+      "grad_norm": 0.9340091347694397,
+      "learning_rate": 1.1423148382732854e-05,
+      "loss": 1.3228766918182373,
+      "step": 214
+    },
+    {
+      "epoch": 1.464957264957265,
+      "grad_norm": 0.865403950214386,
+      "learning_rate": 1.1348716166945195e-05,
+      "loss": 1.2863235473632812,
+      "step": 215
+    },
+    {
+      "epoch": 1.471794871794872,
+      "grad_norm": 0.8879923224449158,
+      "learning_rate": 1.127420776681905e-05,
+      "loss": 1.3169306516647339,
+      "step": 216
+    },
+    {
+      "epoch": 1.4786324786324787,
+      "grad_norm": 0.8761537075042725,
+      "learning_rate": 1.1199627391077732e-05,
+      "loss": 1.2758698463439941,
+      "step": 217
+    },
+    {
+      "epoch": 1.4854700854700855,
+      "grad_norm": 0.905274510383606,
+      "learning_rate": 1.1124979252510209e-05,
+      "loss": 1.3158073425292969,
+      "step": 218
+    },
+    {
+      "epoch": 1.4923076923076923,
+      "grad_norm": 0.9052457213401794,
+      "learning_rate": 1.105026756773314e-05,
+      "loss": 1.3242114782333374,
+      "step": 219
+    },
+    {
+      "epoch": 1.4991452991452991,
+      "grad_norm": 0.8539809584617615,
+      "learning_rate": 1.0975496556952683e-05,
+      "loss": 1.295405387878418,
+      "step": 220
+    },
+    {
+      "epoch": 1.505982905982906,
+      "grad_norm": 0.9171442985534668,
+      "learning_rate": 1.0900670443726136e-05,
+      "loss": 1.3160406351089478,
+      "step": 221
+    },
+    {
+      "epoch": 1.5128205128205128,
+      "grad_norm": 0.877983570098877,
+      "learning_rate": 1.0825793454723325e-05,
+      "loss": 1.315245509147644,
+      "step": 222
+    },
+    {
+      "epoch": 1.5196581196581196,
+      "grad_norm": 0.8745649456977844,
+      "learning_rate": 1.0750869819487884e-05,
+      "loss": 1.3248393535614014,
+      "step": 223
+    },
+    {
+      "epoch": 1.5264957264957264,
+      "grad_norm": 0.8661232590675354,
+      "learning_rate": 1.0675903770198333e-05,
+      "loss": 1.2788147926330566,
+      "step": 224
+    },
+    {
+      "epoch": 1.5333333333333332,
+      "grad_norm": 0.8793037533760071,
+      "learning_rate": 1.0600899541429004e-05,
+      "loss": 1.288352608680725,
+      "step": 225
+    },
+    {
+      "epoch": 1.54017094017094,
+      "grad_norm": 0.9148133397102356,
+      "learning_rate": 1.0525861369910877e-05,
+      "loss": 1.3211514949798584,
+      "step": 226
+    },
+    {
+      "epoch": 1.547008547008547,
+      "grad_norm": 0.9006965160369873,
+      "learning_rate": 1.0450793494292223e-05,
+      "loss": 1.3327584266662598,
+      "step": 227
+    },
+    {
+      "epoch": 1.5538461538461539,
+      "grad_norm": 0.8701738119125366,
+      "learning_rate": 1.0375700154899208e-05,
+      "loss": 1.3010832071304321,
+      "step": 228
+    },
+    {
+      "epoch": 1.5606837606837607,
+      "grad_norm": 0.880436360836029,
+      "learning_rate": 1.0300585593496348e-05,
+      "loss": 1.3152333498001099,
+      "step": 229
+    },
+    {
+      "epoch": 1.5675213675213675,
+      "grad_norm": 0.8781545758247375,
+      "learning_rate": 1.0225454053046922e-05,
+      "loss": 1.2808175086975098,
+      "step": 230
+    },
+    {
+      "epoch": 1.5743589743589743,
+      "grad_norm": 0.8630225658416748,
+      "learning_rate": 1.0150309777473305e-05,
+      "loss": 1.2873480319976807,
+      "step": 231
+    },
+    {
+      "epoch": 1.5811965811965814,
+      "grad_norm": 0.8928260803222656,
+      "learning_rate": 1.007515701141722e-05,
+      "loss": 1.28458571434021,
+      "step": 232
+    },
+    {
+      "epoch": 1.5880341880341882,
+      "grad_norm": 0.8699108958244324,
+      "learning_rate": 1e-05,
+      "loss": 1.2885918617248535,
+      "step": 233
+    },
+    {
+      "epoch": 1.594871794871795,
+      "grad_norm": 0.8759332895278931,
+      "learning_rate": 9.924842988582783e-06,
+      "loss": 1.2787448167800903,
+      "step": 234
+    },
+    {
+      "epoch": 1.6017094017094018,
+      "grad_norm": 0.8956566452980042,
+      "learning_rate": 9.849690222526698e-06,
+      "loss": 1.304962158203125,
+      "step": 235
+    },
+    {
+      "epoch": 1.6085470085470086,
+      "grad_norm": 0.8675941824913025,
+      "learning_rate": 9.77454594695308e-06,
+      "loss": 1.2871266603469849,
+      "step": 236
+    },
+    {
+      "epoch": 1.6153846153846154,
+      "grad_norm": 0.9092246294021606,
+      "learning_rate": 9.699414406503655e-06,
+      "loss": 1.327986240386963,
+      "step": 237
+    },
+    {
+      "epoch": 1.6222222222222222,
+      "grad_norm": 0.8909919857978821,
+      "learning_rate": 9.624299845100795e-06,
+      "loss": 1.2647631168365479,
+      "step": 238
+    },
+    {
+      "epoch": 1.629059829059829,
+      "grad_norm": 0.8657082915306091,
+      "learning_rate": 9.549206505707778e-06,
+      "loss": 1.294311761856079,
+      "step": 239
+    },
+    {
+      "epoch": 1.6358974358974359,
+      "grad_norm": 0.8618515133857727,
+      "learning_rate": 9.474138630089124e-06,
+      "loss": 1.3014901876449585,
+      "step": 240
+    },
+    {
+      "epoch": 1.6427350427350427,
+      "grad_norm": 0.8630589246749878,
+      "learning_rate": 9.399100458570998e-06,
+      "loss": 1.293131709098816,
+      "step": 241
+    },
+    {
+      "epoch": 1.6495726495726495,
+      "grad_norm": 0.8735710978507996,
+      "learning_rate": 9.324096229801673e-06,
+      "loss": 1.290333867073059,
+      "step": 242
+    },
+    {
+      "epoch": 1.6564102564102563,
+      "grad_norm": 0.8574416041374207,
+      "learning_rate": 9.249130180512118e-06,
+      "loss": 1.3111311197280884,
+      "step": 243
+    },
+    {
+      "epoch": 1.6632478632478631,
+      "grad_norm": 0.9102303981781006,
+      "learning_rate": 9.174206545276678e-06,
+      "loss": 1.271691083908081,
+      "step": 244
+    },
+    {
+      "epoch": 1.67008547008547,
+      "grad_norm": 0.867579996585846,
+      "learning_rate": 9.099329556273866e-06,
+      "loss": 1.3228224515914917,
+      "step": 245
+    },
+    {
+      "epoch": 1.676923076923077,
+      "grad_norm": 0.8179166316986084,
+      "learning_rate": 9.024503443047318e-06,
+      "loss": 1.3084717988967896,
+      "step": 246
+    },
+    {
+      "epoch": 1.6837606837606838,
+      "grad_norm": 0.8923108577728271,
+      "learning_rate": 8.949732432266867e-06,
+      "loss": 1.2903640270233154,
+      "step": 247
+    },
+    {
+      "epoch": 1.6905982905982906,
+      "grad_norm": 0.9241410493850708,
+      "learning_rate": 8.875020747489795e-06,
+      "loss": 1.302449345588684,
+      "step": 248
+    },
+    {
+      "epoch": 1.6974358974358974,
+      "grad_norm": 0.8430485129356384,
+      "learning_rate": 8.800372608922272e-06,
+      "loss": 1.2765015363693237,
+      "step": 249
+    },
+    {
+      "epoch": 1.7042735042735044,
+      "grad_norm": 0.8592954874038696,
+      "learning_rate": 8.72579223318095e-06,
+      "loss": 1.317484736442566,
+      "step": 250
+    },
+    {
+      "epoch": 1.7042735042735044,
+      "eval_loss": 1.4088929891586304,
+      "eval_runtime": 13.7993,
+      "eval_samples_per_second": 71.453,
+      "eval_steps_per_second": 8.986,
+      "step": 250
+    },
+    {
+      "epoch": 1.7111111111111112,
+      "grad_norm": 0.916032612323761,
+      "learning_rate": 8.65128383305481e-06,
+      "loss": 1.300941824913025,
+      "step": 251
+    },
+    {
+      "epoch": 1.717948717948718,
+      "grad_norm": 0.8675019145011902,
+      "learning_rate": 8.576851617267151e-06,
+      "loss": 1.3122076988220215,
+      "step": 252
+    },
+    {
+      "epoch": 1.7247863247863249,
+      "grad_norm": 0.8310043811798096,
+      "learning_rate": 8.5024997902379e-06,
+      "loss": 1.3160263299942017,
+      "step": 253
+    },
+    {
+      "epoch": 1.7316239316239317,
+      "grad_norm": 0.8706823587417603,
+      "learning_rate": 8.428232551846101e-06,
+      "loss": 1.2773703336715698,
+      "step": 254
+    },
+    {
+      "epoch": 1.7384615384615385,
+      "grad_norm": 0.8875864744186401,
+      "learning_rate": 8.35405409719266e-06,
+      "loss": 1.288883090019226,
+      "step": 255
+    },
+    {
+      "epoch": 1.7452991452991453,
+      "grad_norm": 0.9055056571960449,
+      "learning_rate": 8.279968616363417e-06,
+      "loss": 1.3028110265731812,
+      "step": 256
+    },
+    {
+      "epoch": 1.7521367521367521,
+      "grad_norm": 0.905623197555542,
+      "learning_rate": 8.205980294192421e-06,
+      "loss": 1.3112901449203491,
+      "step": 257
+    },
+    {
+      "epoch": 1.758974358974359,
+      "grad_norm": 0.847100555896759,
+      "learning_rate": 8.132093310025572e-06,
+      "loss": 1.311500906944275,
+      "step": 258
+    },
+    {
+      "epoch": 1.7658119658119658,
+      "grad_norm": 0.8671444058418274,
+      "learning_rate": 8.058311837484537e-06,
+      "loss": 1.308862566947937,
+      "step": 259
+    },
+    {
+      "epoch": 1.7726495726495726,
+      "grad_norm": 0.844569742679596,
+      "learning_rate": 7.984640044230984e-06,
+      "loss": 1.3032524585723877,
+      "step": 260
+    },
+    {
+      "epoch": 1.7794871794871794,
+      "grad_norm": 0.9013960957527161,
+      "learning_rate": 7.911082091731182e-06,
+      "loss": 1.2791337966918945,
+      "step": 261
+    },
+    {
+      "epoch": 1.7863247863247862,
+      "grad_norm": 0.8714650869369507,
+      "learning_rate": 7.837642135020929e-06,
+      "loss": 1.2602317333221436,
+      "step": 262
+    },
+    {
+      "epoch": 1.793162393162393,
+      "grad_norm": 0.9024747014045715,
+      "learning_rate": 7.764324322470842e-06,
+      "loss": 1.279998540878296,
+      "step": 263
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.8714993596076965,
+      "learning_rate": 7.691132795552044e-06,
+      "loss": 1.284783959388733,
+      "step": 264
+    },
+    {
+      "epoch": 1.8068376068376069,
+      "grad_norm": 0.8371661305427551,
+      "learning_rate": 7.618071688602199e-06,
+      "loss": 1.3234297037124634,
+      "step": 265
+    },
+    {
+      "epoch": 1.8136752136752137,
+      "grad_norm": 0.8943991661071777,
+      "learning_rate": 7.545145128592009e-06,
+      "loss": 1.2969616651535034,
+      "step": 266
+    },
+    {
+      "epoch": 1.8205128205128205,
+      "grad_norm": 0.8753275275230408,
+      "learning_rate": 7.472357234892083e-06,
+      "loss": 1.2795380353927612,
+      "step": 267
+    },
+    {
+      "epoch": 1.8273504273504273,
+      "grad_norm": 0.8614721894264221,
+      "learning_rate": 7.3997121190402375e-06,
+      "loss": 1.3064361810684204,
+      "step": 268
+    },
+    {
+      "epoch": 1.8341880341880343,
+      "grad_norm": 0.853656530380249,
+      "learning_rate": 7.3272138845092725e-06,
+      "loss": 1.3017405271530151,
+      "step": 269
+    },
+    {
+      "epoch": 1.8410256410256411,
+      "grad_norm": 0.8655431866645813,
+      "learning_rate": 7.254866626475152e-06,
+      "loss": 1.304486632347107,
+      "step": 270
+    },
+    {
+      "epoch": 1.847863247863248,
+      "grad_norm": 0.87064528465271,
+      "learning_rate": 7.182674431585703e-06,
+      "loss": 1.2795239686965942,
+      "step": 271
+    },
+    {
+      "epoch": 1.8547008547008548,
+      "grad_norm": 0.8889244198799133,
+      "learning_rate": 7.110641377729778e-06,
+      "loss": 1.294914960861206,
+      "step": 272
+    },
+    {
+      "epoch": 1.8615384615384616,
+      "grad_norm": 0.9096329212188721,
+      "learning_rate": 7.038771533806884e-06,
+      "loss": 1.2885854244232178,
+      "step": 273
+    },
+    {
+      "epoch": 1.8683760683760684,
+      "grad_norm": 0.8873443007469177,
+      "learning_rate": 6.967068959497376e-06,
+      "loss": 1.297377347946167,
+      "step": 274
+    },
+    {
+      "epoch": 1.8752136752136752,
+      "grad_norm": 0.8182293772697449,
+      "learning_rate": 6.895537705033108e-06,
+      "loss": 1.3091909885406494,
+      "step": 275
+    },
+    {
+      "epoch": 1.882051282051282,
+      "grad_norm": 0.849620521068573,
+      "learning_rate": 6.824181810968675e-06,
+      "loss": 1.2712843418121338,
+      "step": 276
+    },
+    {
+      "epoch": 1.8888888888888888,
+      "grad_norm": 0.8953171372413635,
+      "learning_rate": 6.7530053079531664e-06,
+      "loss": 1.305629849433899,
+      "step": 277
+    },
+    {
+      "epoch": 1.8957264957264957,
+      "grad_norm": 0.8743292689323425,
+      "learning_rate": 6.6820122165024845e-06,
+      "loss": 1.3009774684906006,
+      "step": 278
+    },
+    {
+      "epoch": 1.9025641025641025,
+      "grad_norm": 0.8852370977401733,
+      "learning_rate": 6.6112065467722375e-06,
+      "loss": 1.2898852825164795,
+      "step": 279
+    },
+    {
+      "epoch": 1.9094017094017093,
+      "grad_norm": 0.8812291026115417,
+      "learning_rate": 6.540592298331239e-06,
+      "loss": 1.3161499500274658,
+      "step": 280
+    },
+    {
+      "epoch": 1.916239316239316,
+      "grad_norm": 0.8949340581893921,
+      "learning_rate": 6.4701734599355605e-06,
+      "loss": 1.2947360277175903,
+      "step": 281
+    },
+    {
+      "epoch": 1.9230769230769231,
+      "grad_norm": 0.8372949957847595,
+      "learning_rate": 6.3999540093032396e-06,
+      "loss": 1.263576626777649,
+      "step": 282
+    },
+    {
+      "epoch": 1.92991452991453,
+      "grad_norm": 0.8882158398628235,
+      "learning_rate": 6.329937912889582e-06,
+      "loss": 1.2893450260162354,
+      "step": 283
+    },
+    {
+      "epoch": 1.9367521367521368,
+      "grad_norm": 0.838527500629425,
+      "learning_rate": 6.260129125663106e-06,
+      "loss": 1.2985213994979858,
+      "step": 284
+    },
+    {
+      "epoch": 1.9435897435897436,
+      "grad_norm": 0.8823593258857727,
+      "learning_rate": 6.1905315908821584e-06,
+      "loss": 1.306897521018982,
+      "step": 285
+    },
+    {
+      "epoch": 1.9504273504273504,
+      "grad_norm": 0.8618027567863464,
+      "learning_rate": 6.121149239872151e-06,
+      "loss": 1.2990589141845703,
+      "step": 286
+    },
+    {
+      "epoch": 1.9572649572649574,
+      "grad_norm": 0.8389527797698975,
+      "learning_rate": 6.051985991803517e-06,
+      "loss": 1.2886924743652344,
+      "step": 287
+    },
+    {
+      "epoch": 1.9641025641025642,
+      "grad_norm": 0.8738916516304016,
+      "learning_rate": 5.983045753470308e-06,
+      "loss": 1.3003113269805908,
+      "step": 288
+    },
+    {
+      "epoch": 1.970940170940171,
+      "grad_norm": 0.8567415475845337,
+      "learning_rate": 5.91433241906952e-06,
+      "loss": 1.285038948059082,
+      "step": 289
+    },
+    {
+      "epoch": 1.9777777777777779,
+      "grad_norm": 0.8555871248245239,
+      "learning_rate": 5.845849869981137e-06,
+      "loss": 1.2825312614440918,
+      "step": 290
+    },
+    {
+      "epoch": 1.9846153846153847,
+      "grad_norm": 0.8524548411369324,
+      "learning_rate": 5.7776019745488665e-06,
+      "loss": 1.3078036308288574,
+      "step": 291
+    },
+    {
+      "epoch": 1.9914529914529915,
+      "grad_norm": 0.8610931634902954,
+      "learning_rate": 5.709592587861637e-06,
+      "loss": 1.2933144569396973,
+      "step": 292
+    },
+    {
+      "epoch": 1.9982905982905983,
+      "grad_norm": 0.8547428250312805,
+      "learning_rate": 5.641825551535849e-06,
+      "loss": 1.2723497152328491,
+      "step": 293
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.6815301179885864,
+      "learning_rate": 5.574304693498346e-06,
+      "loss": 1.260840892791748,
+      "step": 294
+    },
+    {
+      "epoch": 2.006837606837607,
+      "grad_norm": 1.1894463300704956,
+      "learning_rate": 5.507033827770225e-06,
+      "loss": 1.2158567905426025,
+      "step": 295
+    },
+    {
+      "epoch": 2.0136752136752136,
+      "grad_norm": 1.1574074029922485,
+      "learning_rate": 5.440016754251364e-06,
+      "loss": 1.188340663909912,
+      "step": 296
+    },
+    {
+      "epoch": 2.0205128205128204,
+      "grad_norm": 0.9981362819671631,
+      "learning_rate": 5.373257258505798e-06,
+      "loss": 1.1729332208633423,
+      "step": 297
+    },
+    {
+      "epoch": 2.0273504273504273,
+      "grad_norm": 1.0496586561203003,
+      "learning_rate": 5.306759111547881e-06,
+      "loss": 1.1735312938690186,
+      "step": 298
+    },
+    {
+      "epoch": 2.034188034188034,
+      "grad_norm": 0.9409749507904053,
+      "learning_rate": 5.240526069629265e-06,
+      "loss": 1.198318600654602,
+      "step": 299
+    },
+    {
+      "epoch": 2.041025641025641,
+      "grad_norm": 0.9382721781730652,
+      "learning_rate": 5.174561874026741e-06,
+      "loss": 1.2194828987121582,
+      "step": 300
+    },
+    {
+      "epoch": 2.041025641025641,
+      "eval_loss": 1.4175776243209839,
+      "eval_runtime": 13.7699,
+      "eval_samples_per_second": 71.606,
+      "eval_steps_per_second": 9.005,
+      "step": 300
+    },
+    {
+      "epoch": 2.0478632478632477,
+      "grad_norm": 0.936610996723175,
+      "learning_rate": 5.1088702508308815e-06,
+      "loss": 1.2439236640930176,
+      "step": 301
+    },
+    {
+      "epoch": 2.0547008547008545,
+      "grad_norm": 0.9476950764656067,
+      "learning_rate": 5.043454910735595e-06,
+      "loss": 1.2119914293289185,
+      "step": 302
+    },
+    {
+      "epoch": 2.0615384615384613,
+      "grad_norm": 0.975143313407898,
+      "learning_rate": 4.978319548828504e-06,
+      "loss": 1.1766479015350342,
+      "step": 303
+    },
+    {
+      "epoch": 2.0683760683760686,
+      "grad_norm": 0.9535344243049622,
+      "learning_rate": 4.913467844382217e-06,
+      "loss": 1.2154781818389893,
+      "step": 304
+    },
+    {
+      "epoch": 2.0752136752136754,
+      "grad_norm": 0.9839100241661072,
+      "learning_rate": 4.848903460646522e-06,
+      "loss": 1.1973791122436523,
+      "step": 305
+    },
+    {
+      "epoch": 2.082051282051282,
+      "grad_norm": 0.9296822547912598,
+      "learning_rate": 4.784630044641435e-06,
+      "loss": 1.2077343463897705,
+      "step": 306
+    },
+    {
+      "epoch": 2.088888888888889,
+      "grad_norm": 0.9518297910690308,
+      "learning_rate": 4.720651226951213e-06,
+      "loss": 1.2044742107391357,
+      "step": 307
+    },
+    {
+      "epoch": 2.095726495726496,
+      "grad_norm": 0.9024590253829956,
+      "learning_rate": 4.65697062151927e-06,
+      "loss": 1.2214324474334717,
+      "step": 308
+    },
+    {
+      "epoch": 2.1025641025641026,
+      "grad_norm": 0.8939958214759827,
+      "learning_rate": 4.593591825444028e-06,
+      "loss": 1.230959177017212,
+      "step": 309
+    },
+    {
+      "epoch": 2.1094017094017095,
+      "grad_norm": 0.9565759301185608,
+      "learning_rate": 4.530518418775734e-06,
+      "loss": 1.2308049201965332,
+      "step": 310
+    },
+    {
+      "epoch": 2.1162393162393163,
+      "grad_norm": 0.8952397704124451,
+      "learning_rate": 4.467753964314245e-06,
+      "loss": 1.2218645811080933,
+      "step": 311
+    },
+    {
+      "epoch": 2.123076923076923,
+      "grad_norm": 0.9192137122154236,
+      "learning_rate": 4.40530200740777e-06,
+      "loss": 1.1945393085479736,
+      "step": 312
+    },
+    {
+      "epoch": 2.12991452991453,
+      "grad_norm": 0.9151750206947327,
+      "learning_rate": 4.343166075752605e-06,
+      "loss": 1.1909265518188477,
+      "step": 313
+    },
+    {
+      "epoch": 2.1367521367521367,
+      "grad_norm": 0.912064790725708,
+      "learning_rate": 4.281349679193862e-06,
+      "loss": 1.176002860069275,
+      "step": 314
+    },
+    {
+      "epoch": 2.1435897435897435,
+      "grad_norm": 0.9001777172088623,
+      "learning_rate": 4.219856309527212e-06,
+      "loss": 1.2102347612380981,
+      "step": 315
+    },
+    {
+      "epoch": 2.1504273504273503,
+      "grad_norm": 0.9100410342216492,
+      "learning_rate": 4.1586894403016576e-06,
+      "loss": 1.2215776443481445,
+      "step": 316
+    },
+    {
+      "epoch": 2.157264957264957,
+      "grad_norm": 0.8823668360710144,
+      "learning_rate": 4.097852526623307e-06,
+      "loss": 1.1972424983978271,
+      "step": 317
+    },
+    {
+      "epoch": 2.164102564102564,
+      "grad_norm": 0.8945139050483704,
+      "learning_rate": 4.03734900496022e-06,
+      "loss": 1.2440537214279175,
+      "step": 318
+    },
+    {
+      "epoch": 2.1709401709401708,
+      "grad_norm": 0.858863890171051,
+      "learning_rate": 3.9771822929482825e-06,
+      "loss": 1.2240134477615356,
+      "step": 319
+    },
+    {
+      "epoch": 2.1777777777777776,
+      "grad_norm": 0.9579023122787476,
+      "learning_rate": 3.917355789198157e-06,
+      "loss": 1.1975905895233154,
+      "step": 320
+    },
+    {
+      "epoch": 2.184615384615385,
+      "grad_norm": 0.8992065191268921,
+      "learning_rate": 3.857872873103322e-06,
+      "loss": 1.2251243591308594,
+      "step": 321
+    },
+    {
+      "epoch": 2.1914529914529917,
+      "grad_norm": 0.8930969834327698,
+      "learning_rate": 3.7987369046491684e-06,
+      "loss": 1.1994602680206299,
+      "step": 322
+    },
+    {
+      "epoch": 2.1982905982905985,
+      "grad_norm": 0.8879907727241516,
+      "learning_rate": 3.7399512242231994e-06,
+      "loss": 1.2023355960845947,
+      "step": 323
+    },
+    {
+      "epoch": 2.2051282051282053,
+      "grad_norm": 0.8827998638153076,
+      "learning_rate": 3.6815191524263628e-06,
+      "loss": 1.1980074644088745,
+      "step": 324
+    },
+    {
+      "epoch": 2.211965811965812,
+      "grad_norm": 0.9081103801727295,
+      "learning_rate": 3.623443989885462e-06,
+      "loss": 1.2123109102249146,
+      "step": 325
+    },
+    {
+      "epoch": 2.218803418803419,
+      "grad_norm": 0.8658437132835388,
+      "learning_rate": 3.565729017066729e-06,
+      "loss": 1.1860473155975342,
+      "step": 326
+    },
+    {
+      "epoch": 2.2256410256410257,
+      "grad_norm": 0.8716210722923279,
+      "learning_rate": 3.508377494090521e-06,
+      "loss": 1.246274471282959,
+      "step": 327
+    },
+    {
+      "epoch": 2.2324786324786325,
+      "grad_norm": 0.8930105566978455,
+      "learning_rate": 3.4513926605471504e-06,
+      "loss": 1.2249618768692017,
+      "step": 328
+    },
+    {
+      "epoch": 2.2393162393162394,
+      "grad_norm": 0.8859133720397949,
+      "learning_rate": 3.3947777353139188e-06,
+      "loss": 1.2300435304641724,
+      "step": 329
+    },
+    {
+      "epoch": 2.246153846153846,
+      "grad_norm": 0.876879096031189,
+      "learning_rate": 3.338535916373267e-06,
+      "loss": 1.226067066192627,
+      "step": 330
+    },
+    {
+      "epoch": 2.252991452991453,
+      "grad_norm": 0.8582764863967896,
+      "learning_rate": 3.2826703806321526e-06,
+      "loss": 1.2141978740692139,
+      "step": 331
+    },
+    {
+      "epoch": 2.25982905982906,
+      "grad_norm": 0.9050947427749634,
+      "learning_rate": 3.2271842837425917e-06,
+      "loss": 1.199479103088379,
+      "step": 332
+    },
+    {
+      "epoch": 2.2666666666666666,
+      "grad_norm": 0.8743166923522949,
+      "learning_rate": 3.1720807599233903e-06,
+      "loss": 1.2526406049728394,
+      "step": 333
+    },
+    {
+      "epoch": 2.2735042735042734,
+      "grad_norm": 0.9142019152641296,
+      "learning_rate": 3.1173629217831345e-06,
+      "loss": 1.1963285207748413,
+      "step": 334
+    },
+    {
+      "epoch": 2.2803418803418802,
+      "grad_norm": 0.8888209462165833,
+      "learning_rate": 3.063033860144339e-06,
+      "loss": 1.209120512008667,
+      "step": 335
+    },
+    {
+      "epoch": 2.287179487179487,
+      "grad_norm": 0.8925624489784241,
+      "learning_rate": 3.0090966438688774e-06,
+      "loss": 1.1804795265197754,
+      "step": 336
+    },
+    {
+      "epoch": 2.294017094017094,
+      "grad_norm": 0.9087634682655334,
+      "learning_rate": 2.9555543196846293e-06,
+      "loss": 1.2147403955459595,
+      "step": 337
+    },
+    {
+      "epoch": 2.3008547008547007,
+      "grad_norm": 0.9099950194358826,
+      "learning_rate": 2.9024099120133674e-06,
+      "loss": 1.2237548828125,
+      "step": 338
+    },
+    {
+      "epoch": 2.3076923076923075,
+      "grad_norm": 0.8658971786499023,
+      "learning_rate": 2.8496664227999417e-06,
+      "loss": 1.2072890996932983,
+      "step": 339
+    },
+    {
+      "epoch": 2.3145299145299143,
+      "grad_norm": 0.8897408843040466,
+      "learning_rate": 2.7973268313426836e-06,
+      "loss": 1.2147533893585205,
+      "step": 340
+    },
+    {
+      "epoch": 2.3213675213675216,
+      "grad_norm": 0.8564779758453369,
+      "learning_rate": 2.745394094125141e-06,
+      "loss": 1.2456395626068115,
+      "step": 341
+    },
+    {
+      "epoch": 2.3282051282051284,
+      "grad_norm": 0.8652287125587463,
+      "learning_rate": 2.6938711446490607e-06,
+      "loss": 1.2109252214431763,
+      "step": 342
+    },
+    {
+      "epoch": 2.335042735042735,
+      "grad_norm": 0.8643552660942078,
+      "learning_rate": 2.642760893268684e-06,
+      "loss": 1.1878920793533325,
+      "step": 343
+    },
+    {
+      "epoch": 2.341880341880342,
+      "grad_norm": 0.8824043869972229,
+      "learning_rate": 2.5920662270263653e-06,
+      "loss": 1.1911319494247437,
+      "step": 344
+    },
+    {
+      "epoch": 2.348717948717949,
+      "grad_norm": 0.8898422122001648,
+      "learning_rate": 2.541790009489474e-06,
+      "loss": 1.193242073059082,
+      "step": 345
+    },
+    {
+      "epoch": 2.3555555555555556,
+      "grad_norm": 0.8772786259651184,
+      "learning_rate": 2.491935080588658e-06,
+      "loss": 1.1836318969726562,
+      "step": 346
+    },
+    {
+      "epoch": 2.3623931623931624,
+      "grad_norm": 0.8587839603424072,
+      "learning_rate": 2.4425042564574186e-06,
+      "loss": 1.2118480205535889,
+      "step": 347
+    },
+    {
+      "epoch": 2.3692307692307693,
+      "grad_norm": 0.8739367127418518,
+      "learning_rate": 2.3935003292730295e-06,
+      "loss": 1.2201834917068481,
+      "step": 348
+    },
+    {
+      "epoch": 2.376068376068376,
+      "grad_norm": 0.8904187679290771,
+      "learning_rate": 2.344926067098836e-06,
+      "loss": 1.1912821531295776,
+      "step": 349
+    },
+    {
+      "epoch": 2.382905982905983,
+      "grad_norm": 0.8717731237411499,
+      "learning_rate": 2.2967842137278706e-06,
+      "loss": 1.2726080417633057,
+      "step": 350
+    },
+    {
+      "epoch": 2.382905982905983,
+      "eval_loss": 1.422935962677002,
+      "eval_runtime": 13.7932,
+      "eval_samples_per_second": 71.484,
+      "eval_steps_per_second": 8.99,
+      "step": 350
+    },
+    {
+      "epoch": 2.3897435897435897,
+      "grad_norm": 0.8623640537261963,
+      "learning_rate": 2.249077488527891e-06,
+      "loss": 1.1917917728424072,
+      "step": 351
+    },
+    {
+      "epoch": 2.3965811965811965,
+      "grad_norm": 0.9295298457145691,
+      "learning_rate": 2.201808586287757e-06,
+      "loss": 1.195438027381897,
+      "step": 352
+    },
+    {
+      "epoch": 2.4034188034188033,
+      "grad_norm": 0.8726212382316589,
+      "learning_rate": 2.15498017706521e-06,
+      "loss": 1.1993173360824585,
+      "step": 353
+    },
+    {
+      "epoch": 2.41025641025641,
+      "grad_norm": 0.8750997185707092,
+      "learning_rate": 2.1085949060360654e-06,
+      "loss": 1.2198253870010376,
+      "step": 354
+    },
+    {
+      "epoch": 2.417094017094017,
+      "grad_norm": 0.8799977898597717,
+      "learning_rate": 2.0626553933447734e-06,
+      "loss": 1.1714023351669312,
+      "step": 355
+    },
+    {
+      "epoch": 2.4239316239316238,
+      "grad_norm": 0.9106065034866333,
+      "learning_rate": 2.01716423395644e-06,
+      "loss": 1.2285724878311157,
+      "step": 356
+    },
+    {
+      "epoch": 2.430769230769231,
+      "grad_norm": 0.8555257320404053,
+      "learning_rate": 1.9721239975102313e-06,
+      "loss": 1.1813218593597412,
+      "step": 357
+    },
+    {
+      "epoch": 2.437606837606838,
+      "grad_norm": 0.8696889877319336,
+      "learning_rate": 1.9275372281742242e-06,
+      "loss": 1.2316478490829468,
+      "step": 358
+    },
+    {
+      "epoch": 2.4444444444444446,
+      "grad_norm": 0.9041836857795715,
+      "learning_rate": 1.8834064445016952e-06,
+      "loss": 1.2227892875671387,
+      "step": 359
+    },
+    {
+      "epoch": 2.4512820512820515,
+      "grad_norm": 0.8697716593742371,
+      "learning_rate": 1.8397341392888679e-06,
+      "loss": 1.224617600440979,
+      "step": 360
+    },
+    {
+      "epoch": 2.4581196581196583,
+      "grad_norm": 0.8882873058319092,
+      "learning_rate": 1.7965227794340879e-06,
+      "loss": 1.1995422840118408,
+      "step": 361
+    },
+    {
+      "epoch": 2.464957264957265,
+      "grad_norm": 0.8834539651870728,
+      "learning_rate": 1.7537748057984861e-06,
+      "loss": 1.2222732305526733,
+      "step": 362
+    },
+    {
+      "epoch": 2.471794871794872,
+      "grad_norm": 0.899989128112793,
+      "learning_rate": 1.7114926330680958e-06,
+      "loss": 1.2143341302871704,
+      "step": 363
+    },
+    {
+      "epoch": 2.4786324786324787,
+      "grad_norm": 0.8635477423667908,
+      "learning_rate": 1.6696786496174578e-06,
+      "loss": 1.2323466539382935,
+      "step": 364
+    },
+    {
+      "epoch": 2.4854700854700855,
+      "grad_norm": 0.8827865719795227,
+      "learning_rate": 1.6283352173747148e-06,
+      "loss": 1.1907907724380493,
+      "step": 365
+    },
+    {
+      "epoch": 2.4923076923076923,
+      "grad_norm": 0.8702190518379211,
+      "learning_rate": 1.587464671688187e-06,
+      "loss": 1.201211929321289,
+      "step": 366
+    },
+    {
+      "epoch": 2.499145299145299,
+      "grad_norm": 0.8626653552055359,
+      "learning_rate": 1.5470693211944643e-06,
+      "loss": 1.1894201040267944,
+      "step": 367
+    },
+    {
+      "epoch": 2.505982905982906,
+      "grad_norm": 0.879705011844635,
+      "learning_rate": 1.5071514476879878e-06,
+      "loss": 1.2102407217025757,
+      "step": 368
+    },
+    {
+      "epoch": 2.5128205128205128,
+      "grad_norm": 0.8780226707458496,
+      "learning_rate": 1.4677133059921634e-06,
+      "loss": 1.235593557357788,
+      "step": 369
+    },
+    {
+      "epoch": 2.5196581196581196,
+      "grad_norm": 0.8804551362991333,
+      "learning_rate": 1.4287571238320053e-06,
+      "loss": 1.2265985012054443,
+      "step": 370
+    },
+    {
+      "epoch": 2.5264957264957264,
+      "grad_norm": 0.8670660257339478,
+      "learning_rate": 1.3902851017082863e-06,
+      "loss": 1.1925873756408691,
+      "step": 371
+    },
+    {
+      "epoch": 2.533333333333333,
+      "grad_norm": 0.8729323744773865,
+      "learning_rate": 1.3522994127732415e-06,
+      "loss": 1.20308518409729,
+      "step": 372
+    },
+    {
+      "epoch": 2.54017094017094,
+      "grad_norm": 0.8794763088226318,
+      "learning_rate": 1.3148022027078223e-06,
+      "loss": 1.2204805612564087,
+      "step": 373
+    },
+    {
+      "epoch": 2.547008547008547,
+      "grad_norm": 0.870823323726654,
+      "learning_rate": 1.2777955896004812e-06,
+      "loss": 1.2257260084152222,
+      "step": 374
+    },
+    {
+      "epoch": 2.5538461538461537,
+      "grad_norm": 0.8570955991744995,
+      "learning_rate": 1.2412816638275406e-06,
+      "loss": 1.2166708707809448,
+      "step": 375
+    },
+    {
+      "epoch": 2.5606837606837605,
+      "grad_norm": 0.8496021628379822,
+      "learning_rate": 1.2052624879351105e-06,
+      "loss": 1.1956825256347656,
+      "step": 376
+    },
+    {
+      "epoch": 2.5675213675213673,
+      "grad_norm": 0.8563467860221863,
+      "learning_rate": 1.1697400965225746e-06,
+      "loss": 1.2383781671524048,
+      "step": 377
+    },
+    {
+      "epoch": 2.574358974358974,
+      "grad_norm": 0.8653855919837952,
+      "learning_rate": 1.134716496127679e-06,
+      "loss": 1.218265414237976,
+      "step": 378
+    },
+    {
+      "epoch": 2.5811965811965814,
+      "grad_norm": 0.8653165698051453,
+      "learning_rate": 1.1001936651131717e-06,
+      "loss": 1.226462483406067,
+      "step": 379
+    },
+    {
+      "epoch": 2.588034188034188,
+      "grad_norm": 0.8810314536094666,
+      "learning_rate": 1.0661735535550666e-06,
+      "loss": 1.176276445388794,
+      "step": 380
+    },
+    {
+      "epoch": 2.594871794871795,
+      "grad_norm": 0.8538199663162231,
+      "learning_rate": 1.0326580831324816e-06,
+      "loss": 1.2393090724945068,
+      "step": 381
+    },
+    {
+      "epoch": 2.601709401709402,
+      "grad_norm": 0.849739134311676,
+      "learning_rate": 9.996491470190917e-07,
+      "loss": 1.2231508493423462,
+      "step": 382
+    },
+    {
+      "epoch": 2.6085470085470086,
+      "grad_norm": 0.891149640083313,
+      "learning_rate": 9.671486097761918e-07,
+      "loss": 1.2225626707077026,
+      "step": 383
+    },
+    {
+      "epoch": 2.6153846153846154,
+      "grad_norm": 0.8668763637542725,
+      "learning_rate": 9.351583072473713e-07,
+      "loss": 1.2182505130767822,
+      "step": 384
+    },
+    {
+      "epoch": 2.6222222222222222,
+      "grad_norm": 0.8931220173835754,
+      "learning_rate": 9.036800464548157e-07,
+      "loss": 1.1996538639068604,
+      "step": 385
+    },
+    {
+      "epoch": 2.629059829059829,
+      "grad_norm": 0.923690140247345,
+      "learning_rate": 8.727156054972374e-07,
+      "loss": 1.238417148590088,
+      "step": 386
+    },
+    {
+      "epoch": 2.635897435897436,
+      "grad_norm": 0.9119179844856262,
+      "learning_rate": 8.42266733449425e-07,
+      "loss": 1.226833462715149,
+      "step": 387
+    },
+    {
+      "epoch": 2.6427350427350427,
+      "grad_norm": 0.8686037659645081,
+      "learning_rate": 8.123351502634625e-07,
+      "loss": 1.1834110021591187,
+      "step": 388
+    },
+    {
+      "epoch": 2.6495726495726495,
+      "grad_norm": 0.8596007823944092,
+      "learning_rate": 7.829225466715551e-07,
+      "loss": 1.1922662258148193,
+      "step": 389
+    },
+    {
+      "epoch": 2.6564102564102563,
+      "grad_norm": 0.8411397337913513,
+      "learning_rate": 7.540305840905371e-07,
+      "loss": 1.2220802307128906,
+      "step": 390
+    },
+    {
+      "epoch": 2.663247863247863,
+      "grad_norm": 0.8473320007324219,
+      "learning_rate": 7.256608945280319e-07,
+      "loss": 1.176034688949585,
+      "step": 391
+    },
+    {
+      "epoch": 2.67008547008547,
+      "grad_norm": 0.8465791940689087,
+      "learning_rate": 6.978150804902451e-07,
+      "loss": 1.2118513584136963,
+      "step": 392
+    },
+    {
+      "epoch": 2.676923076923077,
+      "grad_norm": 0.8556994199752808,
+      "learning_rate": 6.704947148914608e-07,
+      "loss": 1.2035595178604126,
+      "step": 393
+    },
+    {
+      "epoch": 2.683760683760684,
+      "grad_norm": 0.8603663444519043,
+      "learning_rate": 6.437013409651849e-07,
+      "loss": 1.2043513059616089,
+      "step": 394
+    },
+    {
+      "epoch": 2.690598290598291,
+      "grad_norm": 0.8347552418708801,
+      "learning_rate": 6.174364721769744e-07,
+      "loss": 1.260666847229004,
+      "step": 395
+    },
+    {
+      "epoch": 2.6974358974358976,
+      "grad_norm": 0.867624044418335,
+      "learning_rate": 5.917015921389569e-07,
+      "loss": 1.2071622610092163,
+      "step": 396
+    },
+    {
+      "epoch": 2.7042735042735044,
+      "grad_norm": 0.8668217062950134,
+      "learning_rate": 5.664981545260073e-07,
+      "loss": 1.197313904762268,
+      "step": 397
+    },
+    {
+      "epoch": 2.7111111111111112,
+      "grad_norm": 0.8758941292762756,
+      "learning_rate": 5.418275829936537e-07,
+      "loss": 1.1844048500061035,
+      "step": 398
+    },
+    {
+      "epoch": 2.717948717948718,
+      "grad_norm": 0.866844892501831,
+      "learning_rate": 5.176912710976467e-07,
+      "loss": 1.1971948146820068,
+      "step": 399
+    },
+    {
+      "epoch": 2.724786324786325,
+      "grad_norm": 0.8587160110473633,
+      "learning_rate": 4.940905822152454e-07,
+      "loss": 1.1895333528518677,
+      "step": 400
+    },
+    {
+      "epoch": 2.724786324786325,
+      "eval_loss": 1.4216117858886719,
+      "eval_runtime": 13.782,
+      "eval_samples_per_second": 71.543,
+      "eval_steps_per_second": 8.997,
+      "step": 400
+    },
+    {
+      "epoch": 2.7316239316239317,
+      "grad_norm": 0.8763930201530457,
+      "learning_rate": 4.710268494682146e-07,
+      "loss": 1.1914920806884766,
+      "step": 401
+    },
+    {
+      "epoch": 2.7384615384615385,
+      "grad_norm": 0.8831557035446167,
+      "learning_rate": 4.485013756475076e-07,
+      "loss": 1.1900079250335693,
+      "step": 402
+    },
+    {
+      "epoch": 2.7452991452991453,
+      "grad_norm": 0.866532027721405,
+      "learning_rate": 4.265154331396815e-07,
+      "loss": 1.1844745874404907,
+      "step": 403
+    },
+    {
+      "epoch": 2.752136752136752,
+      "grad_norm": 0.8787288069725037,
+      "learning_rate": 4.0507026385502747e-07,
+      "loss": 1.2126126289367676,
+      "step": 404
+    },
+    {
+      "epoch": 2.758974358974359,
+      "grad_norm": 0.8669936060905457,
+      "learning_rate": 3.841670791574137e-07,
+      "loss": 1.229267954826355,
+      "step": 405
+    },
+    {
+      "epoch": 2.7658119658119658,
+      "grad_norm": 0.8436914086341858,
+      "learning_rate": 3.638070597958665e-07,
+      "loss": 1.1994611024856567,
+      "step": 406
+    },
+    {
+      "epoch": 2.7726495726495726,
+      "grad_norm": 0.8477561473846436,
+      "learning_rate": 3.439913558378705e-07,
+      "loss": 1.2160733938217163,
+      "step": 407
+    },
+    {
+      "epoch": 2.7794871794871794,
+      "grad_norm": 0.9217561483383179,
+      "learning_rate": 3.2472108660439706e-07,
+      "loss": 1.1882672309875488,
+      "step": 408
+    },
+    {
+      "epoch": 2.786324786324786,
+      "grad_norm": 0.8692064881324768,
+      "learning_rate": 3.059973406066963e-07,
+      "loss": 1.186108112335205,
+      "step": 409
+    },
+    {
+      "epoch": 2.793162393162393,
+      "grad_norm": 0.8593800067901611,
+      "learning_rate": 2.878211754847926e-07,
+      "loss": 1.2128371000289917,
+      "step": 410
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.8875913023948669,
+      "learning_rate": 2.701936179477516e-07,
+      "loss": 1.1906311511993408,
+      "step": 411
+    },
+    {
+      "epoch": 2.8068376068376066,
+      "grad_norm": 0.8833599090576172,
+      "learning_rate": 2.5311566371568505e-07,
+      "loss": 1.1937415599822998,
+      "step": 412
+    },
+    {
+      "epoch": 2.8136752136752134,
+      "grad_norm": 0.8523573279380798,
+      "learning_rate": 2.3658827746349976e-07,
+      "loss": 1.1862268447875977,
+      "step": 413
+    },
+    {
+      "epoch": 2.8205128205128203,
+      "grad_norm": 0.8653656244277954,
+      "learning_rate": 2.206123927664161e-07,
+      "loss": 1.2255483865737915,
+      "step": 414
+    },
+    {
+      "epoch": 2.827350427350427,
+      "grad_norm": 0.874724805355072,
+      "learning_rate": 2.0518891204722169e-07,
+      "loss": 1.2177876234054565,
+      "step": 415
+    },
+    {
+      "epoch": 2.8341880341880343,
+      "grad_norm": 0.8411559462547302,
+      "learning_rate": 1.903187065253076e-07,
+      "loss": 1.2034833431243896,
+      "step": 416
+    },
+    {
+      "epoch": 2.841025641025641,
+      "grad_norm": 0.8371963500976562,
+      "learning_rate": 1.7600261616745106e-07,
+      "loss": 1.1710231304168701,
+      "step": 417
+    },
+    {
+      "epoch": 2.847863247863248,
+      "grad_norm": 0.8555141687393188,
+      "learning_rate": 1.622414496403668e-07,
+      "loss": 1.2024474143981934,
+      "step": 418
+    },
+    {
+      "epoch": 2.8547008547008548,
+      "grad_norm": 0.8661652207374573,
+      "learning_rate": 1.490359842650324e-07,
+      "loss": 1.2498114109039307,
+      "step": 419
+    },
+    {
+      "epoch": 2.8615384615384616,
+      "grad_norm": 0.8592333197593689,
+      "learning_rate": 1.3638696597277678e-07,
+      "loss": 1.2100580930709839,
+      "step": 420
+    },
+    {
+      "epoch": 2.8683760683760684,
+      "grad_norm": 0.8594926595687866,
+      "learning_rate": 1.2429510926314835e-07,
+      "loss": 1.1787865161895752,
+      "step": 421
+    },
+    {
+      "epoch": 2.875213675213675,
+      "grad_norm": 0.8879026174545288,
+      "learning_rate": 1.1276109716355288e-07,
+      "loss": 1.2315534353256226,
+      "step": 422
+    },
+    {
+      "epoch": 2.882051282051282,
+      "grad_norm": 0.8497971892356873,
+      "learning_rate": 1.0178558119067316e-07,
+      "loss": 1.2027359008789062,
+      "step": 423
+    },
+    {
+      "epoch": 2.888888888888889,
+      "grad_norm": 0.8838421106338501,
+      "learning_rate": 9.136918131366412e-08,
+      "loss": 1.2284358739852905,
+      "step": 424
+    },
+    {
+      "epoch": 2.8957264957264957,
+      "grad_norm": 0.8940805196762085,
+      "learning_rate": 8.151248591913519e-08,
+      "loss": 1.2018911838531494,
+      "step": 425
+    },
+    {
+      "epoch": 2.9025641025641025,
+      "grad_norm": 0.8463784456253052,
+      "learning_rate": 7.22160517779169e-08,
+      "loss": 1.2137906551361084,
+      "step": 426
+    },
+    {
+      "epoch": 2.9094017094017093,
+      "grad_norm": 0.8508373498916626,
+      "learning_rate": 6.348040401360833e-08,
+      "loss": 1.2048455476760864,
+      "step": 427
+    },
+    {
+      "epoch": 2.916239316239316,
+      "grad_norm": 0.8702911138534546,
+      "learning_rate": 5.530603607290852e-08,
+      "loss": 1.216880202293396,
+      "step": 428
+    },
+    {
+      "epoch": 2.9230769230769234,
+      "grad_norm": 0.8441773653030396,
+      "learning_rate": 4.7693409697756596e-08,
+      "loss": 1.2449169158935547,
+      "step": 429
+    },
+    {
+      "epoch": 2.92991452991453,
+      "grad_norm": 0.8643396496772766,
+      "learning_rate": 4.0642954899238196e-08,
+      "loss": 1.2154898643493652,
+      "step": 430
+    },
+    {
+      "epoch": 2.936752136752137,
+      "grad_norm": 0.8390621542930603,
+      "learning_rate": 3.4155069933301535e-08,
+      "loss": 1.1894207000732422,
+      "step": 431
+    },
+    {
+      "epoch": 2.943589743589744,
+      "grad_norm": 0.8889386057853699,
+      "learning_rate": 2.823012127825764e-08,
+      "loss": 1.2449326515197754,
+      "step": 432
+    },
+    {
+      "epoch": 2.9504273504273506,
+      "grad_norm": 0.8431465029716492,
+      "learning_rate": 2.2868443614082468e-08,
+      "loss": 1.1918964385986328,
+      "step": 433
+    },
+    {
+      "epoch": 2.9572649572649574,
+      "grad_norm": 0.859993577003479,
+      "learning_rate": 1.8070339803509805e-08,
+      "loss": 1.1882524490356445,
+      "step": 434
+    },
+    {
+      "epoch": 2.9641025641025642,
+      "grad_norm": 0.8584935069084167,
+      "learning_rate": 1.383608087492605e-08,
+      "loss": 1.2315739393234253,
+      "step": 435
+    },
+    {
+      "epoch": 2.970940170940171,
+      "grad_norm": 0.8648282289505005,
+      "learning_rate": 1.0165906007056914e-08,
+      "loss": 1.235274314880371,
+      "step": 436
+    },
+    {
+      "epoch": 2.977777777777778,
+      "grad_norm": 0.8602524399757385,
+      "learning_rate": 7.060022515460452e-09,
+      "loss": 1.1928036212921143,
+      "step": 437
+    },
+    {
+      "epoch": 2.9846153846153847,
+      "grad_norm": 0.8722023367881775,
+      "learning_rate": 4.5186058408153156e-09,
+      "loss": 1.2146607637405396,
+      "step": 438
+    },
+    {
+      "epoch": 2.9914529914529915,
+      "grad_norm": 0.8878926038742065,
+      "learning_rate": 2.5417995390086824e-09,
+      "loss": 1.1910994052886963,
+      "step": 439
+    },
+    {
+      "epoch": 2.9982905982905983,
+      "grad_norm": 0.8773415088653564,
+      "learning_rate": 1.129715273033849e-09,
+      "loss": 1.1811952590942383,
+      "step": 440
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 1.841178059577942,
+      "learning_rate": 2.8243280667306084e-10,
+      "loss": 1.14687180519104,
+      "step": 441
+    },
+    {
+      "epoch": 3.0,
+      "step": 441,
+      "total_flos": 5.3379040973665075e+17,
+      "train_loss": 1.3470534840408637,
+      "train_runtime": 3006.8003,
+      "train_samples_per_second": 18.676,
+      "train_steps_per_second": 0.147
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 441,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.3379040973665075e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}