commit 7c02353428da24d73a1390b5bf4fa5687b1812bb
Author: ModelHub XC <noreply@modelhub.org.cn>
Date:   Sat Jun 20 17:56:19 2026 +0800

    初始化项目，由ModelHub XC社区提供模型
    
    Model: guangyangnlp/Qwen3-4B-SFT-medical-1e-5
    Source: Original Platform

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..52373fe
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ee0e798
--- /dev/null
+++ b/README.md
@@ -0,0 +1,72 @@
+---
+library_name: transformers
+license: other
+base_model: Qwen/Qwen3-4B
+tags:
+- llama-factory
+- full
+- generated_from_trainer
+model-index:
+- name: medical-o1-sft-full-1e-5
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# medical-o1-sft-full-1e-5
+
+This model is a fine-tuned version of [Qwen/Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) on the medical_o1_train dataset.
+It achieves the following results on the evaluation set:
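+- Loss: 1.3088 (lowest validation loss, reached at step 264; the validation loss at the last evaluation, step 440, was 1.3234)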
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 1e-05
+- train_batch_size: 4
+- eval_batch_size: 8
+- seed: 42
+- gradient_accumulation_steps: 32
+- total_train_batch_size: 128
+- optimizer: fused PyTorch AdamW (OptimizerNames.ADAMW_TORCH_FUSED) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
+- lr_scheduler_type: cosine
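+- lr_scheduler_warmup_steps: 0.05 (this appears to be a fraction of the total training steps rather than a step count: the logged learning rate ramps linearly to its 1e-05 peak at step 24 of 441)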
+- num_epochs: 3.0
+
+### Training results
+
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.3952        | 0.3009 | 44   | 1.3628          |
+| 1.3153        | 0.6017 | 88   | 1.3319          |
+| 1.3174        | 0.9026 | 132  | 1.3175          |
+| 1.1758        | 1.1983 | 176  | 1.3184          |
+| 1.2047        | 1.4991 | 220  | 1.3131          |
+| 1.1961        | 1.8    | 264  | 1.3088          |
+| 1.1401        | 2.0957 | 308  | 1.3254          |
+| 1.1144        | 2.3966 | 352  | 1.3240          |
+| 1.1196        | 2.6974 | 396  | 1.3236          |
+| 1.0940        | 2.9983 | 440  | 1.3234          |
+
+
+### Framework versions
+
+- Transformers 5.0.0
+- Pytorch 2.10.0+cu128
+- Datasets 4.0.0
+- Tokenizers 0.22.2
diff --git a/all_results.json b/all_results.json
new file mode 100644
index 0000000..5e18feb
--- /dev/null
+++ b/all_results.json
@@ -0,0 +1,12 @@
+{
+    "epoch": 3.0,
+    "eval_loss": 1.308773159980774,
+    "eval_runtime": 24.6494,
+    "eval_samples_per_second": 40.001,
+    "eval_steps_per_second": 5.031,
+    "total_flos": 9.743300044908134e+17,
+    "train_loss": 1.2459275746832088,
+    "train_runtime": 6646.3979,
+    "train_samples_per_second": 8.449,
+    "train_steps_per_second": 0.066
+}
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..1e6ad26
--- /dev/null
+++ b/config.json
@@ -0,0 +1,71 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
diff --git a/eval_results.json b/eval_results.json
new file mode 100644
index 0000000..1d0688c
--- /dev/null
+++ b/eval_results.json
@@ -0,0 +1,7 @@
+{
+    "epoch": 3.0,
+    "eval_loss": 1.308773159980774,
+    "eval_runtime": 24.6494,
+    "eval_samples_per_second": 40.001,
+    "eval_steps_per_second": 5.031
+}
\ No newline at end of file
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000..c33fb76
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,12 @@
+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.0.0"
+}
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000..39c1fef
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fce6b23b7b5fd06726350735e0ccbc86ce9237583d2e16e868ee2d4abb7df01b
+size 16089918232
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000..c7afbed
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..145e2c7
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,30 @@
+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
diff --git a/train_results.json b/train_results.json
new file mode 100644
index 0000000..28dc4b3
--- /dev/null
+++ b/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.0,
+    "total_flos": 9.743300044908134e+17,
+    "train_loss": 1.2459275746832088,
+    "train_runtime": 6646.3979,
+    "train_samples_per_second": 8.449,
+    "train_steps_per_second": 0.066
+}
\ No newline at end of file
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000..6f11b78
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,3210 @@
+{
+  "best_global_step": 264,
+  "best_metric": 1.308773159980774,
+  "best_model_checkpoint": "saves/qwen3-4B/medical-o1-sft-full-1e-5/checkpoint-264",
+  "epoch": 3.0,
+  "eval_steps": 44,
+  "global_step": 441,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006837606837606838,
+      "grad_norm": 24.729957580566406,
+      "learning_rate": 0.0,
+      "loss": 2.180166482925415,
+      "step": 1
+    },
+    {
+      "epoch": 0.013675213675213675,
+      "grad_norm": 25.152711868286133,
+      "learning_rate": 4.347826086956522e-07,
+      "loss": 2.1789543628692627,
+      "step": 2
+    },
+    {
+      "epoch": 0.020512820512820513,
+      "grad_norm": 24.6761417388916,
+      "learning_rate": 8.695652173913044e-07,
+      "loss": 2.204561233520508,
+      "step": 3
+    },
+    {
+      "epoch": 0.02735042735042735,
+      "grad_norm": 24.276906967163086,
+      "learning_rate": 1.3043478260869566e-06,
+      "loss": 2.1825883388519287,
+      "step": 4
+    },
+    {
+      "epoch": 0.03418803418803419,
+      "grad_norm": 23.327831268310547,
+      "learning_rate": 1.7391304347826088e-06,
+      "loss": 2.2022361755371094,
+      "step": 5
+    },
+    {
+      "epoch": 0.041025641025641026,
+      "grad_norm": 20.180011749267578,
+      "learning_rate": 2.173913043478261e-06,
+      "loss": 2.0757670402526855,
+      "step": 6
+    },
+    {
+      "epoch": 0.04786324786324787,
+      "grad_norm": 18.820642471313477,
+      "learning_rate": 2.6086956521739132e-06,
+      "loss": 2.024721145629883,
+      "step": 7
+    },
+    {
+      "epoch": 0.0547008547008547,
+      "grad_norm": 13.223835945129395,
+      "learning_rate": 3.043478260869566e-06,
+      "loss": 1.9034565687179565,
+      "step": 8
+    },
+    {
+      "epoch": 0.06153846153846154,
+      "grad_norm": 11.584263801574707,
+      "learning_rate": 3.4782608695652175e-06,
+      "loss": 1.8130236864089966,
+      "step": 9
+    },
+    {
+      "epoch": 0.06837606837606838,
+      "grad_norm": 5.6841607093811035,
+      "learning_rate": 3.91304347826087e-06,
+      "loss": 1.6309248208999634,
+      "step": 10
+    },
+    {
+      "epoch": 0.07521367521367521,
+      "grad_norm": 4.208008766174316,
+      "learning_rate": 4.347826086956522e-06,
+      "loss": 1.5361576080322266,
+      "step": 11
+    },
+    {
+      "epoch": 0.08205128205128205,
+      "grad_norm": 3.528555154800415,
+      "learning_rate": 4.782608695652174e-06,
+      "loss": 1.6088225841522217,
+      "step": 12
+    },
+    {
+      "epoch": 0.08888888888888889,
+      "grad_norm": 3.099165916442871,
+      "learning_rate": 5.2173913043478265e-06,
+      "loss": 1.5432047843933105,
+      "step": 13
+    },
+    {
+      "epoch": 0.09572649572649573,
+      "grad_norm": 6.412608623504639,
+      "learning_rate": 5.652173913043479e-06,
+      "loss": 1.5963867902755737,
+      "step": 14
+    },
+    {
+      "epoch": 0.10256410256410256,
+      "grad_norm": 5.609615802764893,
+      "learning_rate": 6.086956521739132e-06,
+      "loss": 1.5698325634002686,
+      "step": 15
+    },
+    {
+      "epoch": 0.1094017094017094,
+      "grad_norm": 4.161319255828857,
+      "learning_rate": 6.521739130434783e-06,
+      "loss": 1.555444598197937,
+      "step": 16
+    },
+    {
+      "epoch": 0.11623931623931624,
+      "grad_norm": 3.2057743072509766,
+      "learning_rate": 6.956521739130435e-06,
+      "loss": 1.475843906402588,
+      "step": 17
+    },
+    {
+      "epoch": 0.12307692307692308,
+      "grad_norm": 2.5646772384643555,
+      "learning_rate": 7.391304347826087e-06,
+      "loss": 1.509574294090271,
+      "step": 18
+    },
+    {
+      "epoch": 0.12991452991452992,
+      "grad_norm": 1.9250593185424805,
+      "learning_rate": 7.82608695652174e-06,
+      "loss": 1.4932482242584229,
+      "step": 19
+    },
+    {
+      "epoch": 0.13675213675213677,
+      "grad_norm": 1.6663166284561157,
+      "learning_rate": 8.260869565217392e-06,
+      "loss": 1.4706228971481323,
+      "step": 20
+    },
+    {
+      "epoch": 0.14358974358974358,
+      "grad_norm": 1.488690733909607,
+      "learning_rate": 8.695652173913044e-06,
+      "loss": 1.4192920923233032,
+      "step": 21
+    },
+    {
+      "epoch": 0.15042735042735042,
+      "grad_norm": 1.3503153324127197,
+      "learning_rate": 9.130434782608697e-06,
+      "loss": 1.427452802658081,
+      "step": 22
+    },
+    {
+      "epoch": 0.15726495726495726,
+      "grad_norm": 1.2214534282684326,
+      "learning_rate": 9.565217391304349e-06,
+      "loss": 1.4610393047332764,
+      "step": 23
+    },
+    {
+      "epoch": 0.1641025641025641,
+      "grad_norm": 1.1983873844146729,
+      "learning_rate": 1e-05,
+      "loss": 1.4273948669433594,
+      "step": 24
+    },
+    {
+      "epoch": 0.17094017094017094,
+      "grad_norm": 1.1930960416793823,
+      "learning_rate": 9.999858783596665e-06,
+      "loss": 1.4003199338912964,
+      "step": 25
+    },
+    {
+      "epoch": 0.17777777777777778,
+      "grad_norm": 1.0275226831436157,
+      "learning_rate": 9.999435142363484e-06,
+      "loss": 1.4090672731399536,
+      "step": 26
+    },
+    {
+      "epoch": 0.18461538461538463,
+      "grad_norm": 1.001726508140564,
+      "learning_rate": 9.998729100230497e-06,
+      "loss": 1.3982799053192139,
+      "step": 27
+    },
+    {
+      "epoch": 0.19145299145299147,
+      "grad_norm": 0.9476358890533447,
+      "learning_rate": 9.997740697079595e-06,
+      "loss": 1.4250205755233765,
+      "step": 28
+    },
+    {
+      "epoch": 0.19829059829059828,
+      "grad_norm": 0.9169353246688843,
+      "learning_rate": 9.99646998874227e-06,
+      "loss": 1.407841682434082,
+      "step": 29
+    },
+    {
+      "epoch": 0.20512820512820512,
+      "grad_norm": 0.9049670696258545,
+      "learning_rate": 9.994917046996472e-06,
+      "loss": 1.4163107872009277,
+      "step": 30
+    },
+    {
+      "epoch": 0.21196581196581196,
+      "grad_norm": 0.902590811252594,
+      "learning_rate": 9.993081959562539e-06,
+      "loss": 1.4395619630813599,
+      "step": 31
+    },
+    {
+      "epoch": 0.2188034188034188,
+      "grad_norm": 0.9725260138511658,
+      "learning_rate": 9.990964830098246e-06,
+      "loss": 1.4067661762237549,
+      "step": 32
+    },
+    {
+      "epoch": 0.22564102564102564,
+      "grad_norm": 0.8750798106193542,
+      "learning_rate": 9.98856577819296e-06,
+      "loss": 1.4079771041870117,
+      "step": 33
+    },
+    {
+      "epoch": 0.23247863247863249,
+      "grad_norm": 0.8549812436103821,
+      "learning_rate": 9.985884939360873e-06,
+      "loss": 1.398482322692871,
+      "step": 34
+    },
+    {
+      "epoch": 0.23931623931623933,
+      "grad_norm": 0.869503378868103,
+      "learning_rate": 9.98292246503335e-06,
+      "loss": 1.344150424003601,
+      "step": 35
+    },
+    {
+      "epoch": 0.24615384615384617,
+      "grad_norm": 0.9242067337036133,
+      "learning_rate": 9.979678522550382e-06,
+      "loss": 1.37479567527771,
+      "step": 36
+    },
+    {
+      "epoch": 0.252991452991453,
+      "grad_norm": 0.8416987657546997,
+      "learning_rate": 9.976153295151123e-06,
+      "loss": 1.3731480836868286,
+      "step": 37
+    },
+    {
+      "epoch": 0.25982905982905985,
+      "grad_norm": 0.9907390475273132,
+      "learning_rate": 9.972346981963546e-06,
+      "loss": 1.3624351024627686,
+      "step": 38
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.8205696940422058,
+      "learning_rate": 9.968259797993197e-06,
+      "loss": 1.3645293712615967,
+      "step": 39
+    },
+    {
+      "epoch": 0.27350427350427353,
+      "grad_norm": 0.8257843852043152,
+      "learning_rate": 9.963891974111042e-06,
+      "loss": 1.3727067708969116,
+      "step": 40
+    },
+    {
+      "epoch": 0.28034188034188035,
+      "grad_norm": 0.7986466288566589,
+      "learning_rate": 9.959243757040434e-06,
+      "loss": 1.3945657014846802,
+      "step": 41
+    },
+    {
+      "epoch": 0.28717948717948716,
+      "grad_norm": 0.9684669971466064,
+      "learning_rate": 9.95431540934317e-06,
+      "loss": 1.3376381397247314,
+      "step": 42
+    },
+    {
+      "epoch": 0.294017094017094,
+      "grad_norm": 0.7717859148979187,
+      "learning_rate": 9.949107209404664e-06,
+      "loss": 1.354946494102478,
+      "step": 43
+    },
+    {
+      "epoch": 0.30085470085470084,
+      "grad_norm": 0.8021324276924133,
+      "learning_rate": 9.943619451418225e-06,
+      "loss": 1.3951725959777832,
+      "step": 44
+    },
+    {
+      "epoch": 0.30085470085470084,
+      "eval_loss": 1.362805724143982,
+      "eval_runtime": 24.9887,
+      "eval_samples_per_second": 39.458,
+      "eval_steps_per_second": 4.962,
+      "step": 44
+    },
+    {
+      "epoch": 0.3076923076923077,
+      "grad_norm": 0.829911470413208,
+      "learning_rate": 9.937852445368427e-06,
+      "loss": 1.3832783699035645,
+      "step": 45
+    },
+    {
+      "epoch": 0.3145299145299145,
+      "grad_norm": 0.8109715580940247,
+      "learning_rate": 9.931806517013612e-06,
+      "loss": 1.3637301921844482,
+      "step": 46
+    },
+    {
+      "epoch": 0.3213675213675214,
+      "grad_norm": 0.7627991437911987,
+      "learning_rate": 9.925482007867485e-06,
+      "loss": 1.3353031873703003,
+      "step": 47
+    },
+    {
+      "epoch": 0.3282051282051282,
+      "grad_norm": 0.7720788717269897,
+      "learning_rate": 9.918879275179819e-06,
+      "loss": 1.367252230644226,
+      "step": 48
+    },
+    {
+      "epoch": 0.335042735042735,
+      "grad_norm": 0.7520493865013123,
+      "learning_rate": 9.911998691916275e-06,
+      "loss": 1.386542797088623,
+      "step": 49
+    },
+    {
+      "epoch": 0.3418803418803419,
+      "grad_norm": 0.7559177875518799,
+      "learning_rate": 9.904840646737346e-06,
+      "loss": 1.3789976835250854,
+      "step": 50
+    },
+    {
+      "epoch": 0.3487179487179487,
+      "grad_norm": 0.770207405090332,
+      "learning_rate": 9.89740554397639e-06,
+      "loss": 1.356705904006958,
+      "step": 51
+    },
+    {
+      "epoch": 0.35555555555555557,
+      "grad_norm": 0.7609772086143494,
+      "learning_rate": 9.889693803616793e-06,
+      "loss": 1.3461980819702148,
+      "step": 52
+    },
+    {
+      "epoch": 0.3623931623931624,
+      "grad_norm": 0.7604424953460693,
+      "learning_rate": 9.881705861268252e-06,
+      "loss": 1.344923496246338,
+      "step": 53
+    },
+    {
+      "epoch": 0.36923076923076925,
+      "grad_norm": 0.7701961398124695,
+      "learning_rate": 9.873442168142158e-06,
+      "loss": 1.364449143409729,
+      "step": 54
+    },
+    {
+      "epoch": 0.37606837606837606,
+      "grad_norm": 0.7939377427101135,
+      "learning_rate": 9.864903191026125e-06,
+      "loss": 1.4013525247573853,
+      "step": 55
+    },
+    {
+      "epoch": 0.38290598290598293,
+      "grad_norm": 0.7690542340278625,
+      "learning_rate": 9.856089412257605e-06,
+      "loss": 1.3586581945419312,
+      "step": 56
+    },
+    {
+      "epoch": 0.38974358974358975,
+      "grad_norm": 0.798068106174469,
+      "learning_rate": 9.847001329696653e-06,
+      "loss": 1.3378022909164429,
+      "step": 57
+    },
+    {
+      "epoch": 0.39658119658119656,
+      "grad_norm": 0.7824757695198059,
+      "learning_rate": 9.837639456697802e-06,
+      "loss": 1.3118129968643188,
+      "step": 58
+    },
+    {
+      "epoch": 0.40341880341880343,
+      "grad_norm": 0.7629351019859314,
+      "learning_rate": 9.828004322081067e-06,
+      "loss": 1.3393217325210571,
+      "step": 59
+    },
+    {
+      "epoch": 0.41025641025641024,
+      "grad_norm": 0.7708514332771301,
+      "learning_rate": 9.818096470102067e-06,
+      "loss": 1.3732938766479492,
+      "step": 60
+    },
+    {
+      "epoch": 0.4170940170940171,
+      "grad_norm": 0.8133201003074646,
+      "learning_rate": 9.807916460421294e-06,
+      "loss": 1.3423891067504883,
+      "step": 61
+    },
+    {
+      "epoch": 0.4239316239316239,
+      "grad_norm": 0.7727287411689758,
+      "learning_rate": 9.797464868072489e-06,
+      "loss": 1.3378151655197144,
+      "step": 62
+    },
+    {
+      "epoch": 0.4307692307692308,
+      "grad_norm": 0.7684638500213623,
+      "learning_rate": 9.78674228343016e-06,
+      "loss": 1.3335256576538086,
+      "step": 63
+    },
+    {
+      "epoch": 0.4376068376068376,
+      "grad_norm": 0.7602411508560181,
+      "learning_rate": 9.775749312176249e-06,
+      "loss": 1.3320605754852295,
+      "step": 64
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 0.8044481873512268,
+      "learning_rate": 9.764486575265893e-06,
+      "loss": 1.3325685262680054,
+      "step": 65
+    },
+    {
+      "epoch": 0.4512820512820513,
+      "grad_norm": 0.7876479029655457,
+      "learning_rate": 9.752954708892379e-06,
+      "loss": 1.3242830038070679,
+      "step": 66
+    },
+    {
+      "epoch": 0.4581196581196581,
+      "grad_norm": 0.7659040689468384,
+      "learning_rate": 9.741154364451179e-06,
+      "loss": 1.3692903518676758,
+      "step": 67
+    },
+    {
+      "epoch": 0.46495726495726497,
+      "grad_norm": 0.8316842317581177,
+      "learning_rate": 9.729086208503174e-06,
+      "loss": 1.344923734664917,
+      "step": 68
+    },
+    {
+      "epoch": 0.4717948717948718,
+      "grad_norm": 0.8216245174407959,
+      "learning_rate": 9.716750922736998e-06,
+      "loss": 1.3780957460403442,
+      "step": 69
+    },
+    {
+      "epoch": 0.47863247863247865,
+      "grad_norm": 0.7839699387550354,
+      "learning_rate": 9.704149203930522e-06,
+      "loss": 1.3786989450454712,
+      "step": 70
+    },
+    {
+      "epoch": 0.48547008547008547,
+      "grad_norm": 0.7707169055938721,
+      "learning_rate": 9.691281763911513e-06,
+      "loss": 1.3283625841140747,
+      "step": 71
+    },
+    {
+      "epoch": 0.49230769230769234,
+      "grad_norm": 0.7598075270652771,
+      "learning_rate": 9.67814932951741e-06,
+      "loss": 1.3375245332717896,
+      "step": 72
+    },
+    {
+      "epoch": 0.49914529914529915,
+      "grad_norm": 0.8022596836090088,
+      "learning_rate": 9.664752642554272e-06,
+      "loss": 1.3409022092819214,
+      "step": 73
+    },
+    {
+      "epoch": 0.505982905982906,
+      "grad_norm": 0.7512302398681641,
+      "learning_rate": 9.651092459754879e-06,
+      "loss": 1.2996271848678589,
+      "step": 74
+    },
+    {
+      "epoch": 0.5128205128205128,
+      "grad_norm": 0.7390022277832031,
+      "learning_rate": 9.637169552735985e-06,
+      "loss": 1.3141694068908691,
+      "step": 75
+    },
+    {
+      "epoch": 0.5196581196581197,
+      "grad_norm": 0.7599424123764038,
+      "learning_rate": 9.622984707954732e-06,
+      "loss": 1.3220386505126953,
+      "step": 76
+    },
+    {
+      "epoch": 0.5264957264957265,
+      "grad_norm": 0.7562436461448669,
+      "learning_rate": 9.608538726664224e-06,
+      "loss": 1.3605300188064575,
+      "step": 77
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.7731190919876099,
+      "learning_rate": 9.593832424868271e-06,
+      "loss": 1.3461638689041138,
+      "step": 78
+    },
+    {
+      "epoch": 0.5401709401709401,
+      "grad_norm": 0.7543560266494751,
+      "learning_rate": 9.578866633275289e-06,
+      "loss": 1.340885877609253,
+      "step": 79
+    },
+    {
+      "epoch": 0.5470085470085471,
+      "grad_norm": 0.772647500038147,
+      "learning_rate": 9.563642197251382e-06,
+      "loss": 1.3663382530212402,
+      "step": 80
+    },
+    {
+      "epoch": 0.5538461538461539,
+      "grad_norm": 0.7314751148223877,
+      "learning_rate": 9.548159976772593e-06,
+      "loss": 1.3287297487258911,
+      "step": 81
+    },
+    {
+      "epoch": 0.5606837606837607,
+      "grad_norm": 0.7391103506088257,
+      "learning_rate": 9.532420846376316e-06,
+      "loss": 1.3285285234451294,
+      "step": 82
+    },
+    {
+      "epoch": 0.5675213675213675,
+      "grad_norm": 0.7641813158988953,
+      "learning_rate": 9.516425695111906e-06,
+      "loss": 1.3269128799438477,
+      "step": 83
+    },
+    {
+      "epoch": 0.5743589743589743,
+      "grad_norm": 0.7769819498062134,
+      "learning_rate": 9.500175426490455e-06,
+      "loss": 1.3374706506729126,
+      "step": 84
+    },
+    {
+      "epoch": 0.5811965811965812,
+      "grad_norm": 0.7199158668518066,
+      "learning_rate": 9.48367095843376e-06,
+      "loss": 1.3117002248764038,
+      "step": 85
+    },
+    {
+      "epoch": 0.588034188034188,
+      "grad_norm": 0.7510148882865906,
+      "learning_rate": 9.466913223222467e-06,
+      "loss": 1.3387565612792969,
+      "step": 86
+    },
+    {
+      "epoch": 0.5948717948717949,
+      "grad_norm": 0.7325724363327026,
+      "learning_rate": 9.449903167443415e-06,
+      "loss": 1.269672155380249,
+      "step": 87
+    },
+    {
+      "epoch": 0.6017094017094017,
+      "grad_norm": 0.7675944566726685,
+      "learning_rate": 9.432641751936162e-06,
+      "loss": 1.3153454065322876,
+      "step": 88
+    },
+    {
+      "epoch": 0.6017094017094017,
+      "eval_loss": 1.3318638801574707,
+      "eval_runtime": 24.6717,
+      "eval_samples_per_second": 39.965,
+      "eval_steps_per_second": 5.026,
+      "step": 88
+    },
+    {
+      "epoch": 0.6085470085470085,
+      "grad_norm": 0.7539426684379578,
+      "learning_rate": 9.415129951738713e-06,
+      "loss": 1.378519058227539,
+      "step": 89
+    },
+    {
+      "epoch": 0.6153846153846154,
+      "grad_norm": 0.7739952802658081,
+      "learning_rate": 9.397368756032445e-06,
+      "loss": 1.3163981437683105,
+      "step": 90
+    },
+    {
+      "epoch": 0.6222222222222222,
+      "grad_norm": 0.7639786005020142,
+      "learning_rate": 9.379359168086231e-06,
+      "loss": 1.3244612216949463,
+      "step": 91
+    },
+    {
+      "epoch": 0.629059829059829,
+      "grad_norm": 0.7307687997817993,
+      "learning_rate": 9.361102205199762e-06,
+      "loss": 1.3425580263137817,
+      "step": 92
+    },
+    {
+      "epoch": 0.6358974358974359,
+      "grad_norm": 0.7326052188873291,
+      "learning_rate": 9.34259889864609e-06,
+      "loss": 1.349947452545166,
+      "step": 93
+    },
+    {
+      "epoch": 0.6427350427350428,
+      "grad_norm": 0.7336087822914124,
+      "learning_rate": 9.32385029361338e-06,
+      "loss": 1.3235843181610107,
+      "step": 94
+    },
+    {
+      "epoch": 0.6495726495726496,
+      "grad_norm": 0.7857178449630737,
+      "learning_rate": 9.304857449145858e-06,
+      "loss": 1.29775071144104,
+      "step": 95
+    },
+    {
+      "epoch": 0.6564102564102564,
+      "grad_norm": 0.7694044709205627,
+      "learning_rate": 9.285621438083997e-06,
+      "loss": 1.3575528860092163,
+      "step": 96
+    },
+    {
+      "epoch": 0.6632478632478632,
+      "grad_norm": 0.7426573634147644,
+      "learning_rate": 9.26614334700392e-06,
+      "loss": 1.334963083267212,
+      "step": 97
+    },
+    {
+      "epoch": 0.67008547008547,
+      "grad_norm": 0.7567334175109863,
+      "learning_rate": 9.246424276156008e-06,
+      "loss": 1.335172176361084,
+      "step": 98
+    },
+    {
+      "epoch": 0.676923076923077,
+      "grad_norm": 0.733529269695282,
+      "learning_rate": 9.226465339402768e-06,
+      "loss": 1.3033547401428223,
+      "step": 99
+    },
+    {
+      "epoch": 0.6837606837606838,
+      "grad_norm": 0.7475197315216064,
+      "learning_rate": 9.206267664155906e-06,
+      "loss": 1.316215991973877,
+      "step": 100
+    },
+    {
+      "epoch": 0.6905982905982906,
+      "grad_norm": 0.7870779633522034,
+      "learning_rate": 9.185832391312644e-06,
+      "loss": 1.347679853439331,
+      "step": 101
+    },
+    {
+      "epoch": 0.6974358974358974,
+      "grad_norm": 0.764722466468811,
+      "learning_rate": 9.165160675191272e-06,
+      "loss": 1.305860996246338,
+      "step": 102
+    },
+    {
+      "epoch": 0.7042735042735043,
+      "grad_norm": 0.7680871486663818,
+      "learning_rate": 9.144253683465953e-06,
+      "loss": 1.3211126327514648,
+      "step": 103
+    },
+    {
+      "epoch": 0.7111111111111111,
+      "grad_norm": 0.734742283821106,
+      "learning_rate": 9.123112597100759e-06,
+      "loss": 1.2861220836639404,
+      "step": 104
+    },
+    {
+      "epoch": 0.717948717948718,
+      "grad_norm": 0.7347426414489746,
+      "learning_rate": 9.101738610282956e-06,
+      "loss": 1.315138578414917,
+      "step": 105
+    },
+    {
+      "epoch": 0.7247863247863248,
+      "grad_norm": 0.7639749646186829,
+      "learning_rate": 9.080132930355567e-06,
+      "loss": 1.3426464796066284,
+      "step": 106
+    },
+    {
+      "epoch": 0.7316239316239316,
+      "grad_norm": 0.7904943227767944,
+      "learning_rate": 9.058296777749154e-06,
+      "loss": 1.334005355834961,
+      "step": 107
+    },
+    {
+      "epoch": 0.7384615384615385,
+      "grad_norm": 0.780296266078949,
+      "learning_rate": 9.03623138591289e-06,
+      "loss": 1.3893626928329468,
+      "step": 108
+    },
+    {
+      "epoch": 0.7452991452991453,
+      "grad_norm": 0.7619044184684753,
+      "learning_rate": 9.013938001244885e-06,
+      "loss": 1.3112680912017822,
+      "step": 109
+    },
+    {
+      "epoch": 0.7521367521367521,
+      "grad_norm": 0.7852951884269714,
+      "learning_rate": 8.99141788302178e-06,
+      "loss": 1.3263344764709473,
+      "step": 110
+    },
+    {
+      "epoch": 0.7589743589743589,
+      "grad_norm": 0.746293306350708,
+      "learning_rate": 8.968672303327614e-06,
+      "loss": 1.3137162923812866,
+      "step": 111
+    },
+    {
+      "epoch": 0.7658119658119659,
+      "grad_norm": 0.7697060704231262,
+      "learning_rate": 8.94570254698197e-06,
+      "loss": 1.305846095085144,
+      "step": 112
+    },
+    {
+      "epoch": 0.7726495726495727,
+      "grad_norm": 0.7505799531936646,
+      "learning_rate": 8.922509911467395e-06,
+      "loss": 1.3263046741485596,
+      "step": 113
+    },
+    {
+      "epoch": 0.7794871794871795,
+      "grad_norm": 0.7378644347190857,
+      "learning_rate": 8.899095706856122e-06,
+      "loss": 1.2952595949172974,
+      "step": 114
+    },
+    {
+      "epoch": 0.7863247863247863,
+      "grad_norm": 0.7393775582313538,
+      "learning_rate": 8.875461255736055e-06,
+      "loss": 1.314041018486023,
+      "step": 115
+    },
+    {
+      "epoch": 0.7931623931623931,
+      "grad_norm": 0.7198286056518555,
+      "learning_rate": 8.851607893136065e-06,
+      "loss": 1.301222801208496,
+      "step": 116
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.7539902925491333,
+      "learning_rate": 8.827536966450584e-06,
+      "loss": 1.3459645509719849,
+      "step": 117
+    },
+    {
+      "epoch": 0.8068376068376069,
+      "grad_norm": 0.728272020816803,
+      "learning_rate": 8.803249835363486e-06,
+      "loss": 1.3075345754623413,
+      "step": 118
+    },
+    {
+      "epoch": 0.8136752136752137,
+      "grad_norm": 0.7353615164756775,
+      "learning_rate": 8.778747871771293e-06,
+      "loss": 1.2967561483383179,
+      "step": 119
+    },
+    {
+      "epoch": 0.8205128205128205,
+      "grad_norm": 0.7358576655387878,
+      "learning_rate": 8.754032459705672e-06,
+      "loss": 1.3145124912261963,
+      "step": 120
+    },
+    {
+      "epoch": 0.8273504273504273,
+      "grad_norm": 0.7736720442771912,
+      "learning_rate": 8.729104995255265e-06,
+      "loss": 1.3146538734436035,
+      "step": 121
+    },
+    {
+      "epoch": 0.8341880341880342,
+      "grad_norm": 0.7337418794631958,
+      "learning_rate": 8.703966886486819e-06,
+      "loss": 1.2823609113693237,
+      "step": 122
+    },
+    {
+      "epoch": 0.841025641025641,
+      "grad_norm": 0.7514926195144653,
+      "learning_rate": 8.67861955336566e-06,
+      "loss": 1.3389618396759033,
+      "step": 123
+    },
+    {
+      "epoch": 0.8478632478632478,
+      "grad_norm": 0.7190932035446167,
+      "learning_rate": 8.65306442767547e-06,
+      "loss": 1.3115108013153076,
+      "step": 124
+    },
+    {
+      "epoch": 0.8547008547008547,
+      "grad_norm": 0.7332461476325989,
+      "learning_rate": 8.627302952937431e-06,
+      "loss": 1.333253264427185,
+      "step": 125
+    },
+    {
+      "epoch": 0.8615384615384616,
+      "grad_norm": 0.7428878545761108,
+      "learning_rate": 8.601336584328659e-06,
+      "loss": 1.3187751770019531,
+      "step": 126
+    },
+    {
+      "epoch": 0.8683760683760684,
+      "grad_norm": 0.7715012431144714,
+      "learning_rate": 8.575166788600031e-06,
+      "loss": 1.3300316333770752,
+      "step": 127
+    },
+    {
+      "epoch": 0.8752136752136752,
+      "grad_norm": 0.7566640973091125,
+      "learning_rate": 8.548795043993316e-06,
+      "loss": 1.307992696762085,
+      "step": 128
+    },
+    {
+      "epoch": 0.882051282051282,
+      "grad_norm": 0.7760566473007202,
+      "learning_rate": 8.522222840157687e-06,
+      "loss": 1.32774817943573,
+      "step": 129
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 0.7682384848594666,
+      "learning_rate": 8.495451678065563e-06,
+      "loss": 1.3295447826385498,
+      "step": 130
+    },
+    {
+      "epoch": 0.8957264957264958,
+      "grad_norm": 0.7397897839546204,
+      "learning_rate": 8.468483069927832e-06,
+      "loss": 1.3145328760147095,
+      "step": 131
+    },
+    {
+      "epoch": 0.9025641025641026,
+      "grad_norm": 0.7603890299797058,
+      "learning_rate": 8.441318539108433e-06,
+      "loss": 1.3174394369125366,
+      "step": 132
+    },
+    {
+      "epoch": 0.9025641025641026,
+      "eval_loss": 1.317511796951294,
+      "eval_runtime": 24.6804,
+      "eval_samples_per_second": 39.951,
+      "eval_steps_per_second": 5.024,
+      "step": 132
+    },
+    {
+      "epoch": 0.9094017094017094,
+      "grad_norm": 0.7623502612113953,
+      "learning_rate": 8.413959620038306e-06,
+      "loss": 1.3393348455429077,
+      "step": 133
+    },
+    {
+      "epoch": 0.9162393162393162,
+      "grad_norm": 0.7669332027435303,
+      "learning_rate": 8.386407858128707e-06,
+      "loss": 1.302769660949707,
+      "step": 134
+    },
+    {
+      "epoch": 0.9230769230769231,
+      "grad_norm": 0.7234067320823669,
+      "learning_rate": 8.358664809683926e-06,
+      "loss": 1.3381096124649048,
+      "step": 135
+    },
+    {
+      "epoch": 0.9299145299145299,
+      "grad_norm": 0.7574735283851624,
+      "learning_rate": 8.330732041813367e-06,
+      "loss": 1.335377812385559,
+      "step": 136
+    },
+    {
+      "epoch": 0.9367521367521368,
+      "grad_norm": 0.7575842142105103,
+      "learning_rate": 8.302611132343042e-06,
+      "loss": 1.3330005407333374,
+      "step": 137
+    },
+    {
+      "epoch": 0.9435897435897436,
+      "grad_norm": 0.7127556800842285,
+      "learning_rate": 8.274303669726427e-06,
+      "loss": 1.2971893548965454,
+      "step": 138
+    },
+    {
+      "epoch": 0.9504273504273504,
+      "grad_norm": 0.8172794580459595,
+      "learning_rate": 8.245811252954741e-06,
+      "loss": 1.3225749731063843,
+      "step": 139
+    },
+    {
+      "epoch": 0.9572649572649573,
+      "grad_norm": 0.7154548764228821,
+      "learning_rate": 8.217135491466636e-06,
+      "loss": 1.2955387830734253,
+      "step": 140
+    },
+    {
+      "epoch": 0.9641025641025641,
+      "grad_norm": 0.7610012888908386,
+      "learning_rate": 8.18827800505727e-06,
+      "loss": 1.3369195461273193,
+      "step": 141
+    },
+    {
+      "epoch": 0.9709401709401709,
+      "grad_norm": 0.7487711906433105,
+      "learning_rate": 8.15924042378682e-06,
+      "loss": 1.2916451692581177,
+      "step": 142
+    },
+    {
+      "epoch": 0.9777777777777777,
+      "grad_norm": 0.7546627521514893,
+      "learning_rate": 8.130024387888402e-06,
+      "loss": 1.310347318649292,
+      "step": 143
+    },
+    {
+      "epoch": 0.9846153846153847,
+      "grad_norm": 0.7537707090377808,
+      "learning_rate": 8.100631547675417e-06,
+      "loss": 1.3267855644226074,
+      "step": 144
+    },
+    {
+      "epoch": 0.9914529914529915,
+      "grad_norm": 0.7335416078567505,
+      "learning_rate": 8.071063563448341e-06,
+      "loss": 1.2958036661148071,
+      "step": 145
+    },
+    {
+      "epoch": 0.9982905982905983,
+      "grad_norm": 0.773562490940094,
+      "learning_rate": 8.041322105400923e-06,
+      "loss": 1.2804107666015625,
+      "step": 146
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.4411433935165405,
+      "learning_rate": 8.01140885352586e-06,
+      "loss": 1.3802165985107422,
+      "step": 147
+    },
+    {
+      "epoch": 1.0068376068376068,
+      "grad_norm": 0.9124190211296082,
+      "learning_rate": 7.981325497519892e-06,
+      "loss": 1.2135487794876099,
+      "step": 148
+    },
+    {
+      "epoch": 1.0136752136752136,
+      "grad_norm": 0.8284032344818115,
+      "learning_rate": 7.951073736688348e-06,
+      "loss": 1.1935949325561523,
+      "step": 149
+    },
+    {
+      "epoch": 1.0205128205128204,
+      "grad_norm": 0.8174305558204651,
+      "learning_rate": 7.920655279849173e-06,
+      "loss": 1.2410966157913208,
+      "step": 150
+    },
+    {
+      "epoch": 1.0273504273504273,
+      "grad_norm": 0.7865321040153503,
+      "learning_rate": 7.890071845236395e-06,
+      "loss": 1.2489113807678223,
+      "step": 151
+    },
+    {
+      "epoch": 1.0341880341880343,
+      "grad_norm": 0.812463104724884,
+      "learning_rate": 7.859325160403073e-06,
+      "loss": 1.1999475955963135,
+      "step": 152
+    },
+    {
+      "epoch": 1.041025641025641,
+      "grad_norm": 0.8780131936073303,
+      "learning_rate": 7.8284169621237e-06,
+      "loss": 1.2193069458007812,
+      "step": 153
+    },
+    {
+      "epoch": 1.047863247863248,
+      "grad_norm": 0.8348581790924072,
+      "learning_rate": 7.797348996296116e-06,
+      "loss": 1.1925896406173706,
+      "step": 154
+    },
+    {
+      "epoch": 1.0547008547008547,
+      "grad_norm": 0.8675538897514343,
+      "learning_rate": 7.766123017842877e-06,
+      "loss": 1.2143549919128418,
+      "step": 155
+    },
+    {
+      "epoch": 1.0615384615384615,
+      "grad_norm": 0.8252431750297546,
+      "learning_rate": 7.734740790612137e-06,
+      "loss": 1.2455641031265259,
+      "step": 156
+    },
+    {
+      "epoch": 1.0683760683760684,
+      "grad_norm": 0.8385781049728394,
+      "learning_rate": 7.703204087277989e-06,
+      "loss": 1.2102444171905518,
+      "step": 157
+    },
+    {
+      "epoch": 1.0752136752136752,
+      "grad_norm": 0.827889084815979,
+      "learning_rate": 7.671514689240366e-06,
+      "loss": 1.2144052982330322,
+      "step": 158
+    },
+    {
+      "epoch": 1.082051282051282,
+      "grad_norm": 0.7633846998214722,
+      "learning_rate": 7.639674386524395e-06,
+      "loss": 1.2118767499923706,
+      "step": 159
+    },
+    {
+      "epoch": 1.0888888888888888,
+      "grad_norm": 0.8267090320587158,
+      "learning_rate": 7.607684977679284e-06,
+      "loss": 1.188737392425537,
+      "step": 160
+    },
+    {
+      "epoch": 1.0957264957264958,
+      "grad_norm": 0.8270633816719055,
+      "learning_rate": 7.575548269676741e-06,
+      "loss": 1.214994192123413,
+      "step": 161
+    },
+    {
+      "epoch": 1.1025641025641026,
+      "grad_norm": 0.8160786628723145,
+      "learning_rate": 7.543266077808893e-06,
+      "loss": 1.221800446510315,
+      "step": 162
+    },
+    {
+      "epoch": 1.1094017094017095,
+      "grad_norm": 0.829490065574646,
+      "learning_rate": 7.510840225585749e-06,
+      "loss": 1.1974472999572754,
+      "step": 163
+    },
+    {
+      "epoch": 1.1162393162393163,
+      "grad_norm": 0.8170298933982849,
+      "learning_rate": 7.478272544632204e-06,
+      "loss": 1.2150561809539795,
+      "step": 164
+    },
+    {
+      "epoch": 1.123076923076923,
+      "grad_norm": 0.7731851935386658,
+      "learning_rate": 7.44556487458456e-06,
+      "loss": 1.1988686323165894,
+      "step": 165
+    },
+    {
+      "epoch": 1.12991452991453,
+      "grad_norm": 0.7923320531845093,
+      "learning_rate": 7.412719062986632e-06,
+      "loss": 1.2086683511734009,
+      "step": 166
+    },
+    {
+      "epoch": 1.1367521367521367,
+      "grad_norm": 0.7592716217041016,
+      "learning_rate": 7.379736965185369e-06,
+      "loss": 1.215879201889038,
+      "step": 167
+    },
+    {
+      "epoch": 1.1435897435897435,
+      "grad_norm": 0.7586809396743774,
+      "learning_rate": 7.3466204442260605e-06,
+      "loss": 1.2311599254608154,
+      "step": 168
+    },
+    {
+      "epoch": 1.1504273504273503,
+      "grad_norm": 0.7838971614837646,
+      "learning_rate": 7.313371370747104e-06,
+      "loss": 1.2183728218078613,
+      "step": 169
+    },
+    {
+      "epoch": 1.1572649572649572,
+      "grad_norm": 0.7780983448028564,
+      "learning_rate": 7.279991622874319e-06,
+      "loss": 1.1952356100082397,
+      "step": 170
+    },
+    {
+      "epoch": 1.1641025641025642,
+      "grad_norm": 0.7715050578117371,
+      "learning_rate": 7.24648308611489e-06,
+      "loss": 1.2417360544204712,
+      "step": 171
+    },
+    {
+      "epoch": 1.170940170940171,
+      "grad_norm": 0.7692239880561829,
+      "learning_rate": 7.212847653250828e-06,
+      "loss": 1.2170333862304688,
+      "step": 172
+    },
+    {
+      "epoch": 1.1777777777777778,
+      "grad_norm": 0.7896147966384888,
+      "learning_rate": 7.1790872242320775e-06,
+      "loss": 1.2121965885162354,
+      "step": 173
+    },
+    {
+      "epoch": 1.1846153846153846,
+      "grad_norm": 0.8173856139183044,
+      "learning_rate": 7.145203706069183e-06,
+      "loss": 1.1911547183990479,
+      "step": 174
+    },
+    {
+      "epoch": 1.1914529914529914,
+      "grad_norm": 0.7522553205490112,
+      "learning_rate": 7.1111990127255684e-06,
+      "loss": 1.210161566734314,
+      "step": 175
+    },
+    {
+      "epoch": 1.1982905982905983,
+      "grad_norm": 0.7353285551071167,
+      "learning_rate": 7.0770750650094335e-06,
+      "loss": 1.1757725477218628,
+      "step": 176
+    },
+    {
+      "epoch": 1.1982905982905983,
+      "eval_loss": 1.3184372186660767,
+      "eval_runtime": 24.8388,
+      "eval_samples_per_second": 39.696,
+      "eval_steps_per_second": 4.992,
+      "step": 176
+    },
+    {
+      "epoch": 1.205128205128205,
+      "grad_norm": 0.7701054811477661,
+      "learning_rate": 7.042833790465241e-06,
+      "loss": 1.2243812084197998,
+      "step": 177
+    },
+    {
+      "epoch": 1.2119658119658119,
+      "grad_norm": 0.7278676629066467,
+      "learning_rate": 7.008477123264849e-06,
+      "loss": 1.198972463607788,
+      "step": 178
+    },
+    {
+      "epoch": 1.218803418803419,
+      "grad_norm": 0.7595424056053162,
+      "learning_rate": 6.974007004098243e-06,
+      "loss": 1.2435779571533203,
+      "step": 179
+    },
+    {
+      "epoch": 1.2256410256410257,
+      "grad_norm": 0.7661744952201843,
+      "learning_rate": 6.939425380063924e-06,
+      "loss": 1.2413814067840576,
+      "step": 180
+    },
+    {
+      "epoch": 1.2324786324786325,
+      "grad_norm": 0.7790281176567078,
+      "learning_rate": 6.9047342045589224e-06,
+      "loss": 1.1771953105926514,
+      "step": 181
+    },
+    {
+      "epoch": 1.2393162393162394,
+      "grad_norm": 0.7655471563339233,
+      "learning_rate": 6.869935437168449e-06,
+      "loss": 1.203190565109253,
+      "step": 182
+    },
+    {
+      "epoch": 1.2461538461538462,
+      "grad_norm": 0.784903347492218,
+      "learning_rate": 6.835031043555211e-06,
+      "loss": 1.2171598672866821,
+      "step": 183
+    },
+    {
+      "epoch": 1.252991452991453,
+      "grad_norm": 0.7539082765579224,
+      "learning_rate": 6.800022995348381e-06,
+      "loss": 1.2139626741409302,
+      "step": 184
+    },
+    {
+      "epoch": 1.2598290598290598,
+      "grad_norm": 0.7623985409736633,
+      "learning_rate": 6.76491327003222e-06,
+      "loss": 1.2187587022781372,
+      "step": 185
+    },
+    {
+      "epoch": 1.2666666666666666,
+      "grad_norm": 0.7418251037597656,
+      "learning_rate": 6.729703850834381e-06,
+      "loss": 1.2088682651519775,
+      "step": 186
+    },
+    {
+      "epoch": 1.2735042735042734,
+      "grad_norm": 0.7652315497398376,
+      "learning_rate": 6.694396726613883e-06,
+      "loss": 1.2204537391662598,
+      "step": 187
+    },
+    {
+      "epoch": 1.2803418803418802,
+      "grad_norm": 0.7618216872215271,
+      "learning_rate": 6.65899389174876e-06,
+      "loss": 1.220557451248169,
+      "step": 188
+    },
+    {
+      "epoch": 1.287179487179487,
+      "grad_norm": 0.774918794631958,
+      "learning_rate": 6.6234973460234184e-06,
+      "loss": 1.238166093826294,
+      "step": 189
+    },
+    {
+      "epoch": 1.294017094017094,
+      "grad_norm": 0.7822843790054321,
+      "learning_rate": 6.587909094515663e-06,
+      "loss": 1.2424533367156982,
+      "step": 190
+    },
+    {
+      "epoch": 1.300854700854701,
+      "grad_norm": 0.7934525012969971,
+      "learning_rate": 6.552231147483448e-06,
+      "loss": 1.1982380151748657,
+      "step": 191
+    },
+    {
+      "epoch": 1.3076923076923077,
+      "grad_norm": 0.7817178964614868,
+      "learning_rate": 6.5164655202513135e-06,
+      "loss": 1.205663800239563,
+      "step": 192
+    },
+    {
+      "epoch": 1.3145299145299145,
+      "grad_norm": 0.8002380728721619,
+      "learning_rate": 6.480614233096558e-06,
+      "loss": 1.1866426467895508,
+      "step": 193
+    },
+    {
+      "epoch": 1.3213675213675213,
+      "grad_norm": 0.7488191723823547,
+      "learning_rate": 6.444679311135112e-06,
+      "loss": 1.2407163381576538,
+      "step": 194
+    },
+    {
+      "epoch": 1.3282051282051281,
+      "grad_norm": 0.8069729208946228,
+      "learning_rate": 6.408662784207149e-06,
+      "loss": 1.2296785116195679,
+      "step": 195
+    },
+    {
+      "epoch": 1.335042735042735,
+      "grad_norm": 0.8026877641677856,
+      "learning_rate": 6.372566686762427e-06,
+      "loss": 1.228287696838379,
+      "step": 196
+    },
+    {
+      "epoch": 1.341880341880342,
+      "grad_norm": 0.7794991731643677,
+      "learning_rate": 6.336393057745365e-06,
+      "loss": 1.2325451374053955,
+      "step": 197
+    },
+    {
+      "epoch": 1.3487179487179488,
+      "grad_norm": 0.7851534485816956,
+      "learning_rate": 6.300143940479881e-06,
+      "loss": 1.2433525323867798,
+      "step": 198
+    },
+    {
+      "epoch": 1.3555555555555556,
+      "grad_norm": 0.7642512321472168,
+      "learning_rate": 6.2638213825539595e-06,
+      "loss": 1.2330515384674072,
+      "step": 199
+    },
+    {
+      "epoch": 1.3623931623931624,
+      "grad_norm": 0.8071786165237427,
+      "learning_rate": 6.227427435703997e-06,
+      "loss": 1.2169106006622314,
+      "step": 200
+    },
+    {
+      "epoch": 1.3692307692307693,
+      "grad_norm": 0.7421261668205261,
+      "learning_rate": 6.190964155698903e-06,
+      "loss": 1.1981184482574463,
+      "step": 201
+    },
+    {
+      "epoch": 1.376068376068376,
+      "grad_norm": 0.7663130760192871,
+      "learning_rate": 6.154433602223979e-06,
+      "loss": 1.184199333190918,
+      "step": 202
+    },
+    {
+      "epoch": 1.3829059829059829,
+      "grad_norm": 0.778105616569519,
+      "learning_rate": 6.117837838764579e-06,
+      "loss": 1.1941637992858887,
+      "step": 203
+    },
+    {
+      "epoch": 1.3897435897435897,
+      "grad_norm": 0.7876622676849365,
+      "learning_rate": 6.0811789324895365e-06,
+      "loss": 1.1943039894104004,
+      "step": 204
+    },
+    {
+      "epoch": 1.3965811965811965,
+      "grad_norm": 0.7890434861183167,
+      "learning_rate": 6.044458954134411e-06,
+      "loss": 1.1947365999221802,
+      "step": 205
+    },
+    {
+      "epoch": 1.4034188034188033,
+      "grad_norm": 0.7558045387268066,
+      "learning_rate": 6.0076799778845105e-06,
+      "loss": 1.1994682550430298,
+      "step": 206
+    },
+    {
+      "epoch": 1.4102564102564101,
+      "grad_norm": 0.7472313046455383,
+      "learning_rate": 5.970844081257734e-06,
+      "loss": 1.210819959640503,
+      "step": 207
+    },
+    {
+      "epoch": 1.4170940170940172,
+      "grad_norm": 0.7487971782684326,
+      "learning_rate": 5.933953344987215e-06,
+      "loss": 1.1884093284606934,
+      "step": 208
+    },
+    {
+      "epoch": 1.423931623931624,
+      "grad_norm": 0.7524631023406982,
+      "learning_rate": 5.897009852903792e-06,
+      "loss": 1.2101268768310547,
+      "step": 209
+    },
+    {
+      "epoch": 1.4307692307692308,
+      "grad_norm": 0.7583618760108948,
+      "learning_rate": 5.860015691818292e-06,
+      "loss": 1.214969515800476,
+      "step": 210
+    },
+    {
+      "epoch": 1.4376068376068376,
+      "grad_norm": 0.7619627118110657,
+      "learning_rate": 5.82297295140367e-06,
+      "loss": 1.1723865270614624,
+      "step": 211
+    },
+    {
+      "epoch": 1.4444444444444444,
+      "grad_norm": 0.782787024974823,
+      "learning_rate": 5.78588372407695e-06,
+      "loss": 1.2125704288482666,
+      "step": 212
+    },
+    {
+      "epoch": 1.4512820512820512,
+      "grad_norm": 0.7758169174194336,
+      "learning_rate": 5.748750104881051e-06,
+      "loss": 1.219278335571289,
+      "step": 213
+    },
+    {
+      "epoch": 1.458119658119658,
+      "grad_norm": 0.7914722561836243,
+      "learning_rate": 5.711574191366427e-06,
+      "loss": 1.2299978733062744,
+      "step": 214
+    },
+    {
+      "epoch": 1.464957264957265,
+      "grad_norm": 0.7562519907951355,
+      "learning_rate": 5.674358083472598e-06,
+      "loss": 1.1945183277130127,
+      "step": 215
+    },
+    {
+      "epoch": 1.471794871794872,
+      "grad_norm": 0.7890987396240234,
+      "learning_rate": 5.637103883409525e-06,
+      "loss": 1.228225827217102,
+      "step": 216
+    },
+    {
+      "epoch": 1.4786324786324787,
+      "grad_norm": 0.7438657879829407,
+      "learning_rate": 5.599813695538866e-06,
+      "loss": 1.1812902688980103,
+      "step": 217
+    },
+    {
+      "epoch": 1.4854700854700855,
+      "grad_norm": 0.7696713805198669,
+      "learning_rate": 5.562489626255104e-06,
+      "loss": 1.2277076244354248,
+      "step": 218
+    },
+    {
+      "epoch": 1.4923076923076923,
+      "grad_norm": 0.8019750714302063,
+      "learning_rate": 5.52513378386657e-06,
+      "loss": 1.2309683561325073,
+      "step": 219
+    },
+    {
+      "epoch": 1.4991452991452991,
+      "grad_norm": 0.7668002247810364,
+      "learning_rate": 5.487748278476342e-06,
+      "loss": 1.2046821117401123,
+      "step": 220
+    },
+    {
+      "epoch": 1.4991452991452991,
+      "eval_loss": 1.3131194114685059,
+      "eval_runtime": 24.7008,
+      "eval_samples_per_second": 39.918,
+      "eval_steps_per_second": 5.02,
+      "step": 220
+    },
+    {
+      "epoch": 1.505982905982906,
+      "grad_norm": 0.7732208967208862,
+      "learning_rate": 5.450335221863068e-06,
+      "loss": 1.2219358682632446,
+      "step": 221
+    },
+    {
+      "epoch": 1.5128205128205128,
+      "grad_norm": 0.7456432580947876,
+      "learning_rate": 5.412896727361663e-06,
+      "loss": 1.2196807861328125,
+      "step": 222
+    },
+    {
+      "epoch": 1.5196581196581196,
+      "grad_norm": 0.7411943674087524,
+      "learning_rate": 5.375434909743942e-06,
+      "loss": 1.2303682565689087,
+      "step": 223
+    },
+    {
+      "epoch": 1.5264957264957264,
+      "grad_norm": 0.7763144373893738,
+      "learning_rate": 5.337951885099167e-06,
+      "loss": 1.188888669013977,
+      "step": 224
+    },
+    {
+      "epoch": 1.5333333333333332,
+      "grad_norm": 0.8138889074325562,
+      "learning_rate": 5.300449770714502e-06,
+      "loss": 1.1965391635894775,
+      "step": 225
+    },
+    {
+      "epoch": 1.54017094017094,
+      "grad_norm": 0.7770660519599915,
+      "learning_rate": 5.262930684955439e-06,
+      "loss": 1.233127474784851,
+      "step": 226
+    },
+    {
+      "epoch": 1.547008547008547,
+      "grad_norm": 0.7718791961669922,
+      "learning_rate": 5.225396747146112e-06,
+      "loss": 1.240120768547058,
+      "step": 227
+    },
+    {
+      "epoch": 1.5538461538461539,
+      "grad_norm": 0.7710370421409607,
+      "learning_rate": 5.187850077449604e-06,
+      "loss": 1.202008605003357,
+      "step": 228
+    },
+    {
+      "epoch": 1.5606837606837607,
+      "grad_norm": 0.7775757908821106,
+      "learning_rate": 5.150292796748174e-06,
+      "loss": 1.2269346714019775,
+      "step": 229
+    },
+    {
+      "epoch": 1.5675213675213675,
+      "grad_norm": 0.7479456067085266,
+      "learning_rate": 5.112727026523461e-06,
+      "loss": 1.1906824111938477,
+      "step": 230
+    },
+    {
+      "epoch": 1.5743589743589743,
+      "grad_norm": 0.7567362189292908,
+      "learning_rate": 5.075154888736653e-06,
+      "loss": 1.1966190338134766,
+      "step": 231
+    },
+    {
+      "epoch": 1.5811965811965814,
+      "grad_norm": 0.7536229491233826,
+      "learning_rate": 5.03757850570861e-06,
+      "loss": 1.1917792558670044,
+      "step": 232
+    },
+    {
+      "epoch": 1.5880341880341882,
+      "grad_norm": 0.7776764035224915,
+      "learning_rate": 5e-06,
+      "loss": 1.1941741704940796,
+      "step": 233
+    },
+    {
+      "epoch": 1.594871794871795,
+      "grad_norm": 0.7667071223258972,
+      "learning_rate": 4.9624214942913916e-06,
+      "loss": 1.1881437301635742,
+      "step": 234
+    },
+    {
+      "epoch": 1.6017094017094018,
+      "grad_norm": 0.773404061794281,
+      "learning_rate": 4.924845111263349e-06,
+      "loss": 1.2190567255020142,
+      "step": 235
+    },
+    {
+      "epoch": 1.6085470085470086,
+      "grad_norm": 0.7392263412475586,
+      "learning_rate": 4.88727297347654e-06,
+      "loss": 1.2026817798614502,
+      "step": 236
+    },
+    {
+      "epoch": 1.6153846153846154,
+      "grad_norm": 0.7713451981544495,
+      "learning_rate": 4.8497072032518274e-06,
+      "loss": 1.2358677387237549,
+      "step": 237
+    },
+    {
+      "epoch": 1.6222222222222222,
+      "grad_norm": 0.7625684142112732,
+      "learning_rate": 4.8121499225503974e-06,
+      "loss": 1.1716538667678833,
+      "step": 238
+    },
+    {
+      "epoch": 1.629059829059829,
+      "grad_norm": 0.7581425309181213,
+      "learning_rate": 4.774603252853889e-06,
+      "loss": 1.1988354921340942,
+      "step": 239
+    },
+    {
+      "epoch": 1.6358974358974359,
+      "grad_norm": 0.751584529876709,
+      "learning_rate": 4.737069315044562e-06,
+      "loss": 1.2101967334747314,
+      "step": 240
+    },
+    {
+      "epoch": 1.6427350427350427,
+      "grad_norm": 0.7554129362106323,
+      "learning_rate": 4.699550229285499e-06,
+      "loss": 1.202675223350525,
+      "step": 241
+    },
+    {
+      "epoch": 1.6495726495726495,
+      "grad_norm": 0.761131227016449,
+      "learning_rate": 4.662048114900837e-06,
+      "loss": 1.201820731163025,
+      "step": 242
+    },
+    {
+      "epoch": 1.6564102564102563,
+      "grad_norm": 0.7265458703041077,
+      "learning_rate": 4.624565090256059e-06,
+      "loss": 1.2179176807403564,
+      "step": 243
+    },
+    {
+      "epoch": 1.6632478632478631,
+      "grad_norm": 0.767880916595459,
+      "learning_rate": 4.587103272638339e-06,
+      "loss": 1.1769942045211792,
+      "step": 244
+    },
+    {
+      "epoch": 1.67008547008547,
+      "grad_norm": 0.7633269429206848,
+      "learning_rate": 4.549664778136933e-06,
+      "loss": 1.2298530340194702,
+      "step": 245
+    },
+    {
+      "epoch": 1.676923076923077,
+      "grad_norm": 0.7275070548057556,
+      "learning_rate": 4.512251721523659e-06,
+      "loss": 1.2158825397491455,
+      "step": 246
+    },
+    {
+      "epoch": 1.6837606837606838,
+      "grad_norm": 0.7592760920524597,
+      "learning_rate": 4.4748662161334335e-06,
+      "loss": 1.207166314125061,
+      "step": 247
+    },
+    {
+      "epoch": 1.6905982905982906,
+      "grad_norm": 0.7778440713882446,
+      "learning_rate": 4.437510373744897e-06,
+      "loss": 1.2096598148345947,
+      "step": 248
+    },
+    {
+      "epoch": 1.6974358974358974,
+      "grad_norm": 0.7637122869491577,
+      "learning_rate": 4.400186304461136e-06,
+      "loss": 1.1851915121078491,
+      "step": 249
+    },
+    {
+      "epoch": 1.7042735042735044,
+      "grad_norm": 0.7784591317176819,
+      "learning_rate": 4.362896116590475e-06,
+      "loss": 1.2293877601623535,
+      "step": 250
+    },
+    {
+      "epoch": 1.7111111111111112,
+      "grad_norm": 0.8099437355995178,
+      "learning_rate": 4.325641916527405e-06,
+      "loss": 1.2101249694824219,
+      "step": 251
+    },
+    {
+      "epoch": 1.717948717948718,
+      "grad_norm": 0.7552655339241028,
+      "learning_rate": 4.2884258086335755e-06,
+      "loss": 1.2240850925445557,
+      "step": 252
+    },
+    {
+      "epoch": 1.7247863247863249,
+      "grad_norm": 0.7730560898780823,
+      "learning_rate": 4.25124989511895e-06,
+      "loss": 1.2249057292938232,
+      "step": 253
+    },
+    {
+      "epoch": 1.7316239316239317,
+      "grad_norm": 0.7381757497787476,
+      "learning_rate": 4.214116275923051e-06,
+      "loss": 1.1832340955734253,
+      "step": 254
+    },
+    {
+      "epoch": 1.7384615384615385,
+      "grad_norm": 0.739567756652832,
+      "learning_rate": 4.17702704859633e-06,
+      "loss": 1.200039267539978,
+      "step": 255
+    },
+    {
+      "epoch": 1.7452991452991453,
+      "grad_norm": 0.774598240852356,
+      "learning_rate": 4.1399843081817085e-06,
+      "loss": 1.2123297452926636,
+      "step": 256
+    },
+    {
+      "epoch": 1.7521367521367521,
+      "grad_norm": 0.8052539229393005,
+      "learning_rate": 4.1029901470962105e-06,
+      "loss": 1.2242088317871094,
+      "step": 257
+    },
+    {
+      "epoch": 1.758974358974359,
+      "grad_norm": 0.7723326683044434,
+      "learning_rate": 4.066046655012786e-06,
+      "loss": 1.2281506061553955,
+      "step": 258
+    },
+    {
+      "epoch": 1.7658119658119658,
+      "grad_norm": 0.7577686309814453,
+      "learning_rate": 4.029155918742268e-06,
+      "loss": 1.2183786630630493,
+      "step": 259
+    },
+    {
+      "epoch": 1.7726495726495726,
+      "grad_norm": 0.7814478278160095,
+      "learning_rate": 3.992320022115492e-06,
+      "loss": 1.2138553857803345,
+      "step": 260
+    },
+    {
+      "epoch": 1.7794871794871794,
+      "grad_norm": 0.7868865132331848,
+      "learning_rate": 3.955541045865591e-06,
+      "loss": 1.1890326738357544,
+      "step": 261
+    },
+    {
+      "epoch": 1.7863247863247862,
+      "grad_norm": 0.7574802041053772,
+      "learning_rate": 3.918821067510464e-06,
+      "loss": 1.1699459552764893,
+      "step": 262
+    },
+    {
+      "epoch": 1.793162393162393,
+      "grad_norm": 0.7787984013557434,
+      "learning_rate": 3.882162161235421e-06,
+      "loss": 1.1902029514312744,
+      "step": 263
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.780857264995575,
+      "learning_rate": 3.845566397776022e-06,
+      "loss": 1.1960508823394775,
+      "step": 264
+    },
+    {
+      "epoch": 1.8,
+      "eval_loss": 1.308773159980774,
+      "eval_runtime": 24.5858,
+      "eval_samples_per_second": 40.104,
+      "eval_steps_per_second": 5.044,
+      "step": 264
+    },
+    {
+      "epoch": 1.8068376068376069,
+      "grad_norm": 0.7353282570838928,
+      "learning_rate": 3.8090358443010993e-06,
+      "loss": 1.2238385677337646,
+      "step": 265
+    },
+    {
+      "epoch": 1.8136752136752137,
+      "grad_norm": 0.7844496369361877,
+      "learning_rate": 3.7725725642960047e-06,
+      "loss": 1.2065067291259766,
+      "step": 266
+    },
+    {
+      "epoch": 1.8205128205128205,
+      "grad_norm": 0.7792806029319763,
+      "learning_rate": 3.7361786174460414e-06,
+      "loss": 1.1908563375473022,
+      "step": 267
+    },
+    {
+      "epoch": 1.8273504273504273,
+      "grad_norm": 0.7404017448425293,
+      "learning_rate": 3.6998560595201188e-06,
+      "loss": 1.2162412405014038,
+      "step": 268
+    },
+    {
+      "epoch": 1.8341880341880343,
+      "grad_norm": 0.7953075170516968,
+      "learning_rate": 3.6636069422546363e-06,
+      "loss": 1.2134095430374146,
+      "step": 269
+    },
+    {
+      "epoch": 1.8410256410256411,
+      "grad_norm": 0.7584754824638367,
+      "learning_rate": 3.627433313237576e-06,
+      "loss": 1.2177472114562988,
+      "step": 270
+    },
+    {
+      "epoch": 1.847863247863248,
+      "grad_norm": 0.7290381789207458,
+      "learning_rate": 3.5913372157928515e-06,
+      "loss": 1.189732551574707,
+      "step": 271
+    },
+    {
+      "epoch": 1.8547008547008548,
+      "grad_norm": 0.7861201763153076,
+      "learning_rate": 3.555320688864889e-06,
+      "loss": 1.2073522806167603,
+      "step": 272
+    },
+    {
+      "epoch": 1.8615384615384616,
+      "grad_norm": 0.7544710636138916,
+      "learning_rate": 3.519385766903442e-06,
+      "loss": 1.2041759490966797,
+      "step": 273
+    },
+    {
+      "epoch": 1.8683760683760684,
+      "grad_norm": 0.7539916038513184,
+      "learning_rate": 3.483534479748688e-06,
+      "loss": 1.2057629823684692,
+      "step": 274
+    },
+    {
+      "epoch": 1.8752136752136752,
+      "grad_norm": 0.7374740242958069,
+      "learning_rate": 3.447768852516554e-06,
+      "loss": 1.2203168869018555,
+      "step": 275
+    },
+    {
+      "epoch": 1.882051282051282,
+      "grad_norm": 0.7594785690307617,
+      "learning_rate": 3.4120909054843375e-06,
+      "loss": 1.182802438735962,
+      "step": 276
+    },
+    {
+      "epoch": 1.8888888888888888,
+      "grad_norm": 0.7542571425437927,
+      "learning_rate": 3.3765026539765832e-06,
+      "loss": 1.2168110609054565,
+      "step": 277
+    },
+    {
+      "epoch": 1.8957264957264957,
+      "grad_norm": 0.7577287554740906,
+      "learning_rate": 3.3410061082512422e-06,
+      "loss": 1.2106308937072754,
+      "step": 278
+    },
+    {
+      "epoch": 1.9025641025641025,
+      "grad_norm": 0.7561420798301697,
+      "learning_rate": 3.3056032733861188e-06,
+      "loss": 1.20242440700531,
+      "step": 279
+    },
+    {
+      "epoch": 1.9094017094017093,
+      "grad_norm": 0.7456007599830627,
+      "learning_rate": 3.2702961491656197e-06,
+      "loss": 1.2251598834991455,
+      "step": 280
+    },
+    {
+      "epoch": 1.916239316239316,
+      "grad_norm": 0.790366530418396,
+      "learning_rate": 3.2350867299677802e-06,
+      "loss": 1.2062650918960571,
+      "step": 281
+    },
+    {
+      "epoch": 1.9230769230769231,
+      "grad_norm": 0.7317772507667542,
+      "learning_rate": 3.1999770046516198e-06,
+      "loss": 1.1729378700256348,
+      "step": 282
+    },
+    {
+      "epoch": 1.92991452991453,
+      "grad_norm": 0.7773919105529785,
+      "learning_rate": 3.164968956444791e-06,
+      "loss": 1.1983883380889893,
+      "step": 283
+    },
+    {
+      "epoch": 1.9367521367521368,
+      "grad_norm": 0.7585593461990356,
+      "learning_rate": 3.130064562831553e-06,
+      "loss": 1.2086600065231323,
+      "step": 284
+    },
+    {
+      "epoch": 1.9435897435897436,
+      "grad_norm": 0.7703876495361328,
+      "learning_rate": 3.0952657954410792e-06,
+      "loss": 1.2189124822616577,
+      "step": 285
+    },
+    {
+      "epoch": 1.9504273504273504,
+      "grad_norm": 0.7693601250648499,
+      "learning_rate": 3.0605746199360755e-06,
+      "loss": 1.210176706314087,
+      "step": 286
+    },
+    {
+      "epoch": 1.9572649572649574,
+      "grad_norm": 0.7466776967048645,
+      "learning_rate": 3.0259929959017585e-06,
+      "loss": 1.2027801275253296,
+      "step": 287
+    },
+    {
+      "epoch": 1.9641025641025642,
+      "grad_norm": 0.772388219833374,
+      "learning_rate": 2.991522876735154e-06,
+      "loss": 1.2112243175506592,
+      "step": 288
+    },
+    {
+      "epoch": 1.970940170940171,
+      "grad_norm": 0.7715580463409424,
+      "learning_rate": 2.95716620953476e-06,
+      "loss": 1.1904889345169067,
+      "step": 289
+    },
+    {
+      "epoch": 1.9777777777777779,
+      "grad_norm": 0.7397588491439819,
+      "learning_rate": 2.9229249349905686e-06,
+      "loss": 1.1913639307022095,
+      "step": 290
+    },
+    {
+      "epoch": 1.9846153846153847,
+      "grad_norm": 0.7530134916305542,
+      "learning_rate": 2.8888009872744332e-06,
+      "loss": 1.2205219268798828,
+      "step": 291
+    },
+    {
+      "epoch": 1.9914529914529915,
+      "grad_norm": 0.7689472436904907,
+      "learning_rate": 2.8547962939308187e-06,
+      "loss": 1.2000938653945923,
+      "step": 292
+    },
+    {
+      "epoch": 1.9982905982905983,
+      "grad_norm": 0.7348621487617493,
+      "learning_rate": 2.8209127757679246e-06,
+      "loss": 1.1786831617355347,
+      "step": 293
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.537250280380249,
+      "learning_rate": 2.787152346749173e-06,
+      "loss": 1.1778086423873901,
+      "step": 294
+    },
+    {
+      "epoch": 2.006837606837607,
+      "grad_norm": 0.9093112945556641,
+      "learning_rate": 2.7535169138851124e-06,
+      "loss": 1.1308534145355225,
+      "step": 295
+    },
+    {
+      "epoch": 2.0136752136752136,
+      "grad_norm": 0.895119845867157,
+      "learning_rate": 2.720008377125682e-06,
+      "loss": 1.1030248403549194,
+      "step": 296
+    },
+    {
+      "epoch": 2.0205128205128204,
+      "grad_norm": 0.822189211845398,
+      "learning_rate": 2.686628629252899e-06,
+      "loss": 1.0862432718276978,
+      "step": 297
+    },
+    {
+      "epoch": 2.0273504273504273,
+      "grad_norm": 0.839640200138092,
+      "learning_rate": 2.6533795557739407e-06,
+      "loss": 1.0923850536346436,
+      "step": 298
+    },
+    {
+      "epoch": 2.034188034188034,
+      "grad_norm": 0.7948157787322998,
+      "learning_rate": 2.6202630348146323e-06,
+      "loss": 1.1080037355422974,
+      "step": 299
+    },
+    {
+      "epoch": 2.041025641025641,
+      "grad_norm": 0.7708576321601868,
+      "learning_rate": 2.5872809370133704e-06,
+      "loss": 1.133652687072754,
+      "step": 300
+    },
+    {
+      "epoch": 2.0478632478632477,
+      "grad_norm": 0.784568727016449,
+      "learning_rate": 2.5544351254154407e-06,
+      "loss": 1.1596778631210327,
+      "step": 301
+    },
+    {
+      "epoch": 2.0547008547008545,
+      "grad_norm": 0.8119481205940247,
+      "learning_rate": 2.5217274553677975e-06,
+      "loss": 1.129364252090454,
+      "step": 302
+    },
+    {
+      "epoch": 2.0615384615384613,
+      "grad_norm": 0.7969528436660767,
+      "learning_rate": 2.489159774414252e-06,
+      "loss": 1.0949797630310059,
+      "step": 303
+    },
+    {
+      "epoch": 2.0683760683760686,
+      "grad_norm": 0.823360800743103,
+      "learning_rate": 2.4567339221911086e-06,
+      "loss": 1.1301119327545166,
+      "step": 304
+    },
+    {
+      "epoch": 2.0752136752136754,
+      "grad_norm": 0.8292282223701477,
+      "learning_rate": 2.424451730323261e-06,
+      "loss": 1.1120922565460205,
+      "step": 305
+    },
+    {
+      "epoch": 2.082051282051282,
+      "grad_norm": 0.8004986047744751,
+      "learning_rate": 2.3923150223207176e-06,
+      "loss": 1.1214550733566284,
+      "step": 306
+    },
+    {
+      "epoch": 2.088888888888889,
+      "grad_norm": 0.8165397644042969,
+      "learning_rate": 2.3603256134756066e-06,
+      "loss": 1.1209532022476196,
+      "step": 307
+    },
+    {
+      "epoch": 2.095726495726496,
+      "grad_norm": 0.8034455180168152,
+      "learning_rate": 2.328485310759635e-06,
+      "loss": 1.1401094198226929,
+      "step": 308
+    },
+    {
+      "epoch": 2.095726495726496,
+      "eval_loss": 1.3253560066223145,
+      "eval_runtime": 24.6122,
+      "eval_samples_per_second": 40.061,
+      "eval_steps_per_second": 5.038,
+      "step": 308
+    },
+    {
+      "epoch": 2.1025641025641026,
+      "grad_norm": 0.7844864130020142,
+      "learning_rate": 2.296795912722014e-06,
+      "loss": 1.144791603088379,
+      "step": 309
+    },
+    {
+      "epoch": 2.1094017094017095,
+      "grad_norm": 0.7857894897460938,
+      "learning_rate": 2.265259209387867e-06,
+      "loss": 1.1488922834396362,
+      "step": 310
+    },
+    {
+      "epoch": 2.1162393162393163,
+      "grad_norm": 0.7851693630218506,
+      "learning_rate": 2.2338769821571225e-06,
+      "loss": 1.1399354934692383,
+      "step": 311
+    },
+    {
+      "epoch": 2.123076923076923,
+      "grad_norm": 0.8227202296257019,
+      "learning_rate": 2.202651003703885e-06,
+      "loss": 1.1063587665557861,
+      "step": 312
+    },
+    {
+      "epoch": 2.12991452991453,
+      "grad_norm": 0.822938084602356,
+      "learning_rate": 2.1715830378763025e-06,
+      "loss": 1.1050540208816528,
+      "step": 313
+    },
+    {
+      "epoch": 2.1367521367521367,
+      "grad_norm": 0.8058551549911499,
+      "learning_rate": 2.140674839596931e-06,
+      "loss": 1.0922585725784302,
+      "step": 314
+    },
+    {
+      "epoch": 2.1435897435897435,
+      "grad_norm": 0.7917458415031433,
+      "learning_rate": 2.109928154763606e-06,
+      "loss": 1.1247828006744385,
+      "step": 315
+    },
+    {
+      "epoch": 2.1504273504273503,
+      "grad_norm": 0.8290326595306396,
+      "learning_rate": 2.0793447201508288e-06,
+      "loss": 1.1369386911392212,
+      "step": 316
+    },
+    {
+      "epoch": 2.157264957264957,
+      "grad_norm": 0.7832273840904236,
+      "learning_rate": 2.0489262633116536e-06,
+      "loss": 1.110697627067566,
+      "step": 317
+    },
+    {
+      "epoch": 2.164102564102564,
+      "grad_norm": 0.7919285297393799,
+      "learning_rate": 2.01867450248011e-06,
+      "loss": 1.157274842262268,
+      "step": 318
+    },
+    {
+      "epoch": 2.1709401709401708,
+      "grad_norm": 0.7776212096214294,
+      "learning_rate": 1.9885911464741413e-06,
+      "loss": 1.139618992805481,
+      "step": 319
+    },
+    {
+      "epoch": 2.1777777777777776,
+      "grad_norm": 0.7800706624984741,
+      "learning_rate": 1.9586778945990785e-06,
+      "loss": 1.1110671758651733,
+      "step": 320
+    },
+    {
+      "epoch": 2.184615384615385,
+      "grad_norm": 0.8117327094078064,
+      "learning_rate": 1.928936436551661e-06,
+      "loss": 1.1395684480667114,
+      "step": 321
+    },
+    {
+      "epoch": 2.1914529914529917,
+      "grad_norm": 0.7962910532951355,
+      "learning_rate": 1.8993684523245842e-06,
+      "loss": 1.1162846088409424,
+      "step": 322
+    },
+    {
+      "epoch": 2.1982905982905985,
+      "grad_norm": 0.7874794602394104,
+      "learning_rate": 1.8699756121115997e-06,
+      "loss": 1.1188956499099731,
+      "step": 323
+    },
+    {
+      "epoch": 2.2051282051282053,
+      "grad_norm": 0.785068690776825,
+      "learning_rate": 1.8407595762131814e-06,
+      "loss": 1.1131058931350708,
+      "step": 324
+    },
+    {
+      "epoch": 2.211965811965812,
+      "grad_norm": 0.8046601414680481,
+      "learning_rate": 1.811721994942731e-06,
+      "loss": 1.1231977939605713,
+      "step": 325
+    },
+    {
+      "epoch": 2.218803418803419,
+      "grad_norm": 0.759477972984314,
+      "learning_rate": 1.7828645085333645e-06,
+      "loss": 1.1036738157272339,
+      "step": 326
+    },
+    {
+      "epoch": 2.2256410256410257,
+      "grad_norm": 0.7955328226089478,
+      "learning_rate": 1.7541887470452606e-06,
+      "loss": 1.166395664215088,
+      "step": 327
+    },
+    {
+      "epoch": 2.2324786324786325,
+      "grad_norm": 0.7807881236076355,
+      "learning_rate": 1.7256963302735752e-06,
+      "loss": 1.1385221481323242,
+      "step": 328
+    },
+    {
+      "epoch": 2.2393162393162394,
+      "grad_norm": 0.7881447076797485,
+      "learning_rate": 1.6973888676569594e-06,
+      "loss": 1.145586609840393,
+      "step": 329
+    },
+    {
+      "epoch": 2.246153846153846,
+      "grad_norm": 0.8092402815818787,
+      "learning_rate": 1.6692679581866334e-06,
+      "loss": 1.1422295570373535,
+      "step": 330
+    },
+    {
+      "epoch": 2.252991452991453,
+      "grad_norm": 0.7870088219642639,
+      "learning_rate": 1.6413351903160763e-06,
+      "loss": 1.1302958726882935,
+      "step": 331
+    },
+    {
+      "epoch": 2.25982905982906,
+      "grad_norm": 0.8018279075622559,
+      "learning_rate": 1.6135921418712959e-06,
+      "loss": 1.114201545715332,
+      "step": 332
+    },
+    {
+      "epoch": 2.2666666666666666,
+      "grad_norm": 0.7955658435821533,
+      "learning_rate": 1.5860403799616951e-06,
+      "loss": 1.1686758995056152,
+      "step": 333
+    },
+    {
+      "epoch": 2.2735042735042734,
+      "grad_norm": 0.8098942637443542,
+      "learning_rate": 1.5586814608915673e-06,
+      "loss": 1.1103954315185547,
+      "step": 334
+    },
+    {
+      "epoch": 2.2803418803418802,
+      "grad_norm": 0.7653470039367676,
+      "learning_rate": 1.5315169300721694e-06,
+      "loss": 1.1263670921325684,
+      "step": 335
+    },
+    {
+      "epoch": 2.287179487179487,
+      "grad_norm": 0.7954714894294739,
+      "learning_rate": 1.5045483219344387e-06,
+      "loss": 1.091448187828064,
+      "step": 336
+    },
+    {
+      "epoch": 2.294017094017094,
+      "grad_norm": 0.7870411276817322,
+      "learning_rate": 1.4777771598423147e-06,
+      "loss": 1.127175211906433,
+      "step": 337
+    },
+    {
+      "epoch": 2.3008547008547007,
+      "grad_norm": 0.8070060014724731,
+      "learning_rate": 1.4512049560066837e-06,
+      "loss": 1.1385235786437988,
+      "step": 338
+    },
+    {
+      "epoch": 2.3076923076923075,
+      "grad_norm": 0.7654244303703308,
+      "learning_rate": 1.4248332113999708e-06,
+      "loss": 1.1272555589675903,
+      "step": 339
+    },
+    {
+      "epoch": 2.3145299145299143,
+      "grad_norm": 0.7763322591781616,
+      "learning_rate": 1.3986634156713418e-06,
+      "loss": 1.1271766424179077,
+      "step": 340
+    },
+    {
+      "epoch": 2.3213675213675216,
+      "grad_norm": 0.7544705867767334,
+      "learning_rate": 1.3726970470625705e-06,
+      "loss": 1.157515525817871,
+      "step": 341
+    },
+    {
+      "epoch": 2.3282051282051284,
+      "grad_norm": 0.7676778435707092,
+      "learning_rate": 1.3469355723245303e-06,
+      "loss": 1.1277141571044922,
+      "step": 342
+    },
+    {
+      "epoch": 2.335042735042735,
+      "grad_norm": 0.7713337540626526,
+      "learning_rate": 1.321380446634342e-06,
+      "loss": 1.1003583669662476,
+      "step": 343
+    },
+    {
+      "epoch": 2.341880341880342,
+      "grad_norm": 0.7740820646286011,
+      "learning_rate": 1.2960331135131826e-06,
+      "loss": 1.1071029901504517,
+      "step": 344
+    },
+    {
+      "epoch": 2.348717948717949,
+      "grad_norm": 0.758073091506958,
+      "learning_rate": 1.270895004744737e-06,
+      "loss": 1.110722303390503,
+      "step": 345
+    },
+    {
+      "epoch": 2.3555555555555556,
+      "grad_norm": 0.7693141102790833,
+      "learning_rate": 1.245967540294329e-06,
+      "loss": 1.097144365310669,
+      "step": 346
+    },
+    {
+      "epoch": 2.3623931623931624,
+      "grad_norm": 0.7613301873207092,
+      "learning_rate": 1.2212521282287093e-06,
+      "loss": 1.130142092704773,
+      "step": 347
+    },
+    {
+      "epoch": 2.3692307692307693,
+      "grad_norm": 0.7610928416252136,
+      "learning_rate": 1.1967501646365147e-06,
+      "loss": 1.1337437629699707,
+      "step": 348
+    },
+    {
+      "epoch": 2.376068376068376,
+      "grad_norm": 0.7692887187004089,
+      "learning_rate": 1.172463033549418e-06,
+      "loss": 1.1064190864562988,
+      "step": 349
+    },
+    {
+      "epoch": 2.382905982905983,
+      "grad_norm": 0.7826989889144897,
+      "learning_rate": 1.1483921068639353e-06,
+      "loss": 1.1885005235671997,
+      "step": 350
+    },
+    {
+      "epoch": 2.3897435897435897,
+      "grad_norm": 0.7613060474395752,
+      "learning_rate": 1.1245387442639456e-06,
+      "loss": 1.110337734222412,
+      "step": 351
+    },
+    {
+      "epoch": 2.3965811965811965,
+      "grad_norm": 0.7910706400871277,
+      "learning_rate": 1.1009042931438784e-06,
+      "loss": 1.1144278049468994,
+      "step": 352
+    },
+    {
+      "epoch": 2.3965811965811965,
+      "eval_loss": 1.323965311050415,
+      "eval_runtime": 24.7109,
+      "eval_samples_per_second": 39.901,
+      "eval_steps_per_second": 5.018,
+      "step": 352
+    },
+    {
+      "epoch": 2.4034188034188033,
+      "grad_norm": 0.7570564150810242,
+      "learning_rate": 1.077490088532605e-06,
+      "loss": 1.114471435546875,
+      "step": 353
+    },
+    {
+      "epoch": 2.41025641025641,
+      "grad_norm": 0.7983273863792419,
+      "learning_rate": 1.0542974530180327e-06,
+      "loss": 1.132286787033081,
+      "step": 354
+    },
+    {
+      "epoch": 2.417094017094017,
+      "grad_norm": 0.7606459856033325,
+      "learning_rate": 1.0313276966723867e-06,
+      "loss": 1.0865505933761597,
+      "step": 355
+    },
+    {
+      "epoch": 2.4239316239316238,
+      "grad_norm": 0.7879711389541626,
+      "learning_rate": 1.00858211697822e-06,
+      "loss": 1.1440324783325195,
+      "step": 356
+    },
+    {
+      "epoch": 2.430769230769231,
+      "grad_norm": 0.762718915939331,
+      "learning_rate": 9.860619987551157e-07,
+      "loss": 1.1018445491790771,
+      "step": 357
+    },
+    {
+      "epoch": 2.437606837606838,
+      "grad_norm": 0.7899941802024841,
+      "learning_rate": 9.637686140871121e-07,
+      "loss": 1.1469783782958984,
+      "step": 358
+    },
+    {
+      "epoch": 2.4444444444444446,
+      "grad_norm": 0.7909042239189148,
+      "learning_rate": 9.417032222508476e-07,
+      "loss": 1.1333407163619995,
+      "step": 359
+    },
+    {
+      "epoch": 2.4512820512820515,
+      "grad_norm": 0.7936816811561584,
+      "learning_rate": 9.198670696444339e-07,
+      "loss": 1.1438573598861694,
+      "step": 360
+    },
+    {
+      "epoch": 2.4581196581196583,
+      "grad_norm": 0.7882561683654785,
+      "learning_rate": 8.982613897170439e-07,
+      "loss": 1.1176822185516357,
+      "step": 361
+    },
+    {
+      "epoch": 2.464957264957265,
+      "grad_norm": 0.7810674905776978,
+      "learning_rate": 8.768874028992431e-07,
+      "loss": 1.135961651802063,
+      "step": 362
+    },
+    {
+      "epoch": 2.471794871794872,
+      "grad_norm": 0.7794176340103149,
+      "learning_rate": 8.557463165340479e-07,
+      "loss": 1.1315698623657227,
+      "step": 363
+    },
+    {
+      "epoch": 2.4786324786324787,
+      "grad_norm": 0.7674309611320496,
+      "learning_rate": 8.348393248087289e-07,
+      "loss": 1.1471264362335205,
+      "step": 364
+    },
+    {
+      "epoch": 2.4854700854700855,
+      "grad_norm": 0.7684411406517029,
+      "learning_rate": 8.141676086873574e-07,
+      "loss": 1.1023811101913452,
+      "step": 365
+    },
+    {
+      "epoch": 2.4923076923076923,
+      "grad_norm": 0.7729819416999817,
+      "learning_rate": 7.937323358440935e-07,
+      "loss": 1.1146825551986694,
+      "step": 366
+    },
+    {
+      "epoch": 2.499145299145299,
+      "grad_norm": 0.7710589170455933,
+      "learning_rate": 7.735346605972322e-07,
+      "loss": 1.1076273918151855,
+      "step": 367
+    },
+    {
+      "epoch": 2.505982905982906,
+      "grad_norm": 0.7700541019439697,
+      "learning_rate": 7.535757238439939e-07,
+      "loss": 1.1303023099899292,
+      "step": 368
+    },
+    {
+      "epoch": 2.5128205128205128,
+      "grad_norm": 0.7796255946159363,
+      "learning_rate": 7.338566529960817e-07,
+      "loss": 1.1434168815612793,
+      "step": 369
+    },
+    {
+      "epoch": 2.5196581196581196,
+      "grad_norm": 0.7890748977661133,
+      "learning_rate": 7.143785619160026e-07,
+      "loss": 1.137059211730957,
+      "step": 370
+    },
+    {
+      "epoch": 2.5264957264957264,
+      "grad_norm": 0.7733116149902344,
+      "learning_rate": 6.951425508541432e-07,
+      "loss": 1.1050790548324585,
+      "step": 371
+    },
+    {
+      "epoch": 2.533333333333333,
+      "grad_norm": 0.7718008160591125,
+      "learning_rate": 6.761497063866207e-07,
+      "loss": 1.1239290237426758,
+      "step": 372
+    },
+    {
+      "epoch": 2.54017094017094,
+      "grad_norm": 0.7675129771232605,
+      "learning_rate": 6.574011013539111e-07,
+      "loss": 1.1362709999084473,
+      "step": 373
+    },
+    {
+      "epoch": 2.547008547008547,
+      "grad_norm": 0.7831134796142578,
+      "learning_rate": 6.388977948002406e-07,
+      "loss": 1.1359511613845825,
+      "step": 374
+    },
+    {
+      "epoch": 2.5538461538461537,
+      "grad_norm": 0.7688263654708862,
+      "learning_rate": 6.206408319137703e-07,
+      "loss": 1.1311153173446655,
+      "step": 375
+    },
+    {
+      "epoch": 2.5606837606837605,
+      "grad_norm": 0.7608706951141357,
+      "learning_rate": 6.026312439675553e-07,
+      "loss": 1.1158239841461182,
+      "step": 376
+    },
+    {
+      "epoch": 2.5675213675213673,
+      "grad_norm": 0.7655665278434753,
+      "learning_rate": 5.848700482612873e-07,
+      "loss": 1.1498501300811768,
+      "step": 377
+    },
+    {
+      "epoch": 2.574358974358974,
+      "grad_norm": 0.7795934081077576,
+      "learning_rate": 5.673582480638395e-07,
+      "loss": 1.1341049671173096,
+      "step": 378
+    },
+    {
+      "epoch": 2.5811965811965814,
+      "grad_norm": 0.7773811221122742,
+      "learning_rate": 5.500968325565859e-07,
+      "loss": 1.1404979228973389,
+      "step": 379
+    },
+    {
+      "epoch": 2.588034188034188,
+      "grad_norm": 0.8611118793487549,
+      "learning_rate": 5.330867767775333e-07,
+      "loss": 1.0921636819839478,
+      "step": 380
+    },
+    {
+      "epoch": 2.594871794871795,
+      "grad_norm": 0.745428204536438,
+      "learning_rate": 5.163290415662408e-07,
+      "loss": 1.1557259559631348,
+      "step": 381
+    },
+    {
+      "epoch": 2.601709401709402,
+      "grad_norm": 0.7756429314613342,
+      "learning_rate": 4.998245735095459e-07,
+      "loss": 1.1447691917419434,
+      "step": 382
+    },
+    {
+      "epoch": 2.6085470085470086,
+      "grad_norm": 0.7908133864402771,
+      "learning_rate": 4.835743048880959e-07,
+      "loss": 1.143109917640686,
+      "step": 383
+    },
+    {
+      "epoch": 2.6153846153846154,
+      "grad_norm": 0.7732424736022949,
+      "learning_rate": 4.6757915362368567e-07,
+      "loss": 1.132035493850708,
+      "step": 384
+    },
+    {
+      "epoch": 2.6222222222222222,
+      "grad_norm": 0.7889422178268433,
+      "learning_rate": 4.5184002322740784e-07,
+      "loss": 1.1180846691131592,
+      "step": 385
+    },
+    {
+      "epoch": 2.629059829059829,
+      "grad_norm": 0.7938551902770996,
+      "learning_rate": 4.363578027486187e-07,
+      "loss": 1.1456289291381836,
+      "step": 386
+    },
+    {
+      "epoch": 2.635897435897436,
+      "grad_norm": 0.8030667901039124,
+      "learning_rate": 4.211333667247125e-07,
+      "loss": 1.1397569179534912,
+      "step": 387
+    },
+    {
+      "epoch": 2.6427350427350427,
+      "grad_norm": 0.7819530367851257,
+      "learning_rate": 4.0616757513173123e-07,
+      "loss": 1.1004501581192017,
+      "step": 388
+    },
+    {
+      "epoch": 2.6495726495726495,
+      "grad_norm": 0.758314311504364,
+      "learning_rate": 3.9146127333577757e-07,
+      "loss": 1.1101858615875244,
+      "step": 389
+    },
+    {
+      "epoch": 2.6564102564102563,
+      "grad_norm": 0.7801131010055542,
+      "learning_rate": 3.7701529204526856e-07,
+      "loss": 1.1453076601028442,
+      "step": 390
+    },
+    {
+      "epoch": 2.663247863247863,
+      "grad_norm": 0.7489244937896729,
+      "learning_rate": 3.6283044726401594e-07,
+      "loss": 1.0911612510681152,
+      "step": 391
+    },
+    {
+      "epoch": 2.67008547008547,
+      "grad_norm": 0.761225700378418,
+      "learning_rate": 3.4890754024512254e-07,
+      "loss": 1.130741000175476,
+      "step": 392
+    },
+    {
+      "epoch": 2.676923076923077,
+      "grad_norm": 0.761887788772583,
+      "learning_rate": 3.352473574457304e-07,
+      "loss": 1.120837926864624,
+      "step": 393
+    },
+    {
+      "epoch": 2.683760683760684,
+      "grad_norm": 0.7792303562164307,
+      "learning_rate": 3.2185067048259245e-07,
+      "loss": 1.1177864074707031,
+      "step": 394
+    },
+    {
+      "epoch": 2.690598290598291,
+      "grad_norm": 0.7689954042434692,
+      "learning_rate": 3.087182360884872e-07,
+      "loss": 1.177292823791504,
+      "step": 395
+    },
+    {
+      "epoch": 2.6974358974358976,
+      "grad_norm": 0.7710866332054138,
+      "learning_rate": 2.9585079606947843e-07,
+      "loss": 1.1195672750473022,
+      "step": 396
+    },
+    {
+      "epoch": 2.6974358974358976,
+      "eval_loss": 1.3236175775527954,
+      "eval_runtime": 24.7082,
+      "eval_samples_per_second": 39.906,
+      "eval_steps_per_second": 5.019,
+      "step": 396
+    },
+    {
+      "epoch": 2.7042735042735044,
+      "grad_norm": 0.7776737809181213,
+      "learning_rate": 2.8324907726300366e-07,
+      "loss": 1.113619327545166,
+      "step": 397
+    },
+    {
+      "epoch": 2.7111111111111112,
+      "grad_norm": 0.7743112444877625,
+      "learning_rate": 2.7091379149682683e-07,
+      "loss": 1.0938081741333008,
+      "step": 398
+    },
+    {
+      "epoch": 2.717948717948718,
+      "grad_norm": 0.7779694199562073,
+      "learning_rate": 2.5884563554882336e-07,
+      "loss": 1.1138122081756592,
+      "step": 399
+    },
+    {
+      "epoch": 2.724786324786325,
+      "grad_norm": 0.7622742652893066,
+      "learning_rate": 2.470452911076227e-07,
+      "loss": 1.1006677150726318,
+      "step": 400
+    },
+    {
+      "epoch": 2.7316239316239317,
+      "grad_norm": 0.7664272785186768,
+      "learning_rate": 2.355134247341073e-07,
+      "loss": 1.1065200567245483,
+      "step": 401
+    },
+    {
+      "epoch": 2.7384615384615385,
+      "grad_norm": 0.7712447643280029,
+      "learning_rate": 2.242506878237538e-07,
+      "loss": 1.1020417213439941,
+      "step": 402
+    },
+    {
+      "epoch": 2.7452991452991453,
+      "grad_norm": 0.7656382322311401,
+      "learning_rate": 2.1325771656984075e-07,
+      "loss": 1.1001569032669067,
+      "step": 403
+    },
+    {
+      "epoch": 2.752136752136752,
+      "grad_norm": 0.7811654806137085,
+      "learning_rate": 2.0253513192751374e-07,
+      "loss": 1.1310510635375977,
+      "step": 404
+    },
+    {
+      "epoch": 2.758974358974359,
+      "grad_norm": 0.7687283158302307,
+      "learning_rate": 1.9208353957870684e-07,
+      "loss": 1.146543264389038,
+      "step": 405
+    },
+    {
+      "epoch": 2.7658119658119658,
+      "grad_norm": 0.7670867443084717,
+      "learning_rate": 1.8190352989793325e-07,
+      "loss": 1.1161731481552124,
+      "step": 406
+    },
+    {
+      "epoch": 2.7726495726495726,
+      "grad_norm": 0.7807978391647339,
+      "learning_rate": 1.7199567791893524e-07,
+      "loss": 1.1282137632369995,
+      "step": 407
+    },
+    {
+      "epoch": 2.7794871794871794,
+      "grad_norm": 0.7957569360733032,
+      "learning_rate": 1.6236054330219853e-07,
+      "loss": 1.1041632890701294,
+      "step": 408
+    },
+    {
+      "epoch": 2.786324786324786,
+      "grad_norm": 0.7832216024398804,
+      "learning_rate": 1.5299867030334815e-07,
+      "loss": 1.108730435371399,
+      "step": 409
+    },
+    {
+      "epoch": 2.793162393162393,
+      "grad_norm": 0.753606915473938,
+      "learning_rate": 1.439105877423963e-07,
+      "loss": 1.131809115409851,
+      "step": 410
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.7802961468696594,
+      "learning_rate": 1.350968089738758e-07,
+      "loss": 1.1083602905273438,
+      "step": 411
+    },
+    {
+      "epoch": 2.8068376068376066,
+      "grad_norm": 0.768670380115509,
+      "learning_rate": 1.2655783185784253e-07,
+      "loss": 1.1080389022827148,
+      "step": 412
+    },
+    {
+      "epoch": 2.8136752136752134,
+      "grad_norm": 0.7562652230262756,
+      "learning_rate": 1.1829413873174988e-07,
+      "loss": 1.1086317300796509,
+      "step": 413
+    },
+    {
+      "epoch": 2.8205128205128203,
+      "grad_norm": 0.763107180595398,
+      "learning_rate": 1.1030619638320805e-07,
+      "loss": 1.1433099508285522,
+      "step": 414
+    },
+    {
+      "epoch": 2.827350427350427,
+      "grad_norm": 0.7749531865119934,
+      "learning_rate": 1.0259445602361084e-07,
+      "loss": 1.129563331604004,
+      "step": 415
+    },
+    {
+      "epoch": 2.8341880341880343,
+      "grad_norm": 0.7604458928108215,
+      "learning_rate": 9.51593532626538e-08,
+      "loss": 1.120940089225769,
+      "step": 416
+    },
+    {
+      "epoch": 2.841025641025641,
+      "grad_norm": 0.750518262386322,
+      "learning_rate": 8.800130808372553e-08,
+      "loss": 1.0916835069656372,
+      "step": 417
+    },
+    {
+      "epoch": 2.847863247863248,
+      "grad_norm": 0.7595433592796326,
+      "learning_rate": 8.11207248201834e-08,
+      "loss": 1.1178152561187744,
+      "step": 418
+    },
+    {
+      "epoch": 2.8547008547008548,
+      "grad_norm": 0.7640005350112915,
+      "learning_rate": 7.45179921325162e-08,
+      "loss": 1.1630092859268188,
+      "step": 419
+    },
+    {
+      "epoch": 2.8615384615384616,
+      "grad_norm": 0.8447228074073792,
+      "learning_rate": 6.819348298638839e-08,
+      "loss": 1.1273298263549805,
+      "step": 420
+    },
+    {
+      "epoch": 2.8683760683760684,
+      "grad_norm": 0.7577494978904724,
+      "learning_rate": 6.214755463157417e-08,
+      "loss": 1.0993590354919434,
+      "step": 421
+    },
+    {
+      "epoch": 2.875213675213675,
+      "grad_norm": 0.7751004099845886,
+      "learning_rate": 5.638054858177644e-08,
+      "loss": 1.1498969793319702,
+      "step": 422
+    },
+    {
+      "epoch": 2.882051282051282,
+      "grad_norm": 0.7662968039512634,
+      "learning_rate": 5.089279059533658e-08,
+      "loss": 1.1176806688308716,
+      "step": 423
+    },
+    {
+      "epoch": 2.888888888888889,
+      "grad_norm": 0.7827076315879822,
+      "learning_rate": 4.568459065683206e-08,
+      "loss": 1.1449580192565918,
+      "step": 424
+    },
+    {
+      "epoch": 2.8957264957264957,
+      "grad_norm": 0.7646909952163696,
+      "learning_rate": 4.0756242959567596e-08,
+      "loss": 1.1186950206756592,
+      "step": 425
+    },
+    {
+      "epoch": 2.9025641025641025,
+      "grad_norm": 0.7541195154190063,
+      "learning_rate": 3.610802588895845e-08,
+      "loss": 1.131952166557312,
+      "step": 426
+    },
+    {
+      "epoch": 2.9094017094017093,
+      "grad_norm": 0.7776208519935608,
+      "learning_rate": 3.1740202006804166e-08,
+      "loss": 1.1178792715072632,
+      "step": 427
+    },
+    {
+      "epoch": 2.916239316239316,
+      "grad_norm": 0.7766209244728088,
+      "learning_rate": 2.765301803645426e-08,
+      "loss": 1.1331486701965332,
+      "step": 428
+    },
+    {
+      "epoch": 2.9230769230769234,
+      "grad_norm": 0.7666369676589966,
+      "learning_rate": 2.3846704848878298e-08,
+      "loss": 1.1589261293411255,
+      "step": 429
+    },
+    {
+      "epoch": 2.92991452991453,
+      "grad_norm": 0.7775545716285706,
+      "learning_rate": 2.0321477449619098e-08,
+      "loss": 1.1344677209854126,
+      "step": 430
+    },
+    {
+      "epoch": 2.936752136752137,
+      "grad_norm": 0.7537861466407776,
+      "learning_rate": 1.7077534966650767e-08,
+      "loss": 1.1040513515472412,
+      "step": 431
+    },
+    {
+      "epoch": 2.943589743589744,
+      "grad_norm": 0.7825785875320435,
+      "learning_rate": 1.411506063912882e-08,
+      "loss": 1.1581734418869019,
+      "step": 432
+    },
+    {
+      "epoch": 2.9504273504273506,
+      "grad_norm": 0.7491230368614197,
+      "learning_rate": 1.1434221807041234e-08,
+      "loss": 1.1113041639328003,
+      "step": 433
+    },
+    {
+      "epoch": 2.9572649572649574,
+      "grad_norm": 0.7601305842399597,
+      "learning_rate": 9.035169901754902e-09,
+      "loss": 1.0998278856277466,
+      "step": 434
+    },
+    {
+      "epoch": 2.9641025641025642,
+      "grad_norm": 0.7869414687156677,
+      "learning_rate": 6.918040437463025e-09,
+      "loss": 1.1475398540496826,
+      "step": 435
+    },
+    {
+      "epoch": 2.970940170940171,
+      "grad_norm": 0.760128915309906,
+      "learning_rate": 5.082953003528457e-09,
+      "loss": 1.1517993211746216,
+      "step": 436
+    },
+    {
+      "epoch": 2.977777777777778,
+      "grad_norm": 0.7626367211341858,
+      "learning_rate": 3.530011257730226e-09,
+      "loss": 1.1134616136550903,
+      "step": 437
+    },
+    {
+      "epoch": 2.9846153846153847,
+      "grad_norm": 0.765670657157898,
+      "learning_rate": 2.2593029204076578e-09,
+      "loss": 1.1342540979385376,
+      "step": 438
+    },
+    {
+      "epoch": 2.9914529914529915,
+      "grad_norm": 0.7739811539649963,
+      "learning_rate": 1.2708997695043412e-09,
+      "loss": 1.1077520847320557,
+      "step": 439
+    },
+    {
+      "epoch": 2.9982905982905983,
+      "grad_norm": 0.7707903385162354,
+      "learning_rate": 5.648576365169245e-10,
+      "loss": 1.0939933061599731,
+      "step": 440
+    },
+    {
+      "epoch": 2.9982905982905983,
+      "eval_loss": 1.3233778476715088,
+      "eval_runtime": 24.6851,
+      "eval_samples_per_second": 39.943,
+      "eval_steps_per_second": 5.023,
+      "step": 440
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 1.5993366241455078,
+      "learning_rate": 1.4121640333653042e-10,
+      "loss": 1.0642163753509521,
+      "step": 441
+    },
+    {
+      "epoch": 3.0,
+      "step": 441,
+      "total_flos": 9.743300044908134e+17,
+      "train_loss": 1.2459275746832088,
+      "train_runtime": 6646.3979,
+      "train_samples_per_second": 8.449,
+      "train_steps_per_second": 0.066
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 441,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 44,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.743300044908134e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}