From ef149dd320b40620d4164c6b8750027b8266ba64 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sun, 12 Apr 2026 08:59:55 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: BSC-LT/salamandra-2b-instruct_tools Source: Original Platform --- .gitattributes | 39 + README.md | 5 + added_tokens.json | 8 + config.json | 30 + configuration.json | 1 + generation_config.json | 6 + model.safetensors | 3 + special_tokens_map.json | 22 + tokenizer.json | 3 + tokenizer.model | 3 + tokenizer_config.json | 1152 +++++ trainer_state.json | 10175 ++++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 13 files changed, 11450 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 added_tokens.json create mode 100644 config.json create mode 100644 configuration.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer.model create mode 100644 tokenizer_config.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..6c4d79e --- /dev/null +++ b/.gitattributes @@ -0,0 +1,39 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +*.sagetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.model filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +model.safetensors filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..1b0969c --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +--- +license: apache-2.0 +library_name: transformers +pipeline_tag: text-generation +--- diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..b53198b --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,8 @@ +{ + "": 256004, + "": 256003, + "": 256002, + "": 256000, + "": 256005, + "": 256001 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000..499ad35 --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "/gpfs/projects/bsc88/text/models/instruction-tuning/models/base_models_with_special_tokens/restart_mix1_all_fineweb_2b_new_data_hf", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5440, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "num_key_value_heads": 16, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.44.0", + "use_cache": true, + "vocab_size": 256006 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..619b676 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "transformers_version": "4.44.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..4e1272d --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4defb08b7ba76f0d41d14cc88e351f300b96cbe2fc36ab5e6c8dfccea129971 +size 4507054896 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..28045d8 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,22 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..cadd5fa --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0287ada59bc90ef6622f02d21fed0dea9cffd1373702dfe96184cbcfff5254a2 +size 19093506 diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..3307f8d --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa490e57cebce5cb1a0a5b1a5d3fa4de05aee53dc3a44791f1c3401db44d802d +size 4813274 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..c4481bb --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,1152 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "5": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "6": { + "content": "<|reserved_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "7": { + "content": "<|reserved_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "8": { + "content": "<|reserved_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "9": { + "content": "<|reserved_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "10": { + "content": "<|reserved_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "11": { + "content": "<|reserved_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "12": { + "content": "<|reserved_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "13": { + "content": "<|reserved_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "14": { + "content": "<|reserved_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "15": { + "content": "<|reserved_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "16": { + "content": "<|reserved_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "17": { + "content": "<|reserved_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "18": { + "content": "<|reserved_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "19": { + "content": "<|reserved_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "20": { + "content": "<|reserved_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "21": { + "content": "<|reserved_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "22": { + "content": "<|reserved_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "23": { + "content": "<|reserved_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "24": { + "content": "<|reserved_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "25": { + "content": "<|reserved_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "26": { + "content": "<|reserved_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "27": { + "content": "<|reserved_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "28": { + "content": "<|reserved_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "29": { + "content": "<|reserved_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "30": { + "content": "<|reserved_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "31": { + "content": "<|reserved_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32": { + "content": "<|reserved_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "33": { + "content": "<|reserved_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "34": { + "content": "<|reserved_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "35": { + "content": "<|reserved_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "36": { + "content": "<|reserved_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "37": { + "content": "<|reserved_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "38": { + "content": "<|reserved_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "39": { + "content": "<|reserved_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "40": { + "content": "<|reserved_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "41": { + "content": "<|reserved_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "42": { + "content": "<|reserved_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "43": { + "content": "<|reserved_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "44": { + "content": "<|reserved_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "45": { + "content": "<|reserved_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "46": { + "content": "<|reserved_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "47": { + "content": "<|reserved_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "48": { + "content": "<|reserved_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49": { + "content": "<|reserved_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "50": { + "content": "<|reserved_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "51": { + "content": "<|reserved_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "52": { + "content": "<|reserved_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "53": { + "content": "<|reserved_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "54": { + "content": "<|reserved_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "55": { + "content": "<|reserved_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "56": { + "content": "<|reserved_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "57": { + "content": "<|reserved_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "58": { + "content": "<|reserved_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "59": { + "content": "<|reserved_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "60": { + "content": "<|reserved_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "61": { + "content": "<|reserved_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "62": { + "content": "<|reserved_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "63": { + "content": "<|reserved_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64": { + "content": "<|reserved_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "65": { + "content": "<|reserved_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "66": { + "content": "<|reserved_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "67": { + "content": "<|reserved_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "68": { + "content": "<|reserved_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "69": { + "content": "<|reserved_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "70": { + "content": "<|reserved_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "71": { + "content": "<|reserved_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "72": { + "content": "<|reserved_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "73": { + "content": "<|reserved_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "74": { + "content": "<|reserved_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "75": { + "content": "<|reserved_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "76": { + "content": "<|reserved_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "77": { + "content": "<|reserved_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "78": { + "content": "<|reserved_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "79": { + "content": "<|reserved_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "80": { + "content": "<|reserved_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "81": { + "content": "<|reserved_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "82": { + "content": "<|reserved_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "83": { + "content": "<|reserved_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "84": { + "content": "<|reserved_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "85": { + "content": "<|reserved_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "86": { + "content": "<|reserved_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "87": { + "content": "<|reserved_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "88": { + "content": "<|reserved_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "89": { + "content": "<|reserved_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "90": { + "content": "<|reserved_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "91": { + "content": "<|reserved_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92": { + "content": "<|reserved_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "93": { + "content": "<|reserved_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "94": { + "content": "<|reserved_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "95": { + "content": "<|reserved_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "96": { + "content": "<|reserved_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "97": { + "content": "<|reserved_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "98": { + "content": "<|reserved_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "99": { + "content": "<|reserved_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "100": { + "content": "<|reserved_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "101": { + "content": "<|reserved_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "102": { + "content": "<|reserved_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "103": { + "content": "<|reserved_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "104": { + "content": "\\r", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "256001": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "256002": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "256003": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "256004": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + }, + "256005": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": "", + "chat_template": "{%- set tools = tools if tools is defined else None -%}\n{%- set date_string = date_string if date_string is defined else \"1 Sep 2024\" -%}\n\n{%- set system_message = messages[0].content if messages[0].role == \"system\" else \"\" -%}\n{%- if messages[0].role == \"system\" -%}\n {%- set messages = messages[1:] -%}\n{%- endif -%}\n\n{%- if not tool_prompt -%}\n {%- set tool_prompt = \"For each function call return a json object with function name and arguments within tags with the following schema:\n\n{\\\"name\\\": , \\\"arguments\\\": }\n\" -%}\n{%- endif -%}\n\n{%- if system_message or tools -%}\n {{- '<|im_start|>system\n'}}\n{%- endif -%}\n\n{%- if system_message %}\n {{- system_message + \"\n\"}}\n{%- endif -%}\n\n{%- if tools -%}\n {{- \"You are a function-calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.\n\" }}\n {{- \"\n\" }}\n {{- tools }}\n {{- \"\n\n\" }}\n {{- tool_prompt -}}\n{%- endif -%}\n\n{%- if system_message or tools -%}\n {{- '<|im_end|>\n'}}\n{%- endif -%}\n\n{# Main message loop #}\n{%- for message in messages -%}\n {%- if message.role == \"user\" or message.role == \"assistant\" or message.role == \"tool\" -%}\n {%- if loop.first and message.role != \"user\" -%}\n {{ raise_exception(\"Invalid sequence: The first message role must be 'user' after 'system' if provided .\") }}\n {%- endif -%}\n\n {%- if not loop.first and message.role in [\"user\", \"assistant\"] and message.role == loop.previtem.role -%}\n {{ raise_exception(\"Invalid sequence: Consecutive messages cannot have the same role ('user' or 'assistant').\") }}\n {%- endif -%}\n\n {%- if message.role == \"user\" and not loop.first and loop.previtem.role != \"assistant\" -%}\n {{ raise_exception(\"Invalid sequence: A 'user' message must be preceded by an 'assistant' message.\") }}\n {%- endif -%}\n\n {%- if message.role == \"tool\" and not loop.first and loop.previtem.role not in [\"assistant\", \"tool\"] -%}\n {{ raise_exception(\"Invalid sequence: A 'tool' message must be preceded by 'assistant' or 'tool'.\") }}\n {%- endif -%}\n {%- else -%}\n {{- raise_exception(\"Invalid role detected: only 'user', 'assistant', or 'tool' roles are accepted.\") }}\n {%- endif -%}\n {%- if message.role == \"user\" or (message.role == \"assistant\" and message.tool_calls is not defined) -%}\n {{- '<|im_start|>' + message.role + '\n' + message.content | trim + '<|im_end|>\n'}}\n {%- elif message.role == \"assistant\" -%}\n {{- '<|im_start|>' + message.role }}\n {%- for tool_call in message.tool_calls -%}\n {{ '\n\n' }}\n {%- if tool_call.function -%}\n {\"name\": \"{{ tool_call.function.name }}\", \"arguments\": {{ tool_call.function.arguments | tojson }} }\n {%- else -%}\n {\"name\": \"{{ tool_call.name }}\", \"arguments\": {{ tool_call.arguments | tojson }} }\n {%- endif -%}\n {{ '\n' }}\n {%- endfor -%}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" -%}\n {%- if loop.previtem and loop.previtem.role != \"tool\" -%}\n {{- '<|im_start|>tool\n' }}\n {%- endif -%}\n {{- '\n' }} \n {{- message.content }}\n {{- '\n\n' }}\n {%- if loop.last or loop.nextitem.role != \"tool\" -%}\n {{- '<|im_end|>\n'}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n\n{# Prompt for assistant generation if needed #}\n{%- if add_generation_prompt -%}\n {{- '<|im_start|>assistant\n' }}\n{%- endif -%}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "legacy": true, + "model_max_length": 8192, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..9be42f6 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,10175 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999066554653225, + "eval_steps": 14, + "global_step": 1339, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000746756277419957, + "grad_norm": 39.864603672245025, + "learning_rate": 2.439024390243903e-07, + "loss": 1.872, + "step": 1 + }, + { + "epoch": 0.001493512554839914, + "grad_norm": 25.154230105266365, + "learning_rate": 4.878048780487805e-07, + "loss": 1.9997, + "step": 2 + }, + { + "epoch": 0.0022402688322598714, + "grad_norm": 39.12364037859748, + "learning_rate": 7.317073170731707e-07, + "loss": 1.9296, + "step": 3 + }, + { + "epoch": 0.002987025109679828, + "grad_norm": 33.83330023997382, + "learning_rate": 9.75609756097561e-07, + "loss": 1.9772, + "step": 4 + }, + { + "epoch": 0.003733781387099785, + "grad_norm": 21.94955478058591, + "learning_rate": 1.2195121951219514e-06, + "loss": 1.87, + "step": 5 + }, + { + "epoch": 0.004480537664519743, + "grad_norm": 20.905610860537625, + "learning_rate": 1.4634146341463414e-06, + "loss": 1.7998, + "step": 6 + }, + { + "epoch": 0.005227293941939699, + "grad_norm": 22.006935536653316, + "learning_rate": 1.707317073170732e-06, + "loss": 1.8928, + "step": 7 + }, + { + "epoch": 0.005974050219359656, + "grad_norm": 19.70076963746501, + "learning_rate": 1.951219512195122e-06, + "loss": 1.9504, + "step": 8 + }, + { + "epoch": 0.006720806496779613, + "grad_norm": 22.22349502935227, + "learning_rate": 2.1951219512195125e-06, + "loss": 2.1065, + "step": 9 + }, + { + "epoch": 0.00746756277419957, + "grad_norm": 153.22088865153165, + "learning_rate": 2.4390243902439027e-06, + "loss": 1.8163, + "step": 10 + }, + { + "epoch": 0.008214319051619528, + "grad_norm": 12.758400252716752, + "learning_rate": 2.682926829268293e-06, + "loss": 1.7599, + "step": 11 + }, + { + "epoch": 0.008961075329039486, + "grad_norm": 17.03707793518957, + "learning_rate": 2.926829268292683e-06, + "loss": 1.8174, + "step": 12 + }, + { + "epoch": 0.009707831606459442, + "grad_norm": 12.542896293073882, + "learning_rate": 3.1707317073170736e-06, + "loss": 1.6822, + "step": 13 + }, + { + "epoch": 0.010454587883879398, + "grad_norm": 16.440723028682665, + "learning_rate": 3.414634146341464e-06, + "loss": 1.7731, + "step": 14 + }, + { + "epoch": 0.010454587883879398, + "eval_loss": 1.4025799036026, + "eval_runtime": 179.1157, + "eval_samples_per_second": 100.656, + "eval_steps_per_second": 1.574, + "step": 14 + }, + { + "epoch": 0.011201344161299356, + "grad_norm": 17.64290219497995, + "learning_rate": 3.6585365853658537e-06, + "loss": 1.7477, + "step": 15 + }, + { + "epoch": 0.011948100438719312, + "grad_norm": 14.176146937463686, + "learning_rate": 3.902439024390244e-06, + "loss": 1.6269, + "step": 16 + }, + { + "epoch": 0.01269485671613927, + "grad_norm": 10.32131977607697, + "learning_rate": 4.146341463414634e-06, + "loss": 1.5132, + "step": 17 + }, + { + "epoch": 0.013441612993559227, + "grad_norm": 11.77667555622052, + "learning_rate": 4.390243902439025e-06, + "loss": 1.6, + "step": 18 + }, + { + "epoch": 0.014188369270979185, + "grad_norm": 11.65934333405466, + "learning_rate": 4.634146341463416e-06, + "loss": 1.5687, + "step": 19 + }, + { + "epoch": 0.01493512554839914, + "grad_norm": 14.71495813797744, + "learning_rate": 4.8780487804878055e-06, + "loss": 1.4833, + "step": 20 + }, + { + "epoch": 0.0156818818258191, + "grad_norm": 19.563751131877677, + "learning_rate": 5.121951219512195e-06, + "loss": 1.5502, + "step": 21 + }, + { + "epoch": 0.016428638103239055, + "grad_norm": 9.697052484002269, + "learning_rate": 5.365853658536586e-06, + "loss": 1.499, + "step": 22 + }, + { + "epoch": 0.01717539438065901, + "grad_norm": 7.808180192853328, + "learning_rate": 5.609756097560977e-06, + "loss": 1.4112, + "step": 23 + }, + { + "epoch": 0.01792215065807897, + "grad_norm": 7.808693680801286, + "learning_rate": 5.853658536585366e-06, + "loss": 1.379, + "step": 24 + }, + { + "epoch": 0.018668906935498927, + "grad_norm": 10.205161557594558, + "learning_rate": 6.0975609756097564e-06, + "loss": 1.5749, + "step": 25 + }, + { + "epoch": 0.019415663212918884, + "grad_norm": 10.082022554511203, + "learning_rate": 6.341463414634147e-06, + "loss": 1.5026, + "step": 26 + }, + { + "epoch": 0.02016241949033884, + "grad_norm": 7.5938534821295285, + "learning_rate": 6.585365853658538e-06, + "loss": 1.4385, + "step": 27 + }, + { + "epoch": 0.020909175767758796, + "grad_norm": 8.080362258830748, + "learning_rate": 6.829268292682928e-06, + "loss": 1.368, + "step": 28 + }, + { + "epoch": 0.020909175767758796, + "eval_loss": 1.2178118228912354, + "eval_runtime": 160.607, + "eval_samples_per_second": 112.255, + "eval_steps_per_second": 1.756, + "step": 28 + }, + { + "epoch": 0.021655932045178756, + "grad_norm": 6.42748880141974, + "learning_rate": 7.0731707317073175e-06, + "loss": 1.373, + "step": 29 + }, + { + "epoch": 0.022402688322598712, + "grad_norm": 11.993286705868165, + "learning_rate": 7.317073170731707e-06, + "loss": 1.2981, + "step": 30 + }, + { + "epoch": 0.02314944460001867, + "grad_norm": 9.70428655700436, + "learning_rate": 7.560975609756098e-06, + "loss": 1.3723, + "step": 31 + }, + { + "epoch": 0.023896200877438625, + "grad_norm": 7.543123356855298, + "learning_rate": 7.804878048780489e-06, + "loss": 1.3837, + "step": 32 + }, + { + "epoch": 0.024642957154858584, + "grad_norm": 6.3551480225016865, + "learning_rate": 8.048780487804879e-06, + "loss": 1.3434, + "step": 33 + }, + { + "epoch": 0.02538971343227854, + "grad_norm": 6.170362178766895, + "learning_rate": 8.292682926829268e-06, + "loss": 1.2914, + "step": 34 + }, + { + "epoch": 0.026136469709698497, + "grad_norm": 7.579209609335756, + "learning_rate": 8.536585365853658e-06, + "loss": 1.4315, + "step": 35 + }, + { + "epoch": 0.026883225987118453, + "grad_norm": 8.161772733498161, + "learning_rate": 8.78048780487805e-06, + "loss": 1.4156, + "step": 36 + }, + { + "epoch": 0.027629982264538413, + "grad_norm": 9.153242946428481, + "learning_rate": 9.02439024390244e-06, + "loss": 1.3437, + "step": 37 + }, + { + "epoch": 0.02837673854195837, + "grad_norm": 8.270074388479511, + "learning_rate": 9.268292682926831e-06, + "loss": 1.3882, + "step": 38 + }, + { + "epoch": 0.029123494819378325, + "grad_norm": 5.810086051514348, + "learning_rate": 9.51219512195122e-06, + "loss": 1.2423, + "step": 39 + }, + { + "epoch": 0.02987025109679828, + "grad_norm": 5.940688408073622, + "learning_rate": 9.756097560975611e-06, + "loss": 1.3012, + "step": 40 + }, + { + "epoch": 0.030617007374218238, + "grad_norm": 7.446040839269437, + "learning_rate": 1e-05, + "loss": 1.3854, + "step": 41 + }, + { + "epoch": 0.0313637636516382, + "grad_norm": 7.823717873679139, + "learning_rate": 9.999985354973661e-06, + "loss": 1.2707, + "step": 42 + }, + { + "epoch": 0.0313637636516382, + "eval_loss": 1.1517595052719116, + "eval_runtime": 160.6701, + "eval_samples_per_second": 112.211, + "eval_steps_per_second": 1.755, + "step": 42 + }, + { + "epoch": 0.032110519929058154, + "grad_norm": 5.646313932886119, + "learning_rate": 9.999941419980432e-06, + "loss": 1.2381, + "step": 43 + }, + { + "epoch": 0.03285727620647811, + "grad_norm": 6.520800231029172, + "learning_rate": 9.999868195277684e-06, + "loss": 1.2704, + "step": 44 + }, + { + "epoch": 0.033604032483898066, + "grad_norm": 5.235413753926133, + "learning_rate": 9.999765681294371e-06, + "loss": 1.2493, + "step": 45 + }, + { + "epoch": 0.03435078876131802, + "grad_norm": 8.080884030634016, + "learning_rate": 9.99963387863102e-06, + "loss": 1.2873, + "step": 46 + }, + { + "epoch": 0.03509754503873798, + "grad_norm": 7.188542456253142, + "learning_rate": 9.999472788059732e-06, + "loss": 1.3258, + "step": 47 + }, + { + "epoch": 0.03584430131615794, + "grad_norm": 5.530317330717678, + "learning_rate": 9.999282410524176e-06, + "loss": 1.2104, + "step": 48 + }, + { + "epoch": 0.0365910575935779, + "grad_norm": 5.429267063162806, + "learning_rate": 9.999062747139587e-06, + "loss": 1.2585, + "step": 49 + }, + { + "epoch": 0.037337813870997855, + "grad_norm": 6.8607568553163425, + "learning_rate": 9.998813799192756e-06, + "loss": 1.2217, + "step": 50 + }, + { + "epoch": 0.03808457014841781, + "grad_norm": 5.864542890549609, + "learning_rate": 9.99853556814202e-06, + "loss": 1.1748, + "step": 51 + }, + { + "epoch": 0.03883132642583777, + "grad_norm": 6.776573554690618, + "learning_rate": 9.998228055617264e-06, + "loss": 1.2681, + "step": 52 + }, + { + "epoch": 0.03957808270325772, + "grad_norm": 5.240453608787099, + "learning_rate": 9.997891263419896e-06, + "loss": 1.2825, + "step": 53 + }, + { + "epoch": 0.04032483898067768, + "grad_norm": 5.502379192462283, + "learning_rate": 9.997525193522848e-06, + "loss": 1.1202, + "step": 54 + }, + { + "epoch": 0.041071595258097636, + "grad_norm": 4.981330215088245, + "learning_rate": 9.997129848070563e-06, + "loss": 1.237, + "step": 55 + }, + { + "epoch": 0.04181835153551759, + "grad_norm": 5.658491502222876, + "learning_rate": 9.99670522937898e-06, + "loss": 1.1674, + "step": 56 + }, + { + "epoch": 0.04181835153551759, + "eval_loss": 1.1058392524719238, + "eval_runtime": 160.6229, + "eval_samples_per_second": 112.244, + "eval_steps_per_second": 1.756, + "step": 56 + }, + { + "epoch": 0.042565107812937555, + "grad_norm": 5.781043637294228, + "learning_rate": 9.996251339935517e-06, + "loss": 1.2731, + "step": 57 + }, + { + "epoch": 0.04331186409035751, + "grad_norm": 6.798923459655398, + "learning_rate": 9.995768182399063e-06, + "loss": 1.2602, + "step": 58 + }, + { + "epoch": 0.04405862036777747, + "grad_norm": 8.183844154095041, + "learning_rate": 9.995255759599963e-06, + "loss": 1.2871, + "step": 59 + }, + { + "epoch": 0.044805376645197424, + "grad_norm": 5.8833180570242805, + "learning_rate": 9.994714074539991e-06, + "loss": 1.2261, + "step": 60 + }, + { + "epoch": 0.04555213292261738, + "grad_norm": 5.9510640282981395, + "learning_rate": 9.99414313039235e-06, + "loss": 1.1705, + "step": 61 + }, + { + "epoch": 0.04629888920003734, + "grad_norm": 4.960391929203876, + "learning_rate": 9.993542930501629e-06, + "loss": 1.1892, + "step": 62 + }, + { + "epoch": 0.04704564547745729, + "grad_norm": 6.082371592489799, + "learning_rate": 9.99291347838381e-06, + "loss": 1.2665, + "step": 63 + }, + { + "epoch": 0.04779240175487725, + "grad_norm": 5.928756573070347, + "learning_rate": 9.992254777726231e-06, + "loss": 1.2466, + "step": 64 + }, + { + "epoch": 0.04853915803229721, + "grad_norm": 5.784351587819939, + "learning_rate": 9.991566832387564e-06, + "loss": 1.1332, + "step": 65 + }, + { + "epoch": 0.04928591430971717, + "grad_norm": 5.309638624254364, + "learning_rate": 9.990849646397803e-06, + "loss": 1.0934, + "step": 66 + }, + { + "epoch": 0.050032670587137125, + "grad_norm": 5.282903270237203, + "learning_rate": 9.99010322395823e-06, + "loss": 1.1775, + "step": 67 + }, + { + "epoch": 0.05077942686455708, + "grad_norm": 7.505150676696945, + "learning_rate": 9.989327569441395e-06, + "loss": 1.1281, + "step": 68 + }, + { + "epoch": 0.05152618314197704, + "grad_norm": 5.111300188914756, + "learning_rate": 9.988522687391092e-06, + "loss": 1.0796, + "step": 69 + }, + { + "epoch": 0.052272939419396994, + "grad_norm": 5.220282812165929, + "learning_rate": 9.987688582522325e-06, + "loss": 1.1937, + "step": 70 + }, + { + "epoch": 0.052272939419396994, + "eval_loss": 1.0714255571365356, + "eval_runtime": 160.6215, + "eval_samples_per_second": 112.245, + "eval_steps_per_second": 1.756, + "step": 70 + }, + { + "epoch": 0.05301969569681695, + "grad_norm": 6.120957463564528, + "learning_rate": 9.986825259721292e-06, + "loss": 1.0982, + "step": 71 + }, + { + "epoch": 0.053766451974236906, + "grad_norm": 4.980966853682578, + "learning_rate": 9.985932724045347e-06, + "loss": 1.1755, + "step": 72 + }, + { + "epoch": 0.05451320825165686, + "grad_norm": 4.331150227314127, + "learning_rate": 9.985010980722974e-06, + "loss": 1.103, + "step": 73 + }, + { + "epoch": 0.055259964529076826, + "grad_norm": 5.124677563712541, + "learning_rate": 9.984060035153752e-06, + "loss": 1.1304, + "step": 74 + }, + { + "epoch": 0.05600672080649678, + "grad_norm": 5.5037953572716685, + "learning_rate": 9.983079892908332e-06, + "loss": 1.0897, + "step": 75 + }, + { + "epoch": 0.05675347708391674, + "grad_norm": 6.6795743519930415, + "learning_rate": 9.982070559728398e-06, + "loss": 1.2371, + "step": 76 + }, + { + "epoch": 0.057500233361336694, + "grad_norm": 5.052137551270288, + "learning_rate": 9.981032041526635e-06, + "loss": 1.124, + "step": 77 + }, + { + "epoch": 0.05824698963875665, + "grad_norm": 4.687977226171484, + "learning_rate": 9.979964344386692e-06, + "loss": 1.151, + "step": 78 + }, + { + "epoch": 0.05899374591617661, + "grad_norm": 4.4520211232435125, + "learning_rate": 9.978867474563151e-06, + "loss": 1.1519, + "step": 79 + }, + { + "epoch": 0.05974050219359656, + "grad_norm": 3.8288574242884774, + "learning_rate": 9.977741438481487e-06, + "loss": 1.0624, + "step": 80 + }, + { + "epoch": 0.06048725847101652, + "grad_norm": 4.534421709411, + "learning_rate": 9.976586242738032e-06, + "loss": 1.1844, + "step": 81 + }, + { + "epoch": 0.061234014748436476, + "grad_norm": 3.884735997829129, + "learning_rate": 9.975401894099933e-06, + "loss": 1.1259, + "step": 82 + }, + { + "epoch": 0.06198077102585644, + "grad_norm": 5.654542687957219, + "learning_rate": 9.974188399505116e-06, + "loss": 1.0949, + "step": 83 + }, + { + "epoch": 0.0627275273032764, + "grad_norm": 11.863718915842355, + "learning_rate": 9.972945766062248e-06, + "loss": 1.1209, + "step": 84 + }, + { + "epoch": 0.0627275273032764, + "eval_loss": 1.0480048656463623, + "eval_runtime": 160.6234, + "eval_samples_per_second": 112.244, + "eval_steps_per_second": 1.756, + "step": 84 + }, + { + "epoch": 0.06347428358069634, + "grad_norm": 5.672730835621609, + "learning_rate": 9.971674001050687e-06, + "loss": 1.0811, + "step": 85 + }, + { + "epoch": 0.06422103985811631, + "grad_norm": 4.58816995647344, + "learning_rate": 9.970373111920447e-06, + "loss": 1.0538, + "step": 86 + }, + { + "epoch": 0.06496779613553627, + "grad_norm": 3.901498541644042, + "learning_rate": 9.969043106292149e-06, + "loss": 1.1502, + "step": 87 + }, + { + "epoch": 0.06571455241295622, + "grad_norm": 4.564513799166725, + "learning_rate": 9.96768399195698e-06, + "loss": 1.0951, + "step": 88 + }, + { + "epoch": 0.06646130869037618, + "grad_norm": 5.186730910602631, + "learning_rate": 9.966295776876648e-06, + "loss": 1.1669, + "step": 89 + }, + { + "epoch": 0.06720806496779613, + "grad_norm": 4.925917142678016, + "learning_rate": 9.96487846918333e-06, + "loss": 1.1522, + "step": 90 + }, + { + "epoch": 0.0679548212452161, + "grad_norm": 6.5957881769118405, + "learning_rate": 9.963432077179629e-06, + "loss": 1.1221, + "step": 91 + }, + { + "epoch": 0.06870157752263605, + "grad_norm": 4.099086784960917, + "learning_rate": 9.961956609338526e-06, + "loss": 1.1186, + "step": 92 + }, + { + "epoch": 0.06944833380005601, + "grad_norm": 3.70039872123764, + "learning_rate": 9.960452074303327e-06, + "loss": 1.1268, + "step": 93 + }, + { + "epoch": 0.07019509007747596, + "grad_norm": 4.785660260223015, + "learning_rate": 9.958918480887612e-06, + "loss": 1.11, + "step": 94 + }, + { + "epoch": 0.07094184635489592, + "grad_norm": 3.968060355634415, + "learning_rate": 9.957355838075188e-06, + "loss": 1.1706, + "step": 95 + }, + { + "epoch": 0.07168860263231588, + "grad_norm": 3.793655960096394, + "learning_rate": 9.955764155020037e-06, + "loss": 1.0165, + "step": 96 + }, + { + "epoch": 0.07243535890973583, + "grad_norm": 4.377595345492589, + "learning_rate": 9.95414344104625e-06, + "loss": 1.1269, + "step": 97 + }, + { + "epoch": 0.0731821151871558, + "grad_norm": 3.7820990485608044, + "learning_rate": 9.952493705647989e-06, + "loss": 1.1133, + "step": 98 + }, + { + "epoch": 0.0731821151871558, + "eval_loss": 1.0365121364593506, + "eval_runtime": 160.5554, + "eval_samples_per_second": 112.291, + "eval_steps_per_second": 1.756, + "step": 98 + }, + { + "epoch": 0.07392887146457575, + "grad_norm": 5.305536373865056, + "learning_rate": 9.950814958489421e-06, + "loss": 1.1091, + "step": 99 + }, + { + "epoch": 0.07467562774199571, + "grad_norm": 4.4015698622632895, + "learning_rate": 9.949107209404664e-06, + "loss": 1.1791, + "step": 100 + }, + { + "epoch": 0.07542238401941566, + "grad_norm": 4.975666896127531, + "learning_rate": 9.947370468397731e-06, + "loss": 1.1585, + "step": 101 + }, + { + "epoch": 0.07616914029683562, + "grad_norm": 3.7736499179128167, + "learning_rate": 9.94560474564247e-06, + "loss": 1.038, + "step": 102 + }, + { + "epoch": 0.07691589657425557, + "grad_norm": 4.240252244792403, + "learning_rate": 9.9438100514825e-06, + "loss": 1.0741, + "step": 103 + }, + { + "epoch": 0.07766265285167553, + "grad_norm": 6.410869819768315, + "learning_rate": 9.941986396431161e-06, + "loss": 1.1008, + "step": 104 + }, + { + "epoch": 0.0784094091290955, + "grad_norm": 3.9651365430008463, + "learning_rate": 9.940133791171445e-06, + "loss": 1.0818, + "step": 105 + }, + { + "epoch": 0.07915616540651545, + "grad_norm": 4.449129793463904, + "learning_rate": 9.938252246555929e-06, + "loss": 1.1769, + "step": 106 + }, + { + "epoch": 0.07990292168393541, + "grad_norm": 4.552135217930411, + "learning_rate": 9.936341773606723e-06, + "loss": 1.0777, + "step": 107 + }, + { + "epoch": 0.08064967796135536, + "grad_norm": 4.190554265440604, + "learning_rate": 9.9344023835154e-06, + "loss": 1.054, + "step": 108 + }, + { + "epoch": 0.08139643423877532, + "grad_norm": 4.16264776639165, + "learning_rate": 9.932434087642924e-06, + "loss": 1.0736, + "step": 109 + }, + { + "epoch": 0.08214319051619527, + "grad_norm": 5.68044977176061, + "learning_rate": 9.930436897519595e-06, + "loss": 1.0091, + "step": 110 + }, + { + "epoch": 0.08288994679361524, + "grad_norm": 3.0998166629146042, + "learning_rate": 9.928410824844974e-06, + "loss": 1.0378, + "step": 111 + }, + { + "epoch": 0.08363670307103518, + "grad_norm": 4.842084164752693, + "learning_rate": 9.926355881487815e-06, + "loss": 1.1629, + "step": 112 + }, + { + "epoch": 0.08363670307103518, + "eval_loss": 1.032652735710144, + "eval_runtime": 160.5893, + "eval_samples_per_second": 112.268, + "eval_steps_per_second": 1.756, + "step": 112 + }, + { + "epoch": 0.08438345934845515, + "grad_norm": 3.7500012310156094, + "learning_rate": 9.924272079485996e-06, + "loss": 1.1139, + "step": 113 + }, + { + "epoch": 0.08513021562587511, + "grad_norm": 3.6304218738612177, + "learning_rate": 9.922159431046457e-06, + "loss": 1.0562, + "step": 114 + }, + { + "epoch": 0.08587697190329506, + "grad_norm": 4.219419886285629, + "learning_rate": 9.920017948545109e-06, + "loss": 1.0334, + "step": 115 + }, + { + "epoch": 0.08662372818071502, + "grad_norm": 4.106591390142076, + "learning_rate": 9.91784764452678e-06, + "loss": 1.0092, + "step": 116 + }, + { + "epoch": 0.08737048445813497, + "grad_norm": 3.2149890199105893, + "learning_rate": 9.91564853170514e-06, + "loss": 1.0324, + "step": 117 + }, + { + "epoch": 0.08811724073555494, + "grad_norm": 4.303949996240386, + "learning_rate": 9.913420622962606e-06, + "loss": 1.0643, + "step": 118 + }, + { + "epoch": 0.08886399701297489, + "grad_norm": 3.475558336468595, + "learning_rate": 9.911163931350296e-06, + "loss": 1.0757, + "step": 119 + }, + { + "epoch": 0.08961075329039485, + "grad_norm": 3.812619171039642, + "learning_rate": 9.908878470087931e-06, + "loss": 1.1746, + "step": 120 + }, + { + "epoch": 0.0903575095678148, + "grad_norm": 3.990770542904442, + "learning_rate": 9.906564252563769e-06, + "loss": 1.0596, + "step": 121 + }, + { + "epoch": 0.09110426584523476, + "grad_norm": 3.8102423981670337, + "learning_rate": 9.904221292334521e-06, + "loss": 1.0186, + "step": 122 + }, + { + "epoch": 0.09185102212265472, + "grad_norm": 3.766965024133607, + "learning_rate": 9.901849603125271e-06, + "loss": 1.1101, + "step": 123 + }, + { + "epoch": 0.09259777840007467, + "grad_norm": 3.3405241849704788, + "learning_rate": 9.8994491988294e-06, + "loss": 0.9937, + "step": 124 + }, + { + "epoch": 0.09334453467749464, + "grad_norm": 3.9927196426527383, + "learning_rate": 9.897020093508502e-06, + "loss": 1.0596, + "step": 125 + }, + { + "epoch": 0.09409129095491459, + "grad_norm": 3.2836748843642254, + "learning_rate": 9.894562301392301e-06, + "loss": 1.0812, + "step": 126 + }, + { + "epoch": 0.09409129095491459, + "eval_loss": 1.0256747007369995, + "eval_runtime": 160.6424, + "eval_samples_per_second": 112.231, + "eval_steps_per_second": 1.755, + "step": 126 + }, + { + "epoch": 0.09483804723233455, + "grad_norm": 4.363380401494807, + "learning_rate": 9.89207583687857e-06, + "loss": 1.0794, + "step": 127 + }, + { + "epoch": 0.0955848035097545, + "grad_norm": 6.478735079038311, + "learning_rate": 9.889560714533043e-06, + "loss": 1.066, + "step": 128 + }, + { + "epoch": 0.09633155978717446, + "grad_norm": 3.5425697986871705, + "learning_rate": 9.887016949089334e-06, + "loss": 1.0932, + "step": 129 + }, + { + "epoch": 0.09707831606459442, + "grad_norm": 3.693076593064551, + "learning_rate": 9.884444555448848e-06, + "loss": 1.0519, + "step": 130 + }, + { + "epoch": 0.09782507234201437, + "grad_norm": 7.860038890324692, + "learning_rate": 9.881843548680694e-06, + "loss": 1.0848, + "step": 131 + }, + { + "epoch": 0.09857182861943434, + "grad_norm": 7.4142740851339335, + "learning_rate": 9.879213944021597e-06, + "loss": 1.1747, + "step": 132 + }, + { + "epoch": 0.09931858489685429, + "grad_norm": 3.3316594853280947, + "learning_rate": 9.876555756875807e-06, + "loss": 0.9748, + "step": 133 + }, + { + "epoch": 0.10006534117427425, + "grad_norm": 3.78741230296177, + "learning_rate": 9.873869002815015e-06, + "loss": 1.0438, + "step": 134 + }, + { + "epoch": 0.1008120974516942, + "grad_norm": 4.609277603093571, + "learning_rate": 9.871153697578254e-06, + "loss": 0.9756, + "step": 135 + }, + { + "epoch": 0.10155885372911416, + "grad_norm": 4.188743485193134, + "learning_rate": 9.86840985707181e-06, + "loss": 1.0559, + "step": 136 + }, + { + "epoch": 0.10230561000653411, + "grad_norm": 3.428561895633466, + "learning_rate": 9.86563749736913e-06, + "loss": 1.0102, + "step": 137 + }, + { + "epoch": 0.10305236628395407, + "grad_norm": 6.01857672692751, + "learning_rate": 9.862836634710724e-06, + "loss": 1.0943, + "step": 138 + }, + { + "epoch": 0.10379912256137404, + "grad_norm": 4.3281257428742785, + "learning_rate": 9.860007285504079e-06, + "loss": 1.1187, + "step": 139 + }, + { + "epoch": 0.10454587883879399, + "grad_norm": 3.7218504298177675, + "learning_rate": 9.85714946632355e-06, + "loss": 1.0917, + "step": 140 + }, + { + "epoch": 0.10454587883879399, + "eval_loss": 1.023650884628296, + "eval_runtime": 160.5345, + "eval_samples_per_second": 112.306, + "eval_steps_per_second": 1.757, + "step": 140 + }, + { + "epoch": 0.10529263511621395, + "grad_norm": 3.4489168662027176, + "learning_rate": 9.854263193910274e-06, + "loss": 1.1063, + "step": 141 + }, + { + "epoch": 0.1060393913936339, + "grad_norm": 4.423623706010156, + "learning_rate": 9.85134848517206e-06, + "loss": 1.0728, + "step": 142 + }, + { + "epoch": 0.10678614767105386, + "grad_norm": 2.8649958611873076, + "learning_rate": 9.84840535718331e-06, + "loss": 1.0078, + "step": 143 + }, + { + "epoch": 0.10753290394847381, + "grad_norm": 4.338171037781992, + "learning_rate": 9.845433827184894e-06, + "loss": 1.0381, + "step": 144 + }, + { + "epoch": 0.10827966022589378, + "grad_norm": 5.171977346386107, + "learning_rate": 9.842433912584066e-06, + "loss": 1.1569, + "step": 145 + }, + { + "epoch": 0.10902641650331373, + "grad_norm": 3.3107619410438662, + "learning_rate": 9.839405630954358e-06, + "loss": 1.086, + "step": 146 + }, + { + "epoch": 0.10977317278073369, + "grad_norm": 2.840297473307081, + "learning_rate": 9.836349000035477e-06, + "loss": 1.0401, + "step": 147 + }, + { + "epoch": 0.11051992905815365, + "grad_norm": 5.3601003369045594, + "learning_rate": 9.833264037733198e-06, + "loss": 1.1039, + "step": 148 + }, + { + "epoch": 0.1112666853355736, + "grad_norm": 4.4022528506550955, + "learning_rate": 9.83015076211926e-06, + "loss": 1.0417, + "step": 149 + }, + { + "epoch": 0.11201344161299356, + "grad_norm": 3.547705426654357, + "learning_rate": 9.827009191431271e-06, + "loss": 0.9775, + "step": 150 + }, + { + "epoch": 0.11276019789041351, + "grad_norm": 4.149903504313127, + "learning_rate": 9.823839344072582e-06, + "loss": 1.0646, + "step": 151 + }, + { + "epoch": 0.11350695416783348, + "grad_norm": 3.9437966555055355, + "learning_rate": 9.820641238612187e-06, + "loss": 1.1617, + "step": 152 + }, + { + "epoch": 0.11425371044525343, + "grad_norm": 3.443249729940194, + "learning_rate": 9.81741489378463e-06, + "loss": 1.0669, + "step": 153 + }, + { + "epoch": 0.11500046672267339, + "grad_norm": 3.975523331591033, + "learning_rate": 9.814160328489867e-06, + "loss": 1.0538, + "step": 154 + }, + { + "epoch": 0.11500046672267339, + "eval_loss": 1.0189063549041748, + "eval_runtime": 160.548, + "eval_samples_per_second": 112.297, + "eval_steps_per_second": 1.756, + "step": 154 + }, + { + "epoch": 0.11574722300009334, + "grad_norm": 6.747016403332581, + "learning_rate": 9.810877561793178e-06, + "loss": 1.0677, + "step": 155 + }, + { + "epoch": 0.1164939792775133, + "grad_norm": 3.502112711353132, + "learning_rate": 9.807566612925044e-06, + "loss": 1.1483, + "step": 156 + }, + { + "epoch": 0.11724073555493326, + "grad_norm": 4.033103295708667, + "learning_rate": 9.804227501281041e-06, + "loss": 1.0036, + "step": 157 + }, + { + "epoch": 0.11798749183235321, + "grad_norm": 3.353125159155293, + "learning_rate": 9.800860246421717e-06, + "loss": 1.0754, + "step": 158 + }, + { + "epoch": 0.11873424810977318, + "grad_norm": 5.233916333194442, + "learning_rate": 9.797464868072489e-06, + "loss": 1.0573, + "step": 159 + }, + { + "epoch": 0.11948100438719313, + "grad_norm": 4.551032759166754, + "learning_rate": 9.794041386123517e-06, + "loss": 1.1049, + "step": 160 + }, + { + "epoch": 0.12022776066461309, + "grad_norm": 4.518239629943308, + "learning_rate": 9.790589820629594e-06, + "loss": 1.024, + "step": 161 + }, + { + "epoch": 0.12097451694203304, + "grad_norm": 3.8193115164547984, + "learning_rate": 9.787110191810027e-06, + "loss": 1.13, + "step": 162 + }, + { + "epoch": 0.121721273219453, + "grad_norm": 3.7115307636722674, + "learning_rate": 9.783602520048524e-06, + "loss": 1.1001, + "step": 163 + }, + { + "epoch": 0.12246802949687295, + "grad_norm": 5.181860033123124, + "learning_rate": 9.780066825893055e-06, + "loss": 1.1233, + "step": 164 + }, + { + "epoch": 0.12321478577429291, + "grad_norm": 4.818202897694641, + "learning_rate": 9.776503130055758e-06, + "loss": 1.0216, + "step": 165 + }, + { + "epoch": 0.12396154205171288, + "grad_norm": 7.927525142394235, + "learning_rate": 9.7729114534128e-06, + "loss": 1.0463, + "step": 166 + }, + { + "epoch": 0.12470829832913283, + "grad_norm": 3.743721622719181, + "learning_rate": 9.76929181700426e-06, + "loss": 1.1261, + "step": 167 + }, + { + "epoch": 0.1254550546065528, + "grad_norm": 5.6431869998808875, + "learning_rate": 9.765644242034009e-06, + "loss": 1.0746, + "step": 168 + }, + { + "epoch": 0.1254550546065528, + "eval_loss": 1.017249345779419, + "eval_runtime": 160.6003, + "eval_samples_per_second": 112.26, + "eval_steps_per_second": 1.756, + "step": 168 + }, + { + "epoch": 0.12620181088397275, + "grad_norm": 3.131298638211881, + "learning_rate": 9.761968749869576e-06, + "loss": 0.9417, + "step": 169 + }, + { + "epoch": 0.1269485671613927, + "grad_norm": 7.862269405575784, + "learning_rate": 9.758265362042035e-06, + "loss": 1.1121, + "step": 170 + }, + { + "epoch": 0.12769532343881265, + "grad_norm": 3.640159123263889, + "learning_rate": 9.754534100245867e-06, + "loss": 1.0805, + "step": 171 + }, + { + "epoch": 0.12844207971623262, + "grad_norm": 4.5439716593057105, + "learning_rate": 9.750774986338851e-06, + "loss": 1.0338, + "step": 172 + }, + { + "epoch": 0.12918883599365258, + "grad_norm": 3.969721172469628, + "learning_rate": 9.746988042341907e-06, + "loss": 1.1218, + "step": 173 + }, + { + "epoch": 0.12993559227107254, + "grad_norm": 3.7519993464454453, + "learning_rate": 9.743173290438998e-06, + "loss": 1.0642, + "step": 174 + }, + { + "epoch": 0.13068234854849248, + "grad_norm": 3.9426723091303617, + "learning_rate": 9.739330752976981e-06, + "loss": 1.0608, + "step": 175 + }, + { + "epoch": 0.13142910482591244, + "grad_norm": 4.1886923692493285, + "learning_rate": 9.735460452465477e-06, + "loss": 1.1175, + "step": 176 + }, + { + "epoch": 0.1321758611033324, + "grad_norm": 3.7222485604926336, + "learning_rate": 9.731562411576751e-06, + "loss": 1.028, + "step": 177 + }, + { + "epoch": 0.13292261738075237, + "grad_norm": 3.3695163719588743, + "learning_rate": 9.727636653145567e-06, + "loss": 1.1298, + "step": 178 + }, + { + "epoch": 0.1336693736581723, + "grad_norm": 3.566733005393415, + "learning_rate": 9.723683200169059e-06, + "loss": 1.0149, + "step": 179 + }, + { + "epoch": 0.13441612993559227, + "grad_norm": 3.5062777269977223, + "learning_rate": 9.719702075806594e-06, + "loss": 1.0478, + "step": 180 + }, + { + "epoch": 0.13516288621301223, + "grad_norm": 4.408630041983221, + "learning_rate": 9.715693303379643e-06, + "loss": 1.0856, + "step": 181 + }, + { + "epoch": 0.1359096424904322, + "grad_norm": 4.602186459996313, + "learning_rate": 9.711656906371636e-06, + "loss": 1.0201, + "step": 182 + }, + { + "epoch": 0.1359096424904322, + "eval_loss": 1.0116474628448486, + "eval_runtime": 160.494, + "eval_samples_per_second": 112.334, + "eval_steps_per_second": 1.757, + "step": 182 + }, + { + "epoch": 0.13665639876785216, + "grad_norm": 4.712040307247732, + "learning_rate": 9.70759290842783e-06, + "loss": 1.0098, + "step": 183 + }, + { + "epoch": 0.1374031550452721, + "grad_norm": 5.620657515416197, + "learning_rate": 9.703501333355167e-06, + "loss": 1.0496, + "step": 184 + }, + { + "epoch": 0.13814991132269205, + "grad_norm": 5.801872955784515, + "learning_rate": 9.699382205122138e-06, + "loss": 1.1084, + "step": 185 + }, + { + "epoch": 0.13889666760011202, + "grad_norm": 3.8516216420714504, + "learning_rate": 9.695235547858638e-06, + "loss": 1.061, + "step": 186 + }, + { + "epoch": 0.13964342387753198, + "grad_norm": 3.8555280916375785, + "learning_rate": 9.69106138585583e-06, + "loss": 1.0538, + "step": 187 + }, + { + "epoch": 0.14039018015495192, + "grad_norm": 3.726076167412021, + "learning_rate": 9.686859743565997e-06, + "loss": 1.0334, + "step": 188 + }, + { + "epoch": 0.14113693643237188, + "grad_norm": 4.733689742037587, + "learning_rate": 9.682630645602409e-06, + "loss": 1.081, + "step": 189 + }, + { + "epoch": 0.14188369270979184, + "grad_norm": 3.858770100544451, + "learning_rate": 9.678374116739159e-06, + "loss": 1.0443, + "step": 190 + }, + { + "epoch": 0.1426304489872118, + "grad_norm": 4.070999582637023, + "learning_rate": 9.674090181911044e-06, + "loss": 1.1021, + "step": 191 + }, + { + "epoch": 0.14337720526463177, + "grad_norm": 4.5872247594173805, + "learning_rate": 9.669778866213397e-06, + "loss": 1.1395, + "step": 192 + }, + { + "epoch": 0.1441239615420517, + "grad_norm": 4.031163577626464, + "learning_rate": 9.665440194901951e-06, + "loss": 1.0292, + "step": 193 + }, + { + "epoch": 0.14487071781947167, + "grad_norm": 3.52474459878042, + "learning_rate": 9.661074193392689e-06, + "loss": 0.958, + "step": 194 + }, + { + "epoch": 0.14561747409689163, + "grad_norm": 3.436050793052391, + "learning_rate": 9.656680887261693e-06, + "loss": 0.986, + "step": 195 + }, + { + "epoch": 0.1463642303743116, + "grad_norm": 5.0419620789744295, + "learning_rate": 9.652260302244996e-06, + "loss": 1.0678, + "step": 196 + }, + { + "epoch": 0.1463642303743116, + "eval_loss": 1.0114675760269165, + "eval_runtime": 160.5029, + "eval_samples_per_second": 112.328, + "eval_steps_per_second": 1.757, + "step": 196 + }, + { + "epoch": 0.14711098665173153, + "grad_norm": 3.6147654687509694, + "learning_rate": 9.647812464238434e-06, + "loss": 1.028, + "step": 197 + }, + { + "epoch": 0.1478577429291515, + "grad_norm": 3.586767777623825, + "learning_rate": 9.643337399297485e-06, + "loss": 1.0449, + "step": 198 + }, + { + "epoch": 0.14860449920657146, + "grad_norm": 3.715503317019716, + "learning_rate": 9.638835133637129e-06, + "loss": 1.0655, + "step": 199 + }, + { + "epoch": 0.14935125548399142, + "grad_norm": 4.532690290017325, + "learning_rate": 9.634305693631686e-06, + "loss": 1.0047, + "step": 200 + }, + { + "epoch": 0.15009801176141138, + "grad_norm": 4.260013712201999, + "learning_rate": 9.629749105814664e-06, + "loss": 1.0509, + "step": 201 + }, + { + "epoch": 0.15084476803883132, + "grad_norm": 3.6166187921664794, + "learning_rate": 9.625165396878599e-06, + "loss": 1.0305, + "step": 202 + }, + { + "epoch": 0.15159152431625128, + "grad_norm": 4.163490064044642, + "learning_rate": 9.62055459367491e-06, + "loss": 1.093, + "step": 203 + }, + { + "epoch": 0.15233828059367124, + "grad_norm": 13.019898994869333, + "learning_rate": 9.615916723213728e-06, + "loss": 1.1112, + "step": 204 + }, + { + "epoch": 0.1530850368710912, + "grad_norm": 3.175945878707496, + "learning_rate": 9.611251812663748e-06, + "loss": 1.0688, + "step": 205 + }, + { + "epoch": 0.15383179314851114, + "grad_norm": 4.8715801343243195, + "learning_rate": 9.606559889352065e-06, + "loss": 1.0823, + "step": 206 + }, + { + "epoch": 0.1545785494259311, + "grad_norm": 3.232522890237694, + "learning_rate": 9.601840980764016e-06, + "loss": 0.9584, + "step": 207 + }, + { + "epoch": 0.15532530570335107, + "grad_norm": 4.556117894865469, + "learning_rate": 9.597095114543018e-06, + "loss": 1.0848, + "step": 208 + }, + { + "epoch": 0.15607206198077103, + "grad_norm": 3.404791411157451, + "learning_rate": 9.592322318490404e-06, + "loss": 1.0357, + "step": 209 + }, + { + "epoch": 0.156818818258191, + "grad_norm": 3.056791925639437, + "learning_rate": 9.587522620565263e-06, + "loss": 0.915, + "step": 210 + }, + { + "epoch": 0.156818818258191, + "eval_loss": 1.0087823867797852, + "eval_runtime": 160.6551, + "eval_samples_per_second": 112.222, + "eval_steps_per_second": 1.755, + "step": 210 + }, + { + "epoch": 0.15756557453561093, + "grad_norm": 3.7714924940308987, + "learning_rate": 9.582696048884277e-06, + "loss": 1.1256, + "step": 211 + }, + { + "epoch": 0.1583123308130309, + "grad_norm": 3.26219138631936, + "learning_rate": 9.577842631721553e-06, + "loss": 1.0654, + "step": 212 + }, + { + "epoch": 0.15905908709045086, + "grad_norm": 3.435138396589902, + "learning_rate": 9.57296239750846e-06, + "loss": 1.0228, + "step": 213 + }, + { + "epoch": 0.15980584336787082, + "grad_norm": 5.346686608096492, + "learning_rate": 9.568055374833463e-06, + "loss": 1.0289, + "step": 214 + }, + { + "epoch": 0.16055259964529076, + "grad_norm": 3.9966819427916347, + "learning_rate": 9.563121592441949e-06, + "loss": 1.1345, + "step": 215 + }, + { + "epoch": 0.16129935592271072, + "grad_norm": 5.555416407810244, + "learning_rate": 9.558161079236073e-06, + "loss": 1.1554, + "step": 216 + }, + { + "epoch": 0.16204611220013068, + "grad_norm": 4.159577619579252, + "learning_rate": 9.553173864274567e-06, + "loss": 1.0403, + "step": 217 + }, + { + "epoch": 0.16279286847755065, + "grad_norm": 3.318000661865423, + "learning_rate": 9.548159976772593e-06, + "loss": 1.0505, + "step": 218 + }, + { + "epoch": 0.1635396247549706, + "grad_norm": 3.2489399076259566, + "learning_rate": 9.543119446101556e-06, + "loss": 1.0231, + "step": 219 + }, + { + "epoch": 0.16428638103239054, + "grad_norm": 3.4135688094134933, + "learning_rate": 9.538052301788937e-06, + "loss": 1.1055, + "step": 220 + }, + { + "epoch": 0.1650331373098105, + "grad_norm": 3.268136264861273, + "learning_rate": 9.532958573518121e-06, + "loss": 0.9477, + "step": 221 + }, + { + "epoch": 0.16577989358723047, + "grad_norm": 3.5039317588782195, + "learning_rate": 9.527838291128222e-06, + "loss": 0.9773, + "step": 222 + }, + { + "epoch": 0.16652664986465043, + "grad_norm": 3.7195766398408088, + "learning_rate": 9.52269148461391e-06, + "loss": 1.0539, + "step": 223 + }, + { + "epoch": 0.16727340614207037, + "grad_norm": 3.5220614370608154, + "learning_rate": 9.51751818412523e-06, + "loss": 1.0342, + "step": 224 + }, + { + "epoch": 0.16727340614207037, + "eval_loss": 1.0059709548950195, + "eval_runtime": 160.5464, + "eval_samples_per_second": 112.298, + "eval_steps_per_second": 1.757, + "step": 224 + }, + { + "epoch": 0.16802016241949033, + "grad_norm": 4.075160908849732, + "learning_rate": 9.512318419967427e-06, + "loss": 1.0168, + "step": 225 + }, + { + "epoch": 0.1687669186969103, + "grad_norm": 2.9114360706072664, + "learning_rate": 9.507092222600783e-06, + "loss": 1.0028, + "step": 226 + }, + { + "epoch": 0.16951367497433026, + "grad_norm": 4.132080117167401, + "learning_rate": 9.50183962264041e-06, + "loss": 1.0656, + "step": 227 + }, + { + "epoch": 0.17026043125175022, + "grad_norm": 3.672183049449128, + "learning_rate": 9.496560650856097e-06, + "loss": 1.0088, + "step": 228 + }, + { + "epoch": 0.17100718752917016, + "grad_norm": 3.4549536917783943, + "learning_rate": 9.491255338172116e-06, + "loss": 1.0091, + "step": 229 + }, + { + "epoch": 0.17175394380659012, + "grad_norm": 5.640659404596868, + "learning_rate": 9.485923715667043e-06, + "loss": 1.1446, + "step": 230 + }, + { + "epoch": 0.17250070008401008, + "grad_norm": 3.6480354854741193, + "learning_rate": 9.48056581457358e-06, + "loss": 1.0238, + "step": 231 + }, + { + "epoch": 0.17324745636143005, + "grad_norm": 3.236903605406026, + "learning_rate": 9.47518166627837e-06, + "loss": 0.943, + "step": 232 + }, + { + "epoch": 0.17399421263884998, + "grad_norm": 4.153416002763863, + "learning_rate": 9.469771302321806e-06, + "loss": 1.0034, + "step": 233 + }, + { + "epoch": 0.17474096891626995, + "grad_norm": 3.6956623376560973, + "learning_rate": 9.464334754397861e-06, + "loss": 1.0605, + "step": 234 + }, + { + "epoch": 0.1754877251936899, + "grad_norm": 3.206378764325457, + "learning_rate": 9.458872054353888e-06, + "loss": 1.008, + "step": 235 + }, + { + "epoch": 0.17623448147110987, + "grad_norm": 5.157203956458338, + "learning_rate": 9.453383234190443e-06, + "loss": 1.085, + "step": 236 + }, + { + "epoch": 0.17698123774852983, + "grad_norm": 4.037061991227878, + "learning_rate": 9.44786832606109e-06, + "loss": 1.0144, + "step": 237 + }, + { + "epoch": 0.17772799402594977, + "grad_norm": 4.908695178645077, + "learning_rate": 9.44232736227222e-06, + "loss": 1.0413, + "step": 238 + }, + { + "epoch": 0.17772799402594977, + "eval_loss": 1.0027769804000854, + "eval_runtime": 160.4435, + "eval_samples_per_second": 112.37, + "eval_steps_per_second": 1.758, + "step": 238 + }, + { + "epoch": 0.17847475030336973, + "grad_norm": 4.237247843930998, + "learning_rate": 9.436760375282858e-06, + "loss": 1.1329, + "step": 239 + }, + { + "epoch": 0.1792215065807897, + "grad_norm": 3.13851110753343, + "learning_rate": 9.431167397704473e-06, + "loss": 1.0213, + "step": 240 + }, + { + "epoch": 0.17996826285820966, + "grad_norm": 4.018367507468041, + "learning_rate": 9.425548462300784e-06, + "loss": 1.0049, + "step": 241 + }, + { + "epoch": 0.1807150191356296, + "grad_norm": 3.847188380013737, + "learning_rate": 9.419903601987577e-06, + "loss": 0.9974, + "step": 242 + }, + { + "epoch": 0.18146177541304956, + "grad_norm": 3.6783145957465275, + "learning_rate": 9.414232849832501e-06, + "loss": 1.0258, + "step": 243 + }, + { + "epoch": 0.18220853169046952, + "grad_norm": 4.133477837431737, + "learning_rate": 9.408536239054881e-06, + "loss": 1.0285, + "step": 244 + }, + { + "epoch": 0.18295528796788949, + "grad_norm": 3.568010560178771, + "learning_rate": 9.402813803025526e-06, + "loss": 1.0521, + "step": 245 + }, + { + "epoch": 0.18370204424530945, + "grad_norm": 2.808739846372454, + "learning_rate": 9.397065575266524e-06, + "loss": 0.9989, + "step": 246 + }, + { + "epoch": 0.18444880052272938, + "grad_norm": 3.07097954551416, + "learning_rate": 9.391291589451056e-06, + "loss": 0.9804, + "step": 247 + }, + { + "epoch": 0.18519555680014935, + "grad_norm": 3.2507539771287566, + "learning_rate": 9.38549187940319e-06, + "loss": 1.0255, + "step": 248 + }, + { + "epoch": 0.1859423130775693, + "grad_norm": 2.936663911951155, + "learning_rate": 9.379666479097688e-06, + "loss": 1.0129, + "step": 249 + }, + { + "epoch": 0.18668906935498927, + "grad_norm": 4.046385332301949, + "learning_rate": 9.373815422659806e-06, + "loss": 0.9575, + "step": 250 + }, + { + "epoch": 0.1874358256324092, + "grad_norm": 3.1585373599718625, + "learning_rate": 9.3679387443651e-06, + "loss": 1.0272, + "step": 251 + }, + { + "epoch": 0.18818258190982917, + "grad_norm": 3.5612606806923277, + "learning_rate": 9.362036478639206e-06, + "loss": 1.0497, + "step": 252 + }, + { + "epoch": 0.18818258190982917, + "eval_loss": 1.000356912612915, + "eval_runtime": 160.5493, + "eval_samples_per_second": 112.296, + "eval_steps_per_second": 1.756, + "step": 252 + }, + { + "epoch": 0.18892933818724914, + "grad_norm": 3.4215406870010567, + "learning_rate": 9.356108660057662e-06, + "loss": 1.0153, + "step": 253 + }, + { + "epoch": 0.1896760944646691, + "grad_norm": 6.549260486707147, + "learning_rate": 9.35015532334569e-06, + "loss": 1.0474, + "step": 254 + }, + { + "epoch": 0.19042285074208906, + "grad_norm": 3.7221855827438604, + "learning_rate": 9.344176503378003e-06, + "loss": 1.0034, + "step": 255 + }, + { + "epoch": 0.191169607019509, + "grad_norm": 4.286172100728777, + "learning_rate": 9.33817223517859e-06, + "loss": 1.1106, + "step": 256 + }, + { + "epoch": 0.19191636329692896, + "grad_norm": 3.7718076814034602, + "learning_rate": 9.332142553920513e-06, + "loss": 1.004, + "step": 257 + }, + { + "epoch": 0.19266311957434892, + "grad_norm": 4.027126629991061, + "learning_rate": 9.326087494925715e-06, + "loss": 1.0292, + "step": 258 + }, + { + "epoch": 0.1934098758517689, + "grad_norm": 4.013082273454176, + "learning_rate": 9.32000709366479e-06, + "loss": 1.0707, + "step": 259 + }, + { + "epoch": 0.19415663212918885, + "grad_norm": 3.6414888535002388, + "learning_rate": 9.313901385756794e-06, + "loss": 1.0261, + "step": 260 + }, + { + "epoch": 0.19490338840660879, + "grad_norm": 3.1393801326018043, + "learning_rate": 9.307770406969032e-06, + "loss": 1.0124, + "step": 261 + }, + { + "epoch": 0.19565014468402875, + "grad_norm": 4.255795797008218, + "learning_rate": 9.301614193216837e-06, + "loss": 1.0726, + "step": 262 + }, + { + "epoch": 0.1963969009614487, + "grad_norm": 3.2995671562106095, + "learning_rate": 9.295432780563378e-06, + "loss": 0.9887, + "step": 263 + }, + { + "epoch": 0.19714365723886867, + "grad_norm": 3.023791866548189, + "learning_rate": 9.289226205219432e-06, + "loss": 0.9578, + "step": 264 + }, + { + "epoch": 0.1978904135162886, + "grad_norm": 2.9272861770466543, + "learning_rate": 9.282994503543185e-06, + "loss": 0.917, + "step": 265 + }, + { + "epoch": 0.19863716979370857, + "grad_norm": 3.9852539375546447, + "learning_rate": 9.276737712040012e-06, + "loss": 1.0247, + "step": 266 + }, + { + "epoch": 0.19863716979370857, + "eval_loss": 0.9999991059303284, + "eval_runtime": 160.5983, + "eval_samples_per_second": 112.261, + "eval_steps_per_second": 1.756, + "step": 266 + }, + { + "epoch": 0.19938392607112854, + "grad_norm": 4.4042754214389594, + "learning_rate": 9.270455867362262e-06, + "loss": 1.0177, + "step": 267 + }, + { + "epoch": 0.2001306823485485, + "grad_norm": 3.680747210357818, + "learning_rate": 9.264149006309048e-06, + "loss": 1.0334, + "step": 268 + }, + { + "epoch": 0.20087743862596846, + "grad_norm": 3.8140169790142218, + "learning_rate": 9.257817165826027e-06, + "loss": 1.0524, + "step": 269 + }, + { + "epoch": 0.2016241949033884, + "grad_norm": 5.114797128136089, + "learning_rate": 9.251460383005188e-06, + "loss": 1.0574, + "step": 270 + }, + { + "epoch": 0.20237095118080836, + "grad_norm": 3.1878689642025995, + "learning_rate": 9.245078695084632e-06, + "loss": 1.029, + "step": 271 + }, + { + "epoch": 0.20311770745822832, + "grad_norm": 3.8148562988799295, + "learning_rate": 9.238672139448354e-06, + "loss": 1.0469, + "step": 272 + }, + { + "epoch": 0.2038644637356483, + "grad_norm": 3.767249082048484, + "learning_rate": 9.232240753626027e-06, + "loss": 1.0501, + "step": 273 + }, + { + "epoch": 0.20461122001306822, + "grad_norm": 2.5236070719630885, + "learning_rate": 9.225784575292772e-06, + "loss": 0.9713, + "step": 274 + }, + { + "epoch": 0.2053579762904882, + "grad_norm": 4.582403432492713, + "learning_rate": 9.219303642268953e-06, + "loss": 1.0448, + "step": 275 + }, + { + "epoch": 0.20610473256790815, + "grad_norm": 2.810180928314182, + "learning_rate": 9.212797992519942e-06, + "loss": 0.963, + "step": 276 + }, + { + "epoch": 0.2068514888453281, + "grad_norm": 3.2399813947929834, + "learning_rate": 9.206267664155906e-06, + "loss": 0.9717, + "step": 277 + }, + { + "epoch": 0.20759824512274808, + "grad_norm": 4.912297938186508, + "learning_rate": 9.199712695431577e-06, + "loss": 1.1234, + "step": 278 + }, + { + "epoch": 0.208345001400168, + "grad_norm": 3.0084002580336304, + "learning_rate": 9.193133124746029e-06, + "loss": 1.0269, + "step": 279 + }, + { + "epoch": 0.20909175767758797, + "grad_norm": 3.760705263479795, + "learning_rate": 9.186528990642456e-06, + "loss": 1.0662, + "step": 280 + }, + { + "epoch": 0.20909175767758797, + "eval_loss": 0.9944674372673035, + "eval_runtime": 160.5401, + "eval_samples_per_second": 112.302, + "eval_steps_per_second": 1.757, + "step": 280 + }, + { + "epoch": 0.20983851395500794, + "grad_norm": 3.3378272588505196, + "learning_rate": 9.179900331807949e-06, + "loss": 1.0086, + "step": 281 + }, + { + "epoch": 0.2105852702324279, + "grad_norm": 3.4975415402391614, + "learning_rate": 9.173247187073258e-06, + "loss": 1.0851, + "step": 282 + }, + { + "epoch": 0.21133202650984784, + "grad_norm": 3.469930363846197, + "learning_rate": 9.166569595412576e-06, + "loss": 0.9715, + "step": 283 + }, + { + "epoch": 0.2120787827872678, + "grad_norm": 3.2839874663262734, + "learning_rate": 9.159867595943305e-06, + "loss": 1.0602, + "step": 284 + }, + { + "epoch": 0.21282553906468776, + "grad_norm": 5.543480811563083, + "learning_rate": 9.153141227925828e-06, + "loss": 1.0608, + "step": 285 + }, + { + "epoch": 0.21357229534210773, + "grad_norm": 2.8606552784980463, + "learning_rate": 9.146390530763281e-06, + "loss": 1.056, + "step": 286 + }, + { + "epoch": 0.2143190516195277, + "grad_norm": 3.948112135838377, + "learning_rate": 9.139615544001319e-06, + "loss": 1.1039, + "step": 287 + }, + { + "epoch": 0.21506580789694763, + "grad_norm": 3.2663451420355787, + "learning_rate": 9.132816307327886e-06, + "loss": 1.0605, + "step": 288 + }, + { + "epoch": 0.2158125641743676, + "grad_norm": 3.6151154254671596, + "learning_rate": 9.125992860572979e-06, + "loss": 1.0952, + "step": 289 + }, + { + "epoch": 0.21655932045178755, + "grad_norm": 4.912648681546459, + "learning_rate": 9.119145243708425e-06, + "loss": 1.0311, + "step": 290 + }, + { + "epoch": 0.21730607672920751, + "grad_norm": 4.21191863900281, + "learning_rate": 9.112273496847633e-06, + "loss": 1.1001, + "step": 291 + }, + { + "epoch": 0.21805283300662745, + "grad_norm": 3.3815445362628336, + "learning_rate": 9.10537766024537e-06, + "loss": 1.0159, + "step": 292 + }, + { + "epoch": 0.2187995892840474, + "grad_norm": 3.0735326892001917, + "learning_rate": 9.09845777429752e-06, + "loss": 0.9702, + "step": 293 + }, + { + "epoch": 0.21954634556146738, + "grad_norm": 4.053765796563824, + "learning_rate": 9.091513879540845e-06, + "loss": 0.9826, + "step": 294 + }, + { + "epoch": 0.21954634556146738, + "eval_loss": 0.9944608807563782, + "eval_runtime": 160.5629, + "eval_samples_per_second": 112.286, + "eval_steps_per_second": 1.756, + "step": 294 + }, + { + "epoch": 0.22029310183888734, + "grad_norm": 4.865522980455722, + "learning_rate": 9.084546016652758e-06, + "loss": 1.1022, + "step": 295 + }, + { + "epoch": 0.2210398581163073, + "grad_norm": 3.747759280503234, + "learning_rate": 9.07755422645107e-06, + "loss": 1.0149, + "step": 296 + }, + { + "epoch": 0.22178661439372724, + "grad_norm": 2.886224555660969, + "learning_rate": 9.070538549893762e-06, + "loss": 0.9359, + "step": 297 + }, + { + "epoch": 0.2225333706711472, + "grad_norm": 3.707336145864212, + "learning_rate": 9.063499028078742e-06, + "loss": 1.0696, + "step": 298 + }, + { + "epoch": 0.22328012694856716, + "grad_norm": 2.918913323052194, + "learning_rate": 9.056435702243601e-06, + "loss": 0.9829, + "step": 299 + }, + { + "epoch": 0.22402688322598713, + "grad_norm": 3.5655254434314485, + "learning_rate": 9.049348613765379e-06, + "loss": 1.1198, + "step": 300 + }, + { + "epoch": 0.22477363950340706, + "grad_norm": 3.87864421227377, + "learning_rate": 9.042237804160313e-06, + "loss": 1.045, + "step": 301 + }, + { + "epoch": 0.22552039578082703, + "grad_norm": 5.860458069015467, + "learning_rate": 9.035103315083603e-06, + "loss": 1.0307, + "step": 302 + }, + { + "epoch": 0.226267152058247, + "grad_norm": 3.383434386692763, + "learning_rate": 9.027945188329157e-06, + "loss": 0.989, + "step": 303 + }, + { + "epoch": 0.22701390833566695, + "grad_norm": 3.65424563045076, + "learning_rate": 9.020763465829361e-06, + "loss": 1.025, + "step": 304 + }, + { + "epoch": 0.22776066461308692, + "grad_norm": 3.6361185953370856, + "learning_rate": 9.013558189654819e-06, + "loss": 0.9718, + "step": 305 + }, + { + "epoch": 0.22850742089050685, + "grad_norm": 4.603618291504726, + "learning_rate": 9.006329402014115e-06, + "loss": 0.976, + "step": 306 + }, + { + "epoch": 0.22925417716792681, + "grad_norm": 4.4620408030223775, + "learning_rate": 8.999077145253564e-06, + "loss": 1.0732, + "step": 307 + }, + { + "epoch": 0.23000093344534678, + "grad_norm": 4.541176700346841, + "learning_rate": 8.991801461856961e-06, + "loss": 1.067, + "step": 308 + }, + { + "epoch": 0.23000093344534678, + "eval_loss": 0.9934854507446289, + "eval_runtime": 160.5833, + "eval_samples_per_second": 112.272, + "eval_steps_per_second": 1.756, + "step": 308 + }, + { + "epoch": 0.23074768972276674, + "grad_norm": 3.195979915169788, + "learning_rate": 8.984502394445338e-06, + "loss": 1.0315, + "step": 309 + }, + { + "epoch": 0.23149444600018668, + "grad_norm": 3.835832525500061, + "learning_rate": 8.977179985776707e-06, + "loss": 1.1007, + "step": 310 + }, + { + "epoch": 0.23224120227760664, + "grad_norm": 5.0483013462475075, + "learning_rate": 8.969834278745817e-06, + "loss": 1.0427, + "step": 311 + }, + { + "epoch": 0.2329879585550266, + "grad_norm": 4.252092405014885, + "learning_rate": 8.962465316383894e-06, + "loss": 1.0672, + "step": 312 + }, + { + "epoch": 0.23373471483244657, + "grad_norm": 3.3931880091176083, + "learning_rate": 8.955073141858401e-06, + "loss": 1.0445, + "step": 313 + }, + { + "epoch": 0.23448147110986653, + "grad_norm": 3.7519330808807987, + "learning_rate": 8.94765779847277e-06, + "loss": 1.0875, + "step": 314 + }, + { + "epoch": 0.23522822738728646, + "grad_norm": 3.404074995155419, + "learning_rate": 8.940219329666167e-06, + "loss": 1.0125, + "step": 315 + }, + { + "epoch": 0.23597498366470643, + "grad_norm": 3.978543801899103, + "learning_rate": 8.932757779013214e-06, + "loss": 1.0249, + "step": 316 + }, + { + "epoch": 0.2367217399421264, + "grad_norm": 3.7272416239455763, + "learning_rate": 8.925273190223756e-06, + "loss": 1.089, + "step": 317 + }, + { + "epoch": 0.23746849621954635, + "grad_norm": 3.8636012935517448, + "learning_rate": 8.917765607142594e-06, + "loss": 1.0153, + "step": 318 + }, + { + "epoch": 0.2382152524969663, + "grad_norm": 3.2747280611352894, + "learning_rate": 8.910235073749226e-06, + "loss": 1.0644, + "step": 319 + }, + { + "epoch": 0.23896200877438625, + "grad_norm": 5.279253406480692, + "learning_rate": 8.9026816341576e-06, + "loss": 1.0308, + "step": 320 + }, + { + "epoch": 0.23970876505180622, + "grad_norm": 3.11287763067271, + "learning_rate": 8.895105332615841e-06, + "loss": 1.0005, + "step": 321 + }, + { + "epoch": 0.24045552132922618, + "grad_norm": 4.973692199255152, + "learning_rate": 8.887506213506005e-06, + "loss": 1.0565, + "step": 322 + }, + { + "epoch": 0.24045552132922618, + "eval_loss": 0.9922440648078918, + "eval_runtime": 160.7313, + "eval_samples_per_second": 112.169, + "eval_steps_per_second": 1.754, + "step": 322 + }, + { + "epoch": 0.24120227760664614, + "grad_norm": 3.1961095681046605, + "learning_rate": 8.879884321343813e-06, + "loss": 1.0421, + "step": 323 + }, + { + "epoch": 0.24194903388406608, + "grad_norm": 3.4689654047952816, + "learning_rate": 8.872239700778387e-06, + "loss": 1.0353, + "step": 324 + }, + { + "epoch": 0.24269579016148604, + "grad_norm": 3.316236552129336, + "learning_rate": 8.864572396591996e-06, + "loss": 1.0268, + "step": 325 + }, + { + "epoch": 0.243442546438906, + "grad_norm": 4.010944666361697, + "learning_rate": 8.856882453699789e-06, + "loss": 1.0012, + "step": 326 + }, + { + "epoch": 0.24418930271632597, + "grad_norm": 3.6005519269255504, + "learning_rate": 8.849169917149532e-06, + "loss": 1.0483, + "step": 327 + }, + { + "epoch": 0.2449360589937459, + "grad_norm": 3.039548950649496, + "learning_rate": 8.841434832121345e-06, + "loss": 1.053, + "step": 328 + }, + { + "epoch": 0.24568281527116587, + "grad_norm": 4.617344719406804, + "learning_rate": 8.833677243927439e-06, + "loss": 0.9565, + "step": 329 + }, + { + "epoch": 0.24642957154858583, + "grad_norm": 3.0892815649842986, + "learning_rate": 8.825897198011847e-06, + "loss": 1.0735, + "step": 330 + }, + { + "epoch": 0.2471763278260058, + "grad_norm": 3.675909285421093, + "learning_rate": 8.818094739950157e-06, + "loss": 1.0462, + "step": 331 + }, + { + "epoch": 0.24792308410342576, + "grad_norm": 3.927754317009532, + "learning_rate": 8.810269915449255e-06, + "loss": 1.114, + "step": 332 + }, + { + "epoch": 0.2486698403808457, + "grad_norm": 2.990101012798793, + "learning_rate": 8.802422770347044e-06, + "loss": 1.0208, + "step": 333 + }, + { + "epoch": 0.24941659665826565, + "grad_norm": 3.3346146894348956, + "learning_rate": 8.79455335061218e-06, + "loss": 0.978, + "step": 334 + }, + { + "epoch": 0.2501633529356856, + "grad_norm": 3.319351140293654, + "learning_rate": 8.786661702343811e-06, + "loss": 1.0032, + "step": 335 + }, + { + "epoch": 0.2509101092131056, + "grad_norm": 3.1815233823997775, + "learning_rate": 8.778747871771293e-06, + "loss": 0.9834, + "step": 336 + }, + { + "epoch": 0.2509101092131056, + "eval_loss": 0.9869978427886963, + "eval_runtime": 160.5665, + "eval_samples_per_second": 112.284, + "eval_steps_per_second": 1.756, + "step": 336 + }, + { + "epoch": 0.2516568654905255, + "grad_norm": 3.8614964239023575, + "learning_rate": 8.770811905253929e-06, + "loss": 0.9907, + "step": 337 + }, + { + "epoch": 0.2524036217679455, + "grad_norm": 4.010000964702555, + "learning_rate": 8.762853849280692e-06, + "loss": 0.9913, + "step": 338 + }, + { + "epoch": 0.25315037804536544, + "grad_norm": 3.390634860886564, + "learning_rate": 8.754873750469964e-06, + "loss": 1.0259, + "step": 339 + }, + { + "epoch": 0.2538971343227854, + "grad_norm": 3.889606894130351, + "learning_rate": 8.746871655569245e-06, + "loss": 1.1029, + "step": 340 + }, + { + "epoch": 0.25464389060020537, + "grad_norm": 5.308002153451313, + "learning_rate": 8.738847611454887e-06, + "loss": 1.0702, + "step": 341 + }, + { + "epoch": 0.2553906468776253, + "grad_norm": 3.588824400370797, + "learning_rate": 8.730801665131831e-06, + "loss": 1.0099, + "step": 342 + }, + { + "epoch": 0.2561374031550453, + "grad_norm": 5.807491833529212, + "learning_rate": 8.722733863733314e-06, + "loss": 1.1411, + "step": 343 + }, + { + "epoch": 0.25688415943246523, + "grad_norm": 3.2267656811014396, + "learning_rate": 8.714644254520599e-06, + "loss": 1.0253, + "step": 344 + }, + { + "epoch": 0.25763091570988517, + "grad_norm": 4.5457930141059, + "learning_rate": 8.706532884882704e-06, + "loss": 1.1174, + "step": 345 + }, + { + "epoch": 0.25837767198730516, + "grad_norm": 10.348057960352017, + "learning_rate": 8.698399802336117e-06, + "loss": 0.9912, + "step": 346 + }, + { + "epoch": 0.2591244282647251, + "grad_norm": 3.066110108501509, + "learning_rate": 8.690245054524522e-06, + "loss": 1.0336, + "step": 347 + }, + { + "epoch": 0.2598711845421451, + "grad_norm": 3.370718413246445, + "learning_rate": 8.682068689218517e-06, + "loss": 1.0249, + "step": 348 + }, + { + "epoch": 0.260617940819565, + "grad_norm": 3.1496821083014184, + "learning_rate": 8.673870754315336e-06, + "loss": 0.9605, + "step": 349 + }, + { + "epoch": 0.26136469709698495, + "grad_norm": 3.8758345843988233, + "learning_rate": 8.665651297838572e-06, + "loss": 1.0054, + "step": 350 + }, + { + "epoch": 0.26136469709698495, + "eval_loss": 0.9857860207557678, + "eval_runtime": 160.6029, + "eval_samples_per_second": 112.258, + "eval_steps_per_second": 1.756, + "step": 350 + }, + { + "epoch": 0.26211145337440495, + "grad_norm": 3.651482334551038, + "learning_rate": 8.65741036793788e-06, + "loss": 1.0849, + "step": 351 + }, + { + "epoch": 0.2628582096518249, + "grad_norm": 3.3246917496100306, + "learning_rate": 8.649148012888717e-06, + "loss": 0.959, + "step": 352 + }, + { + "epoch": 0.2636049659292448, + "grad_norm": 3.4869853483297235, + "learning_rate": 8.640864281092051e-06, + "loss": 1.0238, + "step": 353 + }, + { + "epoch": 0.2643517222066648, + "grad_norm": 3.62379026669033, + "learning_rate": 8.632559221074063e-06, + "loss": 1.0621, + "step": 354 + }, + { + "epoch": 0.26509847848408474, + "grad_norm": 3.407945964026508, + "learning_rate": 8.624232881485887e-06, + "loss": 1.0635, + "step": 355 + }, + { + "epoch": 0.26584523476150473, + "grad_norm": 2.7911014267413417, + "learning_rate": 8.615885311103306e-06, + "loss": 1.0197, + "step": 356 + }, + { + "epoch": 0.26659199103892467, + "grad_norm": 3.087248719032191, + "learning_rate": 8.607516558826477e-06, + "loss": 1.0095, + "step": 357 + }, + { + "epoch": 0.2673387473163446, + "grad_norm": 3.232901307867413, + "learning_rate": 8.599126673679636e-06, + "loss": 0.984, + "step": 358 + }, + { + "epoch": 0.2680855035937646, + "grad_norm": 4.085245454288716, + "learning_rate": 8.590715704810823e-06, + "loss": 1.0483, + "step": 359 + }, + { + "epoch": 0.26883225987118453, + "grad_norm": 6.945488421551598, + "learning_rate": 8.582283701491576e-06, + "loss": 1.0444, + "step": 360 + }, + { + "epoch": 0.2695790161486045, + "grad_norm": 3.6035457167003067, + "learning_rate": 8.573830713116663e-06, + "loss": 0.989, + "step": 361 + }, + { + "epoch": 0.27032577242602446, + "grad_norm": 4.121541341754975, + "learning_rate": 8.565356789203781e-06, + "loss": 1.0037, + "step": 362 + }, + { + "epoch": 0.2710725287034444, + "grad_norm": 3.79404977076993, + "learning_rate": 8.556861979393263e-06, + "loss": 1.0403, + "step": 363 + }, + { + "epoch": 0.2718192849808644, + "grad_norm": 4.030031666437055, + "learning_rate": 8.548346333447794e-06, + "loss": 1.0121, + "step": 364 + }, + { + "epoch": 0.2718192849808644, + "eval_loss": 0.9849192500114441, + "eval_runtime": 160.5363, + "eval_samples_per_second": 112.305, + "eval_steps_per_second": 1.757, + "step": 364 + }, + { + "epoch": 0.2725660412582843, + "grad_norm": 2.888450802695812, + "learning_rate": 8.539809901252118e-06, + "loss": 0.9501, + "step": 365 + }, + { + "epoch": 0.2733127975357043, + "grad_norm": 3.9527081940985545, + "learning_rate": 8.531252732812744e-06, + "loss": 1.0831, + "step": 366 + }, + { + "epoch": 0.27405955381312425, + "grad_norm": 6.463294702133942, + "learning_rate": 8.522674878257658e-06, + "loss": 1.0366, + "step": 367 + }, + { + "epoch": 0.2748063100905442, + "grad_norm": 3.095330854017702, + "learning_rate": 8.514076387836022e-06, + "loss": 0.9823, + "step": 368 + }, + { + "epoch": 0.2755530663679642, + "grad_norm": 3.8233257574656743, + "learning_rate": 8.505457311917878e-06, + "loss": 1.0125, + "step": 369 + }, + { + "epoch": 0.2762998226453841, + "grad_norm": 4.0903866779791676, + "learning_rate": 8.496817700993869e-06, + "loss": 1.0439, + "step": 370 + }, + { + "epoch": 0.27704657892280404, + "grad_norm": 3.8768659844134743, + "learning_rate": 8.488157605674924e-06, + "loss": 1.1039, + "step": 371 + }, + { + "epoch": 0.27779333520022403, + "grad_norm": 2.9318267792600357, + "learning_rate": 8.479477076691975e-06, + "loss": 0.9654, + "step": 372 + }, + { + "epoch": 0.27854009147764397, + "grad_norm": 4.231056764323647, + "learning_rate": 8.47077616489565e-06, + "loss": 0.9985, + "step": 373 + }, + { + "epoch": 0.27928684775506396, + "grad_norm": 3.0018019927339936, + "learning_rate": 8.462054921255984e-06, + "loss": 1.07, + "step": 374 + }, + { + "epoch": 0.2800336040324839, + "grad_norm": 3.860901132155319, + "learning_rate": 8.453313396862113e-06, + "loss": 1.0714, + "step": 375 + }, + { + "epoch": 0.28078036030990383, + "grad_norm": 3.397395596102485, + "learning_rate": 8.44455164292198e-06, + "loss": 1.0387, + "step": 376 + }, + { + "epoch": 0.2815271165873238, + "grad_norm": 2.8794162395513654, + "learning_rate": 8.43576971076203e-06, + "loss": 0.9757, + "step": 377 + }, + { + "epoch": 0.28227387286474376, + "grad_norm": 3.4033543290966928, + "learning_rate": 8.426967651826914e-06, + "loss": 0.9365, + "step": 378 + }, + { + "epoch": 0.28227387286474376, + "eval_loss": 0.9828009009361267, + "eval_runtime": 160.585, + "eval_samples_per_second": 112.271, + "eval_steps_per_second": 1.756, + "step": 378 + }, + { + "epoch": 0.28302062914216375, + "grad_norm": 3.3989624743375617, + "learning_rate": 8.418145517679188e-06, + "loss": 1.0231, + "step": 379 + }, + { + "epoch": 0.2837673854195837, + "grad_norm": 4.328116900729524, + "learning_rate": 8.409303359999007e-06, + "loss": 0.9752, + "step": 380 + }, + { + "epoch": 0.2845141416970036, + "grad_norm": 3.602577669926632, + "learning_rate": 8.400441230583822e-06, + "loss": 1.0199, + "step": 381 + }, + { + "epoch": 0.2852608979744236, + "grad_norm": 3.5809369419716752, + "learning_rate": 8.391559181348081e-06, + "loss": 0.9868, + "step": 382 + }, + { + "epoch": 0.28600765425184355, + "grad_norm": 3.395383899337209, + "learning_rate": 8.382657264322924e-06, + "loss": 1.0212, + "step": 383 + }, + { + "epoch": 0.28675441052926354, + "grad_norm": 6.772601000085416, + "learning_rate": 8.37373553165587e-06, + "loss": 1.1117, + "step": 384 + }, + { + "epoch": 0.2875011668066835, + "grad_norm": 3.252451755396697, + "learning_rate": 8.364794035610527e-06, + "loss": 1.0028, + "step": 385 + }, + { + "epoch": 0.2882479230841034, + "grad_norm": 3.5098322414248395, + "learning_rate": 8.355832828566273e-06, + "loss": 0.9566, + "step": 386 + }, + { + "epoch": 0.2889946793615234, + "grad_norm": 2.9266861457219187, + "learning_rate": 8.346851963017952e-06, + "loss": 0.9624, + "step": 387 + }, + { + "epoch": 0.28974143563894333, + "grad_norm": 3.8574986005071814, + "learning_rate": 8.337851491575569e-06, + "loss": 1.1127, + "step": 388 + }, + { + "epoch": 0.29048819191636327, + "grad_norm": 3.7829383097996323, + "learning_rate": 8.32883146696398e-06, + "loss": 1.054, + "step": 389 + }, + { + "epoch": 0.29123494819378326, + "grad_norm": 4.299196127336594, + "learning_rate": 8.319791942022586e-06, + "loss": 1.1059, + "step": 390 + }, + { + "epoch": 0.2919817044712032, + "grad_norm": 3.537643283730942, + "learning_rate": 8.310732969705018e-06, + "loss": 1.0657, + "step": 391 + }, + { + "epoch": 0.2927284607486232, + "grad_norm": 3.1645266690038616, + "learning_rate": 8.301654603078832e-06, + "loss": 1.037, + "step": 392 + }, + { + "epoch": 0.2927284607486232, + "eval_loss": 0.9834852814674377, + "eval_runtime": 160.6069, + "eval_samples_per_second": 112.255, + "eval_steps_per_second": 1.756, + "step": 392 + }, + { + "epoch": 0.2934752170260431, + "grad_norm": 4.187400458258949, + "learning_rate": 8.292556895325195e-06, + "loss": 0.998, + "step": 393 + }, + { + "epoch": 0.29422197330346306, + "grad_norm": 3.5667687656977387, + "learning_rate": 8.283439899738574e-06, + "loss": 1.0179, + "step": 394 + }, + { + "epoch": 0.29496872958088305, + "grad_norm": 3.765218136068551, + "learning_rate": 8.274303669726427e-06, + "loss": 1.0147, + "step": 395 + }, + { + "epoch": 0.295715485858303, + "grad_norm": 3.864131300606489, + "learning_rate": 8.265148258808884e-06, + "loss": 0.9901, + "step": 396 + }, + { + "epoch": 0.296462242135723, + "grad_norm": 2.949157107673803, + "learning_rate": 8.255973720618438e-06, + "loss": 1.0055, + "step": 397 + }, + { + "epoch": 0.2972089984131429, + "grad_norm": 3.7436652842793374, + "learning_rate": 8.246780108899635e-06, + "loss": 1.1692, + "step": 398 + }, + { + "epoch": 0.29795575469056285, + "grad_norm": 3.2192130117671343, + "learning_rate": 8.237567477508744e-06, + "loss": 0.9998, + "step": 399 + }, + { + "epoch": 0.29870251096798284, + "grad_norm": 3.6902903702330443, + "learning_rate": 8.228335880413458e-06, + "loss": 1.0037, + "step": 400 + }, + { + "epoch": 0.2994492672454028, + "grad_norm": 3.4271815650751436, + "learning_rate": 8.219085371692573e-06, + "loss": 1.0365, + "step": 401 + }, + { + "epoch": 0.30019602352282276, + "grad_norm": 3.4017253839996133, + "learning_rate": 8.209816005535665e-06, + "loss": 1.0868, + "step": 402 + }, + { + "epoch": 0.3009427798002427, + "grad_norm": 2.7458347961593566, + "learning_rate": 8.200527836242775e-06, + "loss": 0.9653, + "step": 403 + }, + { + "epoch": 0.30168953607766263, + "grad_norm": 3.1745766085250007, + "learning_rate": 8.191220918224102e-06, + "loss": 0.993, + "step": 404 + }, + { + "epoch": 0.3024362923550826, + "grad_norm": 3.090654241484588, + "learning_rate": 8.181895305999665e-06, + "loss": 0.9881, + "step": 405 + }, + { + "epoch": 0.30318304863250256, + "grad_norm": 3.6314806742898282, + "learning_rate": 8.172551054199002e-06, + "loss": 1.0375, + "step": 406 + }, + { + "epoch": 0.30318304863250256, + "eval_loss": 0.9811131358146667, + "eval_runtime": 160.5504, + "eval_samples_per_second": 112.295, + "eval_steps_per_second": 1.756, + "step": 406 + }, + { + "epoch": 0.3039298049099225, + "grad_norm": 3.7357325248458624, + "learning_rate": 8.16318821756084e-06, + "loss": 1.0248, + "step": 407 + }, + { + "epoch": 0.3046765611873425, + "grad_norm": 2.5737530837652383, + "learning_rate": 8.153806850932771e-06, + "loss": 1.0217, + "step": 408 + }, + { + "epoch": 0.3054233174647624, + "grad_norm": 3.1463228977712467, + "learning_rate": 8.144407009270939e-06, + "loss": 1.0029, + "step": 409 + }, + { + "epoch": 0.3061700737421824, + "grad_norm": 9.739807547203563, + "learning_rate": 8.134988747639719e-06, + "loss": 1.0793, + "step": 410 + }, + { + "epoch": 0.30691683001960235, + "grad_norm": 3.4794447208645187, + "learning_rate": 8.125552121211385e-06, + "loss": 1.0399, + "step": 411 + }, + { + "epoch": 0.3076635862970223, + "grad_norm": 3.3525431927586578, + "learning_rate": 8.116097185265793e-06, + "loss": 1.0431, + "step": 412 + }, + { + "epoch": 0.3084103425744423, + "grad_norm": 3.4084157252106837, + "learning_rate": 8.106623995190058e-06, + "loss": 1.0727, + "step": 413 + }, + { + "epoch": 0.3091570988518622, + "grad_norm": 3.476679438345711, + "learning_rate": 8.09713260647823e-06, + "loss": 1.1045, + "step": 414 + }, + { + "epoch": 0.3099038551292822, + "grad_norm": 3.5348116522716078, + "learning_rate": 8.08762307473096e-06, + "loss": 0.9603, + "step": 415 + }, + { + "epoch": 0.31065061140670214, + "grad_norm": 3.6167949527936254, + "learning_rate": 8.078095455655188e-06, + "loss": 1.0288, + "step": 416 + }, + { + "epoch": 0.3113973676841221, + "grad_norm": 3.0637330231202013, + "learning_rate": 8.068549805063806e-06, + "loss": 0.9912, + "step": 417 + }, + { + "epoch": 0.31214412396154206, + "grad_norm": 3.240446093775731, + "learning_rate": 8.058986178875337e-06, + "loss": 0.9449, + "step": 418 + }, + { + "epoch": 0.312890880238962, + "grad_norm": 3.5029259821878593, + "learning_rate": 8.0494046331136e-06, + "loss": 0.9791, + "step": 419 + }, + { + "epoch": 0.313637636516382, + "grad_norm": 4.857373088195226, + "learning_rate": 8.039805223907396e-06, + "loss": 1.0656, + "step": 420 + }, + { + "epoch": 0.313637636516382, + "eval_loss": 0.9795615673065186, + "eval_runtime": 160.5273, + "eval_samples_per_second": 112.311, + "eval_steps_per_second": 1.757, + "step": 420 + }, + { + "epoch": 0.3143843927938019, + "grad_norm": 3.621695917176158, + "learning_rate": 8.030188007490164e-06, + "loss": 0.9901, + "step": 421 + }, + { + "epoch": 0.31513114907122186, + "grad_norm": 3.134150159669535, + "learning_rate": 8.020553040199654e-06, + "loss": 1.017, + "step": 422 + }, + { + "epoch": 0.31587790534864185, + "grad_norm": 3.7042154643581395, + "learning_rate": 8.010900378477612e-06, + "loss": 1.0453, + "step": 423 + }, + { + "epoch": 0.3166246616260618, + "grad_norm": 2.91252648100696, + "learning_rate": 8.00123007886943e-06, + "loss": 1.0093, + "step": 424 + }, + { + "epoch": 0.3173714179034818, + "grad_norm": 3.211153472938924, + "learning_rate": 7.991542198023827e-06, + "loss": 0.9395, + "step": 425 + }, + { + "epoch": 0.3181181741809017, + "grad_norm": 3.2362263093734764, + "learning_rate": 7.981836792692508e-06, + "loss": 1.0098, + "step": 426 + }, + { + "epoch": 0.31886493045832165, + "grad_norm": 4.070963322234874, + "learning_rate": 7.97211391972984e-06, + "loss": 1.0552, + "step": 427 + }, + { + "epoch": 0.31961168673574164, + "grad_norm": 3.212324506648779, + "learning_rate": 7.962373636092517e-06, + "loss": 0.9705, + "step": 428 + }, + { + "epoch": 0.3203584430131616, + "grad_norm": 3.861278889833234, + "learning_rate": 7.952615998839222e-06, + "loss": 1.0376, + "step": 429 + }, + { + "epoch": 0.3211051992905815, + "grad_norm": 3.040788601978854, + "learning_rate": 7.942841065130296e-06, + "loss": 0.9765, + "step": 430 + }, + { + "epoch": 0.3218519555680015, + "grad_norm": 3.7318713732273934, + "learning_rate": 7.933048892227406e-06, + "loss": 1.0966, + "step": 431 + }, + { + "epoch": 0.32259871184542144, + "grad_norm": 3.0465602998502153, + "learning_rate": 7.923239537493204e-06, + "loss": 0.9536, + "step": 432 + }, + { + "epoch": 0.32334546812284143, + "grad_norm": 3.3796680686549117, + "learning_rate": 7.913413058390989e-06, + "loss": 1.1145, + "step": 433 + }, + { + "epoch": 0.32409222440026136, + "grad_norm": 3.6473790330312825, + "learning_rate": 7.903569512484383e-06, + "loss": 1.0295, + "step": 434 + }, + { + "epoch": 0.32409222440026136, + "eval_loss": 0.9785324335098267, + "eval_runtime": 160.554, + "eval_samples_per_second": 112.292, + "eval_steps_per_second": 1.756, + "step": 434 + }, + { + "epoch": 0.3248389806776813, + "grad_norm": 2.9530024474970236, + "learning_rate": 7.893708957436982e-06, + "loss": 1.0426, + "step": 435 + }, + { + "epoch": 0.3255857369551013, + "grad_norm": 3.766650384182543, + "learning_rate": 7.88383145101202e-06, + "loss": 1.0902, + "step": 436 + }, + { + "epoch": 0.3263324932325212, + "grad_norm": 3.0225531834819157, + "learning_rate": 7.873937051072037e-06, + "loss": 1.0197, + "step": 437 + }, + { + "epoch": 0.3270792495099412, + "grad_norm": 6.747688160994902, + "learning_rate": 7.864025815578524e-06, + "loss": 1.0138, + "step": 438 + }, + { + "epoch": 0.32782600578736115, + "grad_norm": 3.3502645696292466, + "learning_rate": 7.85409780259161e-06, + "loss": 1.026, + "step": 439 + }, + { + "epoch": 0.3285727620647811, + "grad_norm": 3.390696827000236, + "learning_rate": 7.844153070269697e-06, + "loss": 1.0403, + "step": 440 + }, + { + "epoch": 0.3293195183422011, + "grad_norm": 3.270199160958159, + "learning_rate": 7.834191676869135e-06, + "loss": 1.0221, + "step": 441 + }, + { + "epoch": 0.330066274619621, + "grad_norm": 3.361500760720287, + "learning_rate": 7.824213680743867e-06, + "loss": 0.9559, + "step": 442 + }, + { + "epoch": 0.330813030897041, + "grad_norm": 3.6775652760129125, + "learning_rate": 7.8142191403451e-06, + "loss": 1.0061, + "step": 443 + }, + { + "epoch": 0.33155978717446094, + "grad_norm": 2.9665469767448425, + "learning_rate": 7.80420811422096e-06, + "loss": 1.0567, + "step": 444 + }, + { + "epoch": 0.3323065434518809, + "grad_norm": 3.5075474240286377, + "learning_rate": 7.794180661016143e-06, + "loss": 1.0241, + "step": 445 + }, + { + "epoch": 0.33305329972930087, + "grad_norm": 3.1914109742785266, + "learning_rate": 7.784136839471573e-06, + "loss": 1.0758, + "step": 446 + }, + { + "epoch": 0.3338000560067208, + "grad_norm": 4.066852799813992, + "learning_rate": 7.774076708424062e-06, + "loss": 1.0315, + "step": 447 + }, + { + "epoch": 0.33454681228414074, + "grad_norm": 3.0279095506480496, + "learning_rate": 7.764000326805967e-06, + "loss": 0.9507, + "step": 448 + }, + { + "epoch": 0.33454681228414074, + "eval_loss": 0.9775028228759766, + "eval_runtime": 160.6104, + "eval_samples_per_second": 112.253, + "eval_steps_per_second": 1.756, + "step": 448 + }, + { + "epoch": 0.33529356856156073, + "grad_norm": 3.102821716169309, + "learning_rate": 7.753907753644835e-06, + "loss": 1.0906, + "step": 449 + }, + { + "epoch": 0.33604032483898066, + "grad_norm": 3.1371422748805644, + "learning_rate": 7.74379904806307e-06, + "loss": 1.038, + "step": 450 + }, + { + "epoch": 0.33678708111640066, + "grad_norm": 3.664718417806466, + "learning_rate": 7.733674269277572e-06, + "loss": 1.0063, + "step": 451 + }, + { + "epoch": 0.3375338373938206, + "grad_norm": 8.663230070425483, + "learning_rate": 7.7235334765994e-06, + "loss": 0.9922, + "step": 452 + }, + { + "epoch": 0.3382805936712405, + "grad_norm": 3.4498456909319053, + "learning_rate": 7.71337672943343e-06, + "loss": 0.929, + "step": 453 + }, + { + "epoch": 0.3390273499486605, + "grad_norm": 3.3294787014480924, + "learning_rate": 7.703204087277989e-06, + "loss": 1.025, + "step": 454 + }, + { + "epoch": 0.33977410622608045, + "grad_norm": 10.093120102912547, + "learning_rate": 7.693015609724524e-06, + "loss": 1.0889, + "step": 455 + }, + { + "epoch": 0.34052086250350044, + "grad_norm": 3.427826565448867, + "learning_rate": 7.682811356457245e-06, + "loss": 1.0335, + "step": 456 + }, + { + "epoch": 0.3412676187809204, + "grad_norm": 3.1300498945062705, + "learning_rate": 7.672591387252773e-06, + "loss": 0.9643, + "step": 457 + }, + { + "epoch": 0.3420143750583403, + "grad_norm": 4.834020434756831, + "learning_rate": 7.662355761979794e-06, + "loss": 1.0139, + "step": 458 + }, + { + "epoch": 0.3427611313357603, + "grad_norm": 3.206146987660176, + "learning_rate": 7.652104540598712e-06, + "loss": 0.9946, + "step": 459 + }, + { + "epoch": 0.34350788761318024, + "grad_norm": 3.0060430098945, + "learning_rate": 7.64183778316129e-06, + "loss": 0.9548, + "step": 460 + }, + { + "epoch": 0.34425464389060023, + "grad_norm": 2.700718919045169, + "learning_rate": 7.6315555498103e-06, + "loss": 1.0126, + "step": 461 + }, + { + "epoch": 0.34500140016802017, + "grad_norm": 4.606267437808485, + "learning_rate": 7.621257900779173e-06, + "loss": 1.014, + "step": 462 + }, + { + "epoch": 0.34500140016802017, + "eval_loss": 0.9751178026199341, + "eval_runtime": 162.1058, + "eval_samples_per_second": 111.217, + "eval_steps_per_second": 1.74, + "step": 462 + }, + { + "epoch": 0.3457481564454401, + "grad_norm": 3.8917302169214243, + "learning_rate": 7.610944896391644e-06, + "loss": 1.0268, + "step": 463 + }, + { + "epoch": 0.3464949127228601, + "grad_norm": 3.3196051557196586, + "learning_rate": 7.6006165970614045e-06, + "loss": 1.0196, + "step": 464 + }, + { + "epoch": 0.34724166900028003, + "grad_norm": 3.53901881155908, + "learning_rate": 7.5902730632917395e-06, + "loss": 0.9993, + "step": 465 + }, + { + "epoch": 0.34798842527769996, + "grad_norm": 3.0682482146093477, + "learning_rate": 7.579914355675177e-06, + "loss": 0.9473, + "step": 466 + }, + { + "epoch": 0.34873518155511996, + "grad_norm": 3.087914341379048, + "learning_rate": 7.569540534893139e-06, + "loss": 1.0363, + "step": 467 + }, + { + "epoch": 0.3494819378325399, + "grad_norm": 3.8282273155400426, + "learning_rate": 7.559151661715574e-06, + "loss": 0.9911, + "step": 468 + }, + { + "epoch": 0.3502286941099599, + "grad_norm": 3.7879619577438475, + "learning_rate": 7.548747797000611e-06, + "loss": 1.1124, + "step": 469 + }, + { + "epoch": 0.3509754503873798, + "grad_norm": 3.678911283929549, + "learning_rate": 7.5383290016942e-06, + "loss": 1.089, + "step": 470 + }, + { + "epoch": 0.35172220666479975, + "grad_norm": 3.153223573399342, + "learning_rate": 7.527895336829754e-06, + "loss": 0.9562, + "step": 471 + }, + { + "epoch": 0.35246896294221974, + "grad_norm": 4.016430707128402, + "learning_rate": 7.517446863527791e-06, + "loss": 1.0842, + "step": 472 + }, + { + "epoch": 0.3532157192196397, + "grad_norm": 3.016008204142368, + "learning_rate": 7.506983642995576e-06, + "loss": 0.9873, + "step": 473 + }, + { + "epoch": 0.35396247549705967, + "grad_norm": 3.138460161693774, + "learning_rate": 7.496505736526769e-06, + "loss": 1.0124, + "step": 474 + }, + { + "epoch": 0.3547092317744796, + "grad_norm": 2.8855105537883046, + "learning_rate": 7.486013205501053e-06, + "loss": 0.9669, + "step": 475 + }, + { + "epoch": 0.35545598805189954, + "grad_norm": 3.1177066824549597, + "learning_rate": 7.475506111383787e-06, + "loss": 0.9873, + "step": 476 + }, + { + "epoch": 0.35545598805189954, + "eval_loss": 0.9743978977203369, + "eval_runtime": 160.456, + "eval_samples_per_second": 112.361, + "eval_steps_per_second": 1.757, + "step": 476 + }, + { + "epoch": 0.35620274432931953, + "grad_norm": 3.0598520129588374, + "learning_rate": 7.464984515725638e-06, + "loss": 0.9891, + "step": 477 + }, + { + "epoch": 0.35694950060673947, + "grad_norm": 2.98269608960095, + "learning_rate": 7.454448480162226e-06, + "loss": 1.0347, + "step": 478 + }, + { + "epoch": 0.35769625688415946, + "grad_norm": 3.2757092754418107, + "learning_rate": 7.443898066413755e-06, + "loss": 1.1134, + "step": 479 + }, + { + "epoch": 0.3584430131615794, + "grad_norm": 3.846293749413655, + "learning_rate": 7.433333336284665e-06, + "loss": 1.0546, + "step": 480 + }, + { + "epoch": 0.35918976943899933, + "grad_norm": 3.0031396693349346, + "learning_rate": 7.422754351663252e-06, + "loss": 0.9413, + "step": 481 + }, + { + "epoch": 0.3599365257164193, + "grad_norm": 5.879276289496315, + "learning_rate": 7.412161174521321e-06, + "loss": 1.0645, + "step": 482 + }, + { + "epoch": 0.36068328199383926, + "grad_norm": 2.9630046791659304, + "learning_rate": 7.4015538669138144e-06, + "loss": 1.0015, + "step": 483 + }, + { + "epoch": 0.3614300382712592, + "grad_norm": 3.1007268346585417, + "learning_rate": 7.390932490978453e-06, + "loss": 0.9843, + "step": 484 + }, + { + "epoch": 0.3621767945486792, + "grad_norm": 3.5996328301501155, + "learning_rate": 7.3802971089353696e-06, + "loss": 1.0323, + "step": 485 + }, + { + "epoch": 0.3629235508260991, + "grad_norm": 3.6048723852155846, + "learning_rate": 7.369647783086742e-06, + "loss": 1.015, + "step": 486 + }, + { + "epoch": 0.3636703071035191, + "grad_norm": 2.7669538427602487, + "learning_rate": 7.358984575816437e-06, + "loss": 0.9157, + "step": 487 + }, + { + "epoch": 0.36441706338093904, + "grad_norm": 2.7712108102753126, + "learning_rate": 7.3483075495896296e-06, + "loss": 0.9782, + "step": 488 + }, + { + "epoch": 0.365163819658359, + "grad_norm": 3.151704417287714, + "learning_rate": 7.337616766952455e-06, + "loss": 0.9626, + "step": 489 + }, + { + "epoch": 0.36591057593577897, + "grad_norm": 3.5437250736566366, + "learning_rate": 7.326912290531634e-06, + "loss": 1.0085, + "step": 490 + }, + { + "epoch": 0.36591057593577897, + "eval_loss": 0.9715595841407776, + "eval_runtime": 160.5498, + "eval_samples_per_second": 112.295, + "eval_steps_per_second": 1.756, + "step": 490 + }, + { + "epoch": 0.3666573322131989, + "grad_norm": 3.774562387910661, + "learning_rate": 7.316194183034096e-06, + "loss": 1.087, + "step": 491 + }, + { + "epoch": 0.3674040884906189, + "grad_norm": 2.8372057210924746, + "learning_rate": 7.30546250724663e-06, + "loss": 0.9763, + "step": 492 + }, + { + "epoch": 0.36815084476803883, + "grad_norm": 2.560789631009045, + "learning_rate": 7.294717326035508e-06, + "loss": 0.9571, + "step": 493 + }, + { + "epoch": 0.36889760104545877, + "grad_norm": 3.5990199559087785, + "learning_rate": 7.283958702346111e-06, + "loss": 0.9785, + "step": 494 + }, + { + "epoch": 0.36964435732287876, + "grad_norm": 2.669595595250994, + "learning_rate": 7.273186699202572e-06, + "loss": 0.9195, + "step": 495 + }, + { + "epoch": 0.3703911136002987, + "grad_norm": 4.258528121719663, + "learning_rate": 7.262401379707401e-06, + "loss": 0.983, + "step": 496 + }, + { + "epoch": 0.3711378698777187, + "grad_norm": 9.165533012445232, + "learning_rate": 7.251602807041111e-06, + "loss": 1.0183, + "step": 497 + }, + { + "epoch": 0.3718846261551386, + "grad_norm": 5.239008418773379, + "learning_rate": 7.240791044461853e-06, + "loss": 1.0623, + "step": 498 + }, + { + "epoch": 0.37263138243255856, + "grad_norm": 4.43366736674882, + "learning_rate": 7.2299661553050474e-06, + "loss": 1.0722, + "step": 499 + }, + { + "epoch": 0.37337813870997855, + "grad_norm": 2.7329812801042426, + "learning_rate": 7.21912820298301e-06, + "loss": 1.0754, + "step": 500 + }, + { + "epoch": 0.3741248949873985, + "grad_norm": 3.716579945077537, + "learning_rate": 7.208277250984577e-06, + "loss": 1.0111, + "step": 501 + }, + { + "epoch": 0.3748716512648184, + "grad_norm": 3.196376735234458, + "learning_rate": 7.1974133628747435e-06, + "loss": 1.032, + "step": 502 + }, + { + "epoch": 0.3756184075422384, + "grad_norm": 3.418307936162068, + "learning_rate": 7.186536602294278e-06, + "loss": 0.9652, + "step": 503 + }, + { + "epoch": 0.37636516381965834, + "grad_norm": 3.0583787014333748, + "learning_rate": 7.175647032959358e-06, + "loss": 0.99, + "step": 504 + }, + { + "epoch": 0.37636516381965834, + "eval_loss": 0.970821738243103, + "eval_runtime": 160.5549, + "eval_samples_per_second": 112.292, + "eval_steps_per_second": 1.756, + "step": 504 + }, + { + "epoch": 0.37711192009707833, + "grad_norm": 3.9375426470992196, + "learning_rate": 7.164744718661198e-06, + "loss": 1.0763, + "step": 505 + }, + { + "epoch": 0.37785867637449827, + "grad_norm": 3.670004043591105, + "learning_rate": 7.153829723265666e-06, + "loss": 1.0512, + "step": 506 + }, + { + "epoch": 0.3786054326519182, + "grad_norm": 4.08797931465596, + "learning_rate": 7.142902110712925e-06, + "loss": 1.0742, + "step": 507 + }, + { + "epoch": 0.3793521889293382, + "grad_norm": 2.969955919693604, + "learning_rate": 7.131961945017041e-06, + "loss": 0.9621, + "step": 508 + }, + { + "epoch": 0.38009894520675813, + "grad_norm": 3.5178350581181874, + "learning_rate": 7.121009290265619e-06, + "loss": 0.9393, + "step": 509 + }, + { + "epoch": 0.3808457014841781, + "grad_norm": 3.307634748721632, + "learning_rate": 7.11004421061943e-06, + "loss": 0.9751, + "step": 510 + }, + { + "epoch": 0.38159245776159806, + "grad_norm": 3.0771731917177365, + "learning_rate": 7.099066770312023e-06, + "loss": 1.1015, + "step": 511 + }, + { + "epoch": 0.382339214039018, + "grad_norm": 3.0141374010446724, + "learning_rate": 7.088077033649359e-06, + "loss": 1.0432, + "step": 512 + }, + { + "epoch": 0.383085970316438, + "grad_norm": 3.1493573609897063, + "learning_rate": 7.0770750650094335e-06, + "loss": 1.0117, + "step": 513 + }, + { + "epoch": 0.3838327265938579, + "grad_norm": 2.5211640616380278, + "learning_rate": 7.066060928841891e-06, + "loss": 0.9526, + "step": 514 + }, + { + "epoch": 0.3845794828712779, + "grad_norm": 2.7231216253469874, + "learning_rate": 7.055034689667661e-06, + "loss": 0.962, + "step": 515 + }, + { + "epoch": 0.38532623914869785, + "grad_norm": 2.944911073727331, + "learning_rate": 7.0439964120785665e-06, + "loss": 0.9764, + "step": 516 + }, + { + "epoch": 0.3860729954261178, + "grad_norm": 4.058582096841929, + "learning_rate": 7.032946160736956e-06, + "loss": 1.0465, + "step": 517 + }, + { + "epoch": 0.3868197517035378, + "grad_norm": 3.3936349644680566, + "learning_rate": 7.021884000375315e-06, + "loss": 1.0345, + "step": 518 + }, + { + "epoch": 0.3868197517035378, + "eval_loss": 0.9695369005203247, + "eval_runtime": 162.0438, + "eval_samples_per_second": 111.26, + "eval_steps_per_second": 1.74, + "step": 518 + }, + { + "epoch": 0.3875665079809577, + "grad_norm": 3.0088080373888877, + "learning_rate": 7.010809995795897e-06, + "loss": 0.9921, + "step": 519 + }, + { + "epoch": 0.3883132642583777, + "grad_norm": 3.7876856086376463, + "learning_rate": 6.999724211870339e-06, + "loss": 1.0344, + "step": 520 + }, + { + "epoch": 0.38906002053579763, + "grad_norm": 3.554627765753071, + "learning_rate": 6.98862671353928e-06, + "loss": 1.0494, + "step": 521 + }, + { + "epoch": 0.38980677681321757, + "grad_norm": 3.6313523510559493, + "learning_rate": 6.977517565811977e-06, + "loss": 1.1087, + "step": 522 + }, + { + "epoch": 0.39055353309063756, + "grad_norm": 2.7107679507341143, + "learning_rate": 6.966396833765941e-06, + "loss": 0.9847, + "step": 523 + }, + { + "epoch": 0.3913002893680575, + "grad_norm": 3.1970196122685417, + "learning_rate": 6.955264582546536e-06, + "loss": 0.9458, + "step": 524 + }, + { + "epoch": 0.39204704564547743, + "grad_norm": 3.7052386420585766, + "learning_rate": 6.944120877366605e-06, + "loss": 0.9409, + "step": 525 + }, + { + "epoch": 0.3927938019228974, + "grad_norm": 2.8311602798021216, + "learning_rate": 6.932965783506089e-06, + "loss": 0.9862, + "step": 526 + }, + { + "epoch": 0.39354055820031736, + "grad_norm": 3.1398517884202954, + "learning_rate": 6.92179936631165e-06, + "loss": 0.9867, + "step": 527 + }, + { + "epoch": 0.39428731447773735, + "grad_norm": 3.2070435051142065, + "learning_rate": 6.910621691196274e-06, + "loss": 1.0831, + "step": 528 + }, + { + "epoch": 0.3950340707551573, + "grad_norm": 3.2754924049472924, + "learning_rate": 6.8994328236389006e-06, + "loss": 0.9178, + "step": 529 + }, + { + "epoch": 0.3957808270325772, + "grad_norm": 2.758911582337925, + "learning_rate": 6.888232829184035e-06, + "loss": 1.0161, + "step": 530 + }, + { + "epoch": 0.3965275833099972, + "grad_norm": 3.081654379594517, + "learning_rate": 6.8770217734413606e-06, + "loss": 0.9263, + "step": 531 + }, + { + "epoch": 0.39727433958741715, + "grad_norm": 3.2054417126811683, + "learning_rate": 6.8657997220853615e-06, + "loss": 0.9714, + "step": 532 + }, + { + "epoch": 0.39727433958741715, + "eval_loss": 0.9671504497528076, + "eval_runtime": 160.5647, + "eval_samples_per_second": 112.285, + "eval_steps_per_second": 1.756, + "step": 532 + }, + { + "epoch": 0.39802109586483714, + "grad_norm": 3.5709587722352683, + "learning_rate": 6.854566740854932e-06, + "loss": 1.0274, + "step": 533 + }, + { + "epoch": 0.3987678521422571, + "grad_norm": 2.8375890052138493, + "learning_rate": 6.843322895552995e-06, + "loss": 1.0538, + "step": 534 + }, + { + "epoch": 0.399514608419677, + "grad_norm": 3.8436688249330153, + "learning_rate": 6.832068252046116e-06, + "loss": 0.9943, + "step": 535 + }, + { + "epoch": 0.400261364697097, + "grad_norm": 3.0838296325254593, + "learning_rate": 6.820802876264112e-06, + "loss": 1.0524, + "step": 536 + }, + { + "epoch": 0.40100812097451694, + "grad_norm": 3.2520329037248814, + "learning_rate": 6.809526834199675e-06, + "loss": 0.9378, + "step": 537 + }, + { + "epoch": 0.4017548772519369, + "grad_norm": 3.437376705017015, + "learning_rate": 6.798240191907979e-06, + "loss": 0.9232, + "step": 538 + }, + { + "epoch": 0.40250163352935686, + "grad_norm": 3.8117996083043564, + "learning_rate": 6.786943015506292e-06, + "loss": 1.1646, + "step": 539 + }, + { + "epoch": 0.4032483898067768, + "grad_norm": 3.086449782517495, + "learning_rate": 6.775635371173595e-06, + "loss": 1.0376, + "step": 540 + }, + { + "epoch": 0.4039951460841968, + "grad_norm": 3.5594550947217605, + "learning_rate": 6.764317325150183e-06, + "loss": 1.0664, + "step": 541 + }, + { + "epoch": 0.4047419023616167, + "grad_norm": 2.929617271103529, + "learning_rate": 6.752988943737291e-06, + "loss": 0.9074, + "step": 542 + }, + { + "epoch": 0.40548865863903666, + "grad_norm": 3.4414325543869397, + "learning_rate": 6.7416502932967e-06, + "loss": 1.0103, + "step": 543 + }, + { + "epoch": 0.40623541491645665, + "grad_norm": 3.6632742569900927, + "learning_rate": 6.730301440250337e-06, + "loss": 1.0326, + "step": 544 + }, + { + "epoch": 0.4069821711938766, + "grad_norm": 2.8687836258205666, + "learning_rate": 6.718942451079911e-06, + "loss": 1.0152, + "step": 545 + }, + { + "epoch": 0.4077289274712966, + "grad_norm": 3.321912105204247, + "learning_rate": 6.707573392326493e-06, + "loss": 1.0539, + "step": 546 + }, + { + "epoch": 0.4077289274712966, + "eval_loss": 0.966361403465271, + "eval_runtime": 160.7022, + "eval_samples_per_second": 112.189, + "eval_steps_per_second": 1.755, + "step": 546 + }, + { + "epoch": 0.4084756837487165, + "grad_norm": 3.272095356187146, + "learning_rate": 6.6961943305901515e-06, + "loss": 1.0243, + "step": 547 + }, + { + "epoch": 0.40922244002613645, + "grad_norm": 4.464138677869332, + "learning_rate": 6.6848053325295525e-06, + "loss": 0.9455, + "step": 548 + }, + { + "epoch": 0.40996919630355644, + "grad_norm": 3.6980284220501702, + "learning_rate": 6.673406464861563e-06, + "loss": 0.968, + "step": 549 + }, + { + "epoch": 0.4107159525809764, + "grad_norm": 3.9712580585477326, + "learning_rate": 6.661997794360872e-06, + "loss": 1.0356, + "step": 550 + }, + { + "epoch": 0.41146270885839636, + "grad_norm": 3.6234064719129013, + "learning_rate": 6.65057938785959e-06, + "loss": 0.9782, + "step": 551 + }, + { + "epoch": 0.4122094651358163, + "grad_norm": 3.430576862937015, + "learning_rate": 6.639151312246863e-06, + "loss": 1.0377, + "step": 552 + }, + { + "epoch": 0.41295622141323624, + "grad_norm": 3.653367041079211, + "learning_rate": 6.62771363446848e-06, + "loss": 0.9338, + "step": 553 + }, + { + "epoch": 0.4137029776906562, + "grad_norm": 2.9772192093714778, + "learning_rate": 6.616266421526477e-06, + "loss": 0.9065, + "step": 554 + }, + { + "epoch": 0.41444973396807616, + "grad_norm": 5.845516786477963, + "learning_rate": 6.604809740478748e-06, + "loss": 1.0693, + "step": 555 + }, + { + "epoch": 0.41519649024549615, + "grad_norm": 3.306244287174315, + "learning_rate": 6.593343658438649e-06, + "loss": 1.0179, + "step": 556 + }, + { + "epoch": 0.4159432465229161, + "grad_norm": 3.092506060971144, + "learning_rate": 6.581868242574613e-06, + "loss": 1.0373, + "step": 557 + }, + { + "epoch": 0.416690002800336, + "grad_norm": 2.793320476040383, + "learning_rate": 6.570383560109745e-06, + "loss": 1.0184, + "step": 558 + }, + { + "epoch": 0.417436759077756, + "grad_norm": 3.4955390820801853, + "learning_rate": 6.558889678321436e-06, + "loss": 1.0392, + "step": 559 + }, + { + "epoch": 0.41818351535517595, + "grad_norm": 3.273061028251747, + "learning_rate": 6.547386664540968e-06, + "loss": 0.9409, + "step": 560 + }, + { + "epoch": 0.41818351535517595, + "eval_loss": 0.9662355780601501, + "eval_runtime": 160.6718, + "eval_samples_per_second": 112.21, + "eval_steps_per_second": 1.755, + "step": 560 + }, + { + "epoch": 0.4189302716325959, + "grad_norm": 4.087603010650717, + "learning_rate": 6.535874586153115e-06, + "loss": 1.0936, + "step": 561 + }, + { + "epoch": 0.4196770279100159, + "grad_norm": 3.1783778248379226, + "learning_rate": 6.524353510595754e-06, + "loss": 1.0416, + "step": 562 + }, + { + "epoch": 0.4204237841874358, + "grad_norm": 3.5032912207274247, + "learning_rate": 6.512823505359469e-06, + "loss": 0.9697, + "step": 563 + }, + { + "epoch": 0.4211705404648558, + "grad_norm": 2.924411630831047, + "learning_rate": 6.501284637987148e-06, + "loss": 0.9833, + "step": 564 + }, + { + "epoch": 0.42191729674227574, + "grad_norm": 3.292319469279369, + "learning_rate": 6.489736976073603e-06, + "loss": 0.9739, + "step": 565 + }, + { + "epoch": 0.4226640530196957, + "grad_norm": 2.8074608140919146, + "learning_rate": 6.4781805872651536e-06, + "loss": 1.0121, + "step": 566 + }, + { + "epoch": 0.42341080929711566, + "grad_norm": 3.8979341939201237, + "learning_rate": 6.466615539259252e-06, + "loss": 0.9085, + "step": 567 + }, + { + "epoch": 0.4241575655745356, + "grad_norm": 3.0105273821322642, + "learning_rate": 6.4550418998040686e-06, + "loss": 0.9803, + "step": 568 + }, + { + "epoch": 0.4249043218519556, + "grad_norm": 3.018386515792839, + "learning_rate": 6.443459736698106e-06, + "loss": 1.0349, + "step": 569 + }, + { + "epoch": 0.4256510781293755, + "grad_norm": 4.3297434955866105, + "learning_rate": 6.431869117789797e-06, + "loss": 0.9927, + "step": 570 + }, + { + "epoch": 0.42639783440679546, + "grad_norm": 3.6098279806564326, + "learning_rate": 6.4202701109771105e-06, + "loss": 1.0685, + "step": 571 + }, + { + "epoch": 0.42714459068421545, + "grad_norm": 3.27616752754168, + "learning_rate": 6.408662784207149e-06, + "loss": 0.9819, + "step": 572 + }, + { + "epoch": 0.4278913469616354, + "grad_norm": 2.755375498502825, + "learning_rate": 6.397047205475757e-06, + "loss": 0.9871, + "step": 573 + }, + { + "epoch": 0.4286381032390554, + "grad_norm": 2.978173123042259, + "learning_rate": 6.385423442827116e-06, + "loss": 0.9861, + "step": 574 + }, + { + "epoch": 0.4286381032390554, + "eval_loss": 0.9649081826210022, + "eval_runtime": 160.6682, + "eval_samples_per_second": 112.213, + "eval_steps_per_second": 1.755, + "step": 574 + }, + { + "epoch": 0.4293848595164753, + "grad_norm": 3.1552421177846997, + "learning_rate": 6.3737915643533484e-06, + "loss": 1.0343, + "step": 575 + }, + { + "epoch": 0.43013161579389525, + "grad_norm": 4.0712453485088345, + "learning_rate": 6.362151638194125e-06, + "loss": 0.9875, + "step": 576 + }, + { + "epoch": 0.43087837207131524, + "grad_norm": 3.6072794575911353, + "learning_rate": 6.3505037325362515e-06, + "loss": 0.9823, + "step": 577 + }, + { + "epoch": 0.4316251283487352, + "grad_norm": 3.74072340333252, + "learning_rate": 6.338847915613285e-06, + "loss": 1.0738, + "step": 578 + }, + { + "epoch": 0.4323718846261551, + "grad_norm": 3.9042759629608668, + "learning_rate": 6.327184255705123e-06, + "loss": 0.9318, + "step": 579 + }, + { + "epoch": 0.4331186409035751, + "grad_norm": 2.7602755638221805, + "learning_rate": 6.315512821137606e-06, + "loss": 0.9505, + "step": 580 + }, + { + "epoch": 0.43386539718099504, + "grad_norm": 3.663747781306997, + "learning_rate": 6.303833680282125e-06, + "loss": 0.9821, + "step": 581 + }, + { + "epoch": 0.43461215345841503, + "grad_norm": 4.012768339701778, + "learning_rate": 6.292146901555207e-06, + "loss": 1.0239, + "step": 582 + }, + { + "epoch": 0.43535890973583496, + "grad_norm": 3.6313447848071903, + "learning_rate": 6.280452553418126e-06, + "loss": 1.0401, + "step": 583 + }, + { + "epoch": 0.4361056660132549, + "grad_norm": 3.2153420258632255, + "learning_rate": 6.268750704376494e-06, + "loss": 1.0054, + "step": 584 + }, + { + "epoch": 0.4368524222906749, + "grad_norm": 3.4771984642840947, + "learning_rate": 6.257041422979871e-06, + "loss": 0.9913, + "step": 585 + }, + { + "epoch": 0.4375991785680948, + "grad_norm": 2.792906968691294, + "learning_rate": 6.245324777821346e-06, + "loss": 0.953, + "step": 586 + }, + { + "epoch": 0.4383459348455148, + "grad_norm": 3.1678763709553266, + "learning_rate": 6.233600837537153e-06, + "loss": 1.0841, + "step": 587 + }, + { + "epoch": 0.43909269112293475, + "grad_norm": 3.9018561243297136, + "learning_rate": 6.221869670806257e-06, + "loss": 0.9856, + "step": 588 + }, + { + "epoch": 0.43909269112293475, + "eval_loss": 0.9642106294631958, + "eval_runtime": 162.0199, + "eval_samples_per_second": 111.276, + "eval_steps_per_second": 1.741, + "step": 588 + }, + { + "epoch": 0.4398394474003547, + "grad_norm": 3.434760853329825, + "learning_rate": 6.210131346349953e-06, + "loss": 0.9926, + "step": 589 + }, + { + "epoch": 0.4405862036777747, + "grad_norm": 3.4084803668234738, + "learning_rate": 6.1983859329314745e-06, + "loss": 1.0572, + "step": 590 + }, + { + "epoch": 0.4413329599551946, + "grad_norm": 2.968509928932182, + "learning_rate": 6.186633499355576e-06, + "loss": 0.9848, + "step": 591 + }, + { + "epoch": 0.4420797162326146, + "grad_norm": 5.328332695240182, + "learning_rate": 6.174874114468132e-06, + "loss": 1.0451, + "step": 592 + }, + { + "epoch": 0.44282647251003454, + "grad_norm": 3.0263469144237143, + "learning_rate": 6.16310784715575e-06, + "loss": 0.971, + "step": 593 + }, + { + "epoch": 0.4435732287874545, + "grad_norm": 3.1339960008604826, + "learning_rate": 6.151334766345345e-06, + "loss": 1.0407, + "step": 594 + }, + { + "epoch": 0.44431998506487447, + "grad_norm": 3.0931627663439984, + "learning_rate": 6.139554941003747e-06, + "loss": 1.0377, + "step": 595 + }, + { + "epoch": 0.4450667413422944, + "grad_norm": 2.909763131231064, + "learning_rate": 6.127768440137298e-06, + "loss": 1.0148, + "step": 596 + }, + { + "epoch": 0.44581349761971434, + "grad_norm": 3.4356772897225287, + "learning_rate": 6.115975332791446e-06, + "loss": 0.9894, + "step": 597 + }, + { + "epoch": 0.44656025389713433, + "grad_norm": 2.8810376760097474, + "learning_rate": 6.104175688050336e-06, + "loss": 1.0024, + "step": 598 + }, + { + "epoch": 0.44730701017455426, + "grad_norm": 2.963947770292573, + "learning_rate": 6.092369575036411e-06, + "loss": 0.9927, + "step": 599 + }, + { + "epoch": 0.44805376645197426, + "grad_norm": 3.4408370155382513, + "learning_rate": 6.0805570629100075e-06, + "loss": 0.9892, + "step": 600 + }, + { + "epoch": 0.4488005227293942, + "grad_norm": 3.2679972430155235, + "learning_rate": 6.068738220868944e-06, + "loss": 1.0083, + "step": 601 + }, + { + "epoch": 0.4495472790068141, + "grad_norm": 3.3266341400886725, + "learning_rate": 6.056913118148122e-06, + "loss": 0.9947, + "step": 602 + }, + { + "epoch": 0.4495472790068141, + "eval_loss": 0.9621589183807373, + "eval_runtime": 162.1706, + "eval_samples_per_second": 111.173, + "eval_steps_per_second": 1.739, + "step": 602 + }, + { + "epoch": 0.4502940352842341, + "grad_norm": 2.760574168207489, + "learning_rate": 6.045081824019119e-06, + "loss": 1.0301, + "step": 603 + }, + { + "epoch": 0.45104079156165405, + "grad_norm": 4.395437647043472, + "learning_rate": 6.03324440778978e-06, + "loss": 0.9877, + "step": 604 + }, + { + "epoch": 0.45178754783907404, + "grad_norm": 2.9581029369662892, + "learning_rate": 6.021400938803813e-06, + "loss": 1.0625, + "step": 605 + }, + { + "epoch": 0.452534304116494, + "grad_norm": 4.9741175183125845, + "learning_rate": 6.009551486440387e-06, + "loss": 0.984, + "step": 606 + }, + { + "epoch": 0.4532810603939139, + "grad_norm": 3.083559757545901, + "learning_rate": 5.9976961201137155e-06, + "loss": 0.9369, + "step": 607 + }, + { + "epoch": 0.4540278166713339, + "grad_norm": 3.5027682562154014, + "learning_rate": 5.985834909272661e-06, + "loss": 1.0684, + "step": 608 + }, + { + "epoch": 0.45477457294875384, + "grad_norm": 4.142396452688926, + "learning_rate": 5.973967923400321e-06, + "loss": 1.0164, + "step": 609 + }, + { + "epoch": 0.45552132922617383, + "grad_norm": 2.8673317625362835, + "learning_rate": 5.9620952320136225e-06, + "loss": 0.9815, + "step": 610 + }, + { + "epoch": 0.45626808550359377, + "grad_norm": 3.021928936402679, + "learning_rate": 5.95021690466292e-06, + "loss": 1.0028, + "step": 611 + }, + { + "epoch": 0.4570148417810137, + "grad_norm": 3.2317785993970007, + "learning_rate": 5.938333010931578e-06, + "loss": 1.1046, + "step": 612 + }, + { + "epoch": 0.4577615980584337, + "grad_norm": 2.726268768685177, + "learning_rate": 5.926443620435572e-06, + "loss": 1.0048, + "step": 613 + }, + { + "epoch": 0.45850835433585363, + "grad_norm": 3.1064415558539706, + "learning_rate": 5.914548802823077e-06, + "loss": 0.9325, + "step": 614 + }, + { + "epoch": 0.4592551106132736, + "grad_norm": 2.6123605318850665, + "learning_rate": 5.902648627774059e-06, + "loss": 0.9743, + "step": 615 + }, + { + "epoch": 0.46000186689069356, + "grad_norm": 3.0579324009097517, + "learning_rate": 5.8907431649998695e-06, + "loss": 0.9612, + "step": 616 + }, + { + "epoch": 0.46000186689069356, + "eval_loss": 0.9606157541275024, + "eval_runtime": 160.5917, + "eval_samples_per_second": 112.266, + "eval_steps_per_second": 1.756, + "step": 616 + }, + { + "epoch": 0.4607486231681135, + "grad_norm": 3.772368202333684, + "learning_rate": 5.878832484242833e-06, + "loss": 1.0323, + "step": 617 + }, + { + "epoch": 0.4614953794455335, + "grad_norm": 3.0158939394506943, + "learning_rate": 5.866916655275846e-06, + "loss": 0.947, + "step": 618 + }, + { + "epoch": 0.4622421357229534, + "grad_norm": 2.828941271333206, + "learning_rate": 5.854995747901958e-06, + "loss": 0.9681, + "step": 619 + }, + { + "epoch": 0.46298889200037335, + "grad_norm": 5.490674513764256, + "learning_rate": 5.84306983195397e-06, + "loss": 1.0043, + "step": 620 + }, + { + "epoch": 0.46373564827779334, + "grad_norm": 3.6863195592071727, + "learning_rate": 5.831138977294025e-06, + "loss": 1.0246, + "step": 621 + }, + { + "epoch": 0.4644824045552133, + "grad_norm": 8.996745121039877, + "learning_rate": 5.819203253813194e-06, + "loss": 0.9893, + "step": 622 + }, + { + "epoch": 0.46522916083263327, + "grad_norm": 2.9938080204280606, + "learning_rate": 5.807262731431069e-06, + "loss": 0.9104, + "step": 623 + }, + { + "epoch": 0.4659759171100532, + "grad_norm": 3.2831084055526207, + "learning_rate": 5.795317480095361e-06, + "loss": 0.9939, + "step": 624 + }, + { + "epoch": 0.46672267338747314, + "grad_norm": 3.491567046308877, + "learning_rate": 5.783367569781474e-06, + "loss": 0.9589, + "step": 625 + }, + { + "epoch": 0.46746942966489313, + "grad_norm": 3.8464462248209936, + "learning_rate": 5.77141307049211e-06, + "loss": 0.9308, + "step": 626 + }, + { + "epoch": 0.46821618594231307, + "grad_norm": 3.518690870541806, + "learning_rate": 5.7594540522568495e-06, + "loss": 0.9067, + "step": 627 + }, + { + "epoch": 0.46896294221973306, + "grad_norm": 4.039900909656756, + "learning_rate": 5.7474905851317505e-06, + "loss": 0.9626, + "step": 628 + }, + { + "epoch": 0.469709698497153, + "grad_norm": 3.6025011295590863, + "learning_rate": 5.73552273919893e-06, + "loss": 0.9861, + "step": 629 + }, + { + "epoch": 0.47045645477457293, + "grad_norm": 4.493424057626318, + "learning_rate": 5.723550584566151e-06, + "loss": 0.9795, + "step": 630 + }, + { + "epoch": 0.47045645477457293, + "eval_loss": 0.9606096744537354, + "eval_runtime": 160.7417, + "eval_samples_per_second": 112.161, + "eval_steps_per_second": 1.754, + "step": 630 + }, + { + "epoch": 0.4712032110519929, + "grad_norm": 4.352468775772123, + "learning_rate": 5.711574191366427e-06, + "loss": 0.975, + "step": 631 + }, + { + "epoch": 0.47194996732941286, + "grad_norm": 4.037016067946189, + "learning_rate": 5.699593629757591e-06, + "loss": 1.0385, + "step": 632 + }, + { + "epoch": 0.47269672360683285, + "grad_norm": 2.9029927769111388, + "learning_rate": 5.6876089699219016e-06, + "loss": 1.0004, + "step": 633 + }, + { + "epoch": 0.4734434798842528, + "grad_norm": 6.004548534705236, + "learning_rate": 5.675620282065621e-06, + "loss": 0.9879, + "step": 634 + }, + { + "epoch": 0.4741902361616727, + "grad_norm": 3.391029290471578, + "learning_rate": 5.663627636418611e-06, + "loss": 1.064, + "step": 635 + }, + { + "epoch": 0.4749369924390927, + "grad_norm": 3.043307222730041, + "learning_rate": 5.651631103233914e-06, + "loss": 1.0092, + "step": 636 + }, + { + "epoch": 0.47568374871651264, + "grad_norm": 3.5752213883205366, + "learning_rate": 5.639630752787349e-06, + "loss": 0.9599, + "step": 637 + }, + { + "epoch": 0.4764305049939326, + "grad_norm": 3.1804375282053288, + "learning_rate": 5.627626655377094e-06, + "loss": 0.9713, + "step": 638 + }, + { + "epoch": 0.47717726127135257, + "grad_norm": 2.790112215997805, + "learning_rate": 5.6156188813232806e-06, + "loss": 0.9912, + "step": 639 + }, + { + "epoch": 0.4779240175487725, + "grad_norm": 3.246922721538881, + "learning_rate": 5.603607500967574e-06, + "loss": 0.963, + "step": 640 + }, + { + "epoch": 0.4786707738261925, + "grad_norm": 3.392869083327263, + "learning_rate": 5.591592584672767e-06, + "loss": 1.0094, + "step": 641 + }, + { + "epoch": 0.47941753010361243, + "grad_norm": 3.0902607501283668, + "learning_rate": 5.579574202822366e-06, + "loss": 0.9703, + "step": 642 + }, + { + "epoch": 0.48016428638103237, + "grad_norm": 2.974464180221207, + "learning_rate": 5.567552425820177e-06, + "loss": 0.9171, + "step": 643 + }, + { + "epoch": 0.48091104265845236, + "grad_norm": 2.821388532495256, + "learning_rate": 5.5555273240899e-06, + "loss": 0.9527, + "step": 644 + }, + { + "epoch": 0.48091104265845236, + "eval_loss": 0.9591115713119507, + "eval_runtime": 160.7107, + "eval_samples_per_second": 112.183, + "eval_steps_per_second": 1.755, + "step": 644 + }, + { + "epoch": 0.4816577989358723, + "grad_norm": 3.260572717739725, + "learning_rate": 5.543498968074704e-06, + "loss": 0.9426, + "step": 645 + }, + { + "epoch": 0.4824045552132923, + "grad_norm": 2.903284442683706, + "learning_rate": 5.531467428236827e-06, + "loss": 1.0116, + "step": 646 + }, + { + "epoch": 0.4831513114907122, + "grad_norm": 3.2692857499423456, + "learning_rate": 5.519432775057158e-06, + "loss": 1.0015, + "step": 647 + }, + { + "epoch": 0.48389806776813216, + "grad_norm": 3.5780325125931833, + "learning_rate": 5.507395079034816e-06, + "loss": 1.0247, + "step": 648 + }, + { + "epoch": 0.48464482404555215, + "grad_norm": 2.943060253168757, + "learning_rate": 5.4953544106867594e-06, + "loss": 0.8597, + "step": 649 + }, + { + "epoch": 0.4853915803229721, + "grad_norm": 3.7864502304307854, + "learning_rate": 5.4833108405473425e-06, + "loss": 0.9909, + "step": 650 + }, + { + "epoch": 0.4861383366003921, + "grad_norm": 3.34360958188669, + "learning_rate": 5.471264439167932e-06, + "loss": 1.0366, + "step": 651 + }, + { + "epoch": 0.486885092877812, + "grad_norm": 3.22674707296479, + "learning_rate": 5.45921527711647e-06, + "loss": 1.0114, + "step": 652 + }, + { + "epoch": 0.48763184915523194, + "grad_norm": 3.3442512460984632, + "learning_rate": 5.447163424977076e-06, + "loss": 0.9198, + "step": 653 + }, + { + "epoch": 0.48837860543265194, + "grad_norm": 3.4465116733186485, + "learning_rate": 5.4351089533496286e-06, + "loss": 0.9371, + "step": 654 + }, + { + "epoch": 0.48912536171007187, + "grad_norm": 3.6231881310417995, + "learning_rate": 5.423051932849348e-06, + "loss": 1.0369, + "step": 655 + }, + { + "epoch": 0.4898721179874918, + "grad_norm": 3.150691425065748, + "learning_rate": 5.410992434106387e-06, + "loss": 0.9355, + "step": 656 + }, + { + "epoch": 0.4906188742649118, + "grad_norm": 2.7706082715810365, + "learning_rate": 5.398930527765416e-06, + "loss": 0.9466, + "step": 657 + }, + { + "epoch": 0.49136563054233173, + "grad_norm": 2.9257751655736337, + "learning_rate": 5.386866284485212e-06, + "loss": 1.0128, + "step": 658 + }, + { + "epoch": 0.49136563054233173, + "eval_loss": 0.9578000903129578, + "eval_runtime": 160.6458, + "eval_samples_per_second": 112.228, + "eval_steps_per_second": 1.755, + "step": 658 + }, + { + "epoch": 0.4921123868197517, + "grad_norm": 4.572313571723131, + "learning_rate": 5.374799774938236e-06, + "loss": 1.0165, + "step": 659 + }, + { + "epoch": 0.49285914309717166, + "grad_norm": 3.254503964827434, + "learning_rate": 5.36273106981023e-06, + "loss": 0.961, + "step": 660 + }, + { + "epoch": 0.4936058993745916, + "grad_norm": 2.941336279988955, + "learning_rate": 5.350660239799795e-06, + "loss": 1.0056, + "step": 661 + }, + { + "epoch": 0.4943526556520116, + "grad_norm": 3.3699370670436863, + "learning_rate": 5.338587355617981e-06, + "loss": 0.9501, + "step": 662 + }, + { + "epoch": 0.4950994119294315, + "grad_norm": 3.1050200793960356, + "learning_rate": 5.326512487987871e-06, + "loss": 0.9597, + "step": 663 + }, + { + "epoch": 0.4958461682068515, + "grad_norm": 3.1508330708960584, + "learning_rate": 5.314435707644166e-06, + "loss": 0.9765, + "step": 664 + }, + { + "epoch": 0.49659292448427145, + "grad_norm": 3.5815299911038587, + "learning_rate": 5.3023570853327725e-06, + "loss": 0.9374, + "step": 665 + }, + { + "epoch": 0.4973396807616914, + "grad_norm": 2.8686531462477918, + "learning_rate": 5.290276691810388e-06, + "loss": 0.9601, + "step": 666 + }, + { + "epoch": 0.4980864370391114, + "grad_norm": 3.117674265246126, + "learning_rate": 5.278194597844083e-06, + "loss": 1.0391, + "step": 667 + }, + { + "epoch": 0.4988331933165313, + "grad_norm": 3.093494135534134, + "learning_rate": 5.266110874210893e-06, + "loss": 0.9477, + "step": 668 + }, + { + "epoch": 0.4995799495939513, + "grad_norm": 3.139075567298836, + "learning_rate": 5.2540255916974005e-06, + "loss": 1.0269, + "step": 669 + }, + { + "epoch": 0.5003267058713712, + "grad_norm": 3.3071077238436963, + "learning_rate": 5.241938821099313e-06, + "loss": 1.0532, + "step": 670 + }, + { + "epoch": 0.5010734621487912, + "grad_norm": 2.816646250547839, + "learning_rate": 5.229850633221063e-06, + "loss": 0.9031, + "step": 671 + }, + { + "epoch": 0.5018202184262112, + "grad_norm": 3.4889209023924144, + "learning_rate": 5.217761098875383e-06, + "loss": 0.9739, + "step": 672 + }, + { + "epoch": 0.5018202184262112, + "eval_loss": 0.9554553627967834, + "eval_runtime": 160.5922, + "eval_samples_per_second": 112.266, + "eval_steps_per_second": 1.756, + "step": 672 + }, + { + "epoch": 0.5025669747036311, + "grad_norm": 3.042163670813616, + "learning_rate": 5.205670288882889e-06, + "loss": 1.0226, + "step": 673 + }, + { + "epoch": 0.503313730981051, + "grad_norm": 3.5555861229297205, + "learning_rate": 5.19357827407168e-06, + "loss": 1.0199, + "step": 674 + }, + { + "epoch": 0.504060487258471, + "grad_norm": 2.6753462198941347, + "learning_rate": 5.181485125276898e-06, + "loss": 0.9951, + "step": 675 + }, + { + "epoch": 0.504807243535891, + "grad_norm": 3.430987000958468, + "learning_rate": 5.169390913340342e-06, + "loss": 0.9776, + "step": 676 + }, + { + "epoch": 0.505553999813311, + "grad_norm": 2.836448999390065, + "learning_rate": 5.157295709110031e-06, + "loss": 0.9318, + "step": 677 + }, + { + "epoch": 0.5063007560907309, + "grad_norm": 3.9055759931822704, + "learning_rate": 5.1451995834397975e-06, + "loss": 1.0254, + "step": 678 + }, + { + "epoch": 0.5070475123681508, + "grad_norm": 3.2262289649091387, + "learning_rate": 5.133102607188875e-06, + "loss": 1.0698, + "step": 679 + }, + { + "epoch": 0.5077942686455708, + "grad_norm": 6.111956461530283, + "learning_rate": 5.121004851221477e-06, + "loss": 0.9786, + "step": 680 + }, + { + "epoch": 0.5085410249229908, + "grad_norm": 3.4939998439456303, + "learning_rate": 5.108906386406385e-06, + "loss": 0.9358, + "step": 681 + }, + { + "epoch": 0.5092877812004107, + "grad_norm": 2.927278408485738, + "learning_rate": 5.096807283616535e-06, + "loss": 0.964, + "step": 682 + }, + { + "epoch": 0.5100345374778307, + "grad_norm": 3.476986083429208, + "learning_rate": 5.084707613728598e-06, + "loss": 0.9, + "step": 683 + }, + { + "epoch": 0.5107812937552506, + "grad_norm": 3.775237997290502, + "learning_rate": 5.0726074476225675e-06, + "loss": 0.969, + "step": 684 + }, + { + "epoch": 0.5115280500326705, + "grad_norm": 4.127873726003733, + "learning_rate": 5.060506856181342e-06, + "loss": 1.0825, + "step": 685 + }, + { + "epoch": 0.5122748063100906, + "grad_norm": 3.435131733296269, + "learning_rate": 5.0484059102903174e-06, + "loss": 0.979, + "step": 686 + }, + { + "epoch": 0.5122748063100906, + "eval_loss": 0.9553431272506714, + "eval_runtime": 160.5802, + "eval_samples_per_second": 112.274, + "eval_steps_per_second": 1.756, + "step": 686 + }, + { + "epoch": 0.5130215625875105, + "grad_norm": 3.339425996015394, + "learning_rate": 5.036304680836959e-06, + "loss": 1.0607, + "step": 687 + }, + { + "epoch": 0.5137683188649305, + "grad_norm": 4.076321482116909, + "learning_rate": 5.0242032387103974e-06, + "loss": 1.0518, + "step": 688 + }, + { + "epoch": 0.5145150751423504, + "grad_norm": 3.089030123276057, + "learning_rate": 5.01210165480101e-06, + "loss": 0.9193, + "step": 689 + }, + { + "epoch": 0.5152618314197703, + "grad_norm": 4.513388204954891, + "learning_rate": 5e-06, + "loss": 1.036, + "step": 690 + }, + { + "epoch": 0.5160085876971904, + "grad_norm": 3.369925467145746, + "learning_rate": 4.9878983451989904e-06, + "loss": 0.9661, + "step": 691 + }, + { + "epoch": 0.5167553439746103, + "grad_norm": 3.2836780434682993, + "learning_rate": 4.9757967612896025e-06, + "loss": 0.9677, + "step": 692 + }, + { + "epoch": 0.5175021002520303, + "grad_norm": 4.0775074108281215, + "learning_rate": 4.963695319163041e-06, + "loss": 1.0827, + "step": 693 + }, + { + "epoch": 0.5182488565294502, + "grad_norm": 3.325029804236844, + "learning_rate": 4.951594089709685e-06, + "loss": 1.0138, + "step": 694 + }, + { + "epoch": 0.5189956128068701, + "grad_norm": 3.5885563133061487, + "learning_rate": 4.939493143818659e-06, + "loss": 1.0677, + "step": 695 + }, + { + "epoch": 0.5197423690842902, + "grad_norm": 3.156282854149789, + "learning_rate": 4.927392552377434e-06, + "loss": 0.9617, + "step": 696 + }, + { + "epoch": 0.5204891253617101, + "grad_norm": 3.0416344384601026, + "learning_rate": 4.915292386271403e-06, + "loss": 0.9855, + "step": 697 + }, + { + "epoch": 0.52123588163913, + "grad_norm": 3.2733777511786095, + "learning_rate": 4.9031927163834655e-06, + "loss": 0.8939, + "step": 698 + }, + { + "epoch": 0.52198263791655, + "grad_norm": 3.398814579607184, + "learning_rate": 4.891093613593615e-06, + "loss": 0.9741, + "step": 699 + }, + { + "epoch": 0.5227293941939699, + "grad_norm": 3.1153916205245777, + "learning_rate": 4.878995148778525e-06, + "loss": 0.9968, + "step": 700 + }, + { + "epoch": 0.5227293941939699, + "eval_loss": 0.9541786313056946, + "eval_runtime": 162.0716, + "eval_samples_per_second": 111.241, + "eval_steps_per_second": 1.74, + "step": 700 + }, + { + "epoch": 0.5234761504713898, + "grad_norm": 3.3647452892177117, + "learning_rate": 4.866897392811127e-06, + "loss": 0.9962, + "step": 701 + }, + { + "epoch": 0.5242229067488099, + "grad_norm": 3.113204959768887, + "learning_rate": 4.854800416560205e-06, + "loss": 1.0054, + "step": 702 + }, + { + "epoch": 0.5249696630262298, + "grad_norm": 3.8495342835228583, + "learning_rate": 4.842704290889971e-06, + "loss": 0.8952, + "step": 703 + }, + { + "epoch": 0.5257164193036498, + "grad_norm": 3.174795087074365, + "learning_rate": 4.830609086659659e-06, + "loss": 0.9851, + "step": 704 + }, + { + "epoch": 0.5264631755810697, + "grad_norm": 2.729342539133593, + "learning_rate": 4.818514874723103e-06, + "loss": 0.9213, + "step": 705 + }, + { + "epoch": 0.5272099318584896, + "grad_norm": 3.076282518758111, + "learning_rate": 4.806421725928323e-06, + "loss": 0.9964, + "step": 706 + }, + { + "epoch": 0.5279566881359097, + "grad_norm": 3.2836882108192165, + "learning_rate": 4.7943297111171115e-06, + "loss": 0.9746, + "step": 707 + }, + { + "epoch": 0.5287034444133296, + "grad_norm": 3.8352591409956336, + "learning_rate": 4.782238901124618e-06, + "loss": 1.043, + "step": 708 + }, + { + "epoch": 0.5294502006907496, + "grad_norm": 2.7535052802015634, + "learning_rate": 4.770149366778938e-06, + "loss": 0.8983, + "step": 709 + }, + { + "epoch": 0.5301969569681695, + "grad_norm": 3.852480148863445, + "learning_rate": 4.758061178900687e-06, + "loss": 0.9914, + "step": 710 + }, + { + "epoch": 0.5309437132455894, + "grad_norm": 3.4470151143902337, + "learning_rate": 4.745974408302602e-06, + "loss": 0.9474, + "step": 711 + }, + { + "epoch": 0.5316904695230095, + "grad_norm": 2.5890177372536516, + "learning_rate": 4.7338891257891085e-06, + "loss": 0.9773, + "step": 712 + }, + { + "epoch": 0.5324372258004294, + "grad_norm": 4.222415888473242, + "learning_rate": 4.721805402155919e-06, + "loss": 1.0616, + "step": 713 + }, + { + "epoch": 0.5331839820778493, + "grad_norm": 3.0869687607345395, + "learning_rate": 4.709723308189614e-06, + "loss": 1.0079, + "step": 714 + }, + { + "epoch": 0.5331839820778493, + "eval_loss": 0.9527038931846619, + "eval_runtime": 160.8101, + "eval_samples_per_second": 112.114, + "eval_steps_per_second": 1.754, + "step": 714 + }, + { + "epoch": 0.5339307383552693, + "grad_norm": 3.670148127602683, + "learning_rate": 4.697642914667229e-06, + "loss": 1.0192, + "step": 715 + }, + { + "epoch": 0.5346774946326892, + "grad_norm": 3.107343398040105, + "learning_rate": 4.6855642923558345e-06, + "loss": 0.9228, + "step": 716 + }, + { + "epoch": 0.5354242509101093, + "grad_norm": 3.5996371455869367, + "learning_rate": 4.67348751201213e-06, + "loss": 1.0376, + "step": 717 + }, + { + "epoch": 0.5361710071875292, + "grad_norm": 3.1878765692346605, + "learning_rate": 4.661412644382021e-06, + "loss": 0.9582, + "step": 718 + }, + { + "epoch": 0.5369177634649491, + "grad_norm": 3.771190985798084, + "learning_rate": 4.649339760200206e-06, + "loss": 0.9596, + "step": 719 + }, + { + "epoch": 0.5376645197423691, + "grad_norm": 3.38014392455014, + "learning_rate": 4.637268930189772e-06, + "loss": 0.9532, + "step": 720 + }, + { + "epoch": 0.538411276019789, + "grad_norm": 3.8749251965793965, + "learning_rate": 4.625200225061765e-06, + "loss": 1.0393, + "step": 721 + }, + { + "epoch": 0.539158032297209, + "grad_norm": 3.9117960489662127, + "learning_rate": 4.61313371551479e-06, + "loss": 1.006, + "step": 722 + }, + { + "epoch": 0.539904788574629, + "grad_norm": 2.8086121993308857, + "learning_rate": 4.601069472234584e-06, + "loss": 0.9323, + "step": 723 + }, + { + "epoch": 0.5406515448520489, + "grad_norm": 2.7551273219778825, + "learning_rate": 4.589007565893615e-06, + "loss": 0.9317, + "step": 724 + }, + { + "epoch": 0.5413983011294689, + "grad_norm": 3.362198847742376, + "learning_rate": 4.576948067150655e-06, + "loss": 0.9358, + "step": 725 + }, + { + "epoch": 0.5421450574068888, + "grad_norm": 3.5974105806012235, + "learning_rate": 4.564891046650373e-06, + "loss": 1.0596, + "step": 726 + }, + { + "epoch": 0.5428918136843088, + "grad_norm": 3.552430736775191, + "learning_rate": 4.552836575022925e-06, + "loss": 0.9611, + "step": 727 + }, + { + "epoch": 0.5436385699617288, + "grad_norm": 2.573569543965171, + "learning_rate": 4.540784722883532e-06, + "loss": 0.9552, + "step": 728 + }, + { + "epoch": 0.5436385699617288, + "eval_loss": 0.951053261756897, + "eval_runtime": 160.6105, + "eval_samples_per_second": 112.253, + "eval_steps_per_second": 1.756, + "step": 728 + }, + { + "epoch": 0.5443853262391487, + "grad_norm": 3.565425106205681, + "learning_rate": 4.528735560832071e-06, + "loss": 1.0667, + "step": 729 + }, + { + "epoch": 0.5451320825165686, + "grad_norm": 2.7926401591953383, + "learning_rate": 4.51668915945266e-06, + "loss": 0.976, + "step": 730 + }, + { + "epoch": 0.5458788387939886, + "grad_norm": 3.0421797900802003, + "learning_rate": 4.504645589313243e-06, + "loss": 1.0462, + "step": 731 + }, + { + "epoch": 0.5466255950714086, + "grad_norm": 3.503454575591918, + "learning_rate": 4.492604920965185e-06, + "loss": 1.0857, + "step": 732 + }, + { + "epoch": 0.5473723513488286, + "grad_norm": 2.904815956092458, + "learning_rate": 4.480567224942845e-06, + "loss": 1.0072, + "step": 733 + }, + { + "epoch": 0.5481191076262485, + "grad_norm": 3.5102905705105742, + "learning_rate": 4.468532571763174e-06, + "loss": 0.9472, + "step": 734 + }, + { + "epoch": 0.5488658639036684, + "grad_norm": 3.7183800704590033, + "learning_rate": 4.456501031925297e-06, + "loss": 1.0463, + "step": 735 + }, + { + "epoch": 0.5496126201810884, + "grad_norm": 2.9419058702603555, + "learning_rate": 4.444472675910103e-06, + "loss": 1.031, + "step": 736 + }, + { + "epoch": 0.5503593764585084, + "grad_norm": 3.9094266797244877, + "learning_rate": 4.4324475741798235e-06, + "loss": 0.9969, + "step": 737 + }, + { + "epoch": 0.5511061327359283, + "grad_norm": 3.20510351193743, + "learning_rate": 4.420425797177637e-06, + "loss": 1.0058, + "step": 738 + }, + { + "epoch": 0.5518528890133483, + "grad_norm": 2.862584885372328, + "learning_rate": 4.4084074153272346e-06, + "loss": 0.9266, + "step": 739 + }, + { + "epoch": 0.5525996452907682, + "grad_norm": 3.230765906011071, + "learning_rate": 4.396392499032428e-06, + "loss": 0.928, + "step": 740 + }, + { + "epoch": 0.5533464015681882, + "grad_norm": 7.447272293937854, + "learning_rate": 4.38438111867672e-06, + "loss": 1.0577, + "step": 741 + }, + { + "epoch": 0.5540931578456081, + "grad_norm": 4.421418489574816, + "learning_rate": 4.372373344622906e-06, + "loss": 1.0461, + "step": 742 + }, + { + "epoch": 0.5540931578456081, + "eval_loss": 0.9515445232391357, + "eval_runtime": 162.0634, + "eval_samples_per_second": 111.247, + "eval_steps_per_second": 1.74, + "step": 742 + }, + { + "epoch": 0.5548399141230281, + "grad_norm": 2.7712010504789544, + "learning_rate": 4.360369247212653e-06, + "loss": 0.988, + "step": 743 + }, + { + "epoch": 0.5555866704004481, + "grad_norm": 2.8683975287912435, + "learning_rate": 4.3483688967660875e-06, + "loss": 0.8898, + "step": 744 + }, + { + "epoch": 0.556333426677868, + "grad_norm": 3.3419411279482003, + "learning_rate": 4.336372363581391e-06, + "loss": 1.0419, + "step": 745 + }, + { + "epoch": 0.5570801829552879, + "grad_norm": 3.422260194867919, + "learning_rate": 4.3243797179343795e-06, + "loss": 0.9626, + "step": 746 + }, + { + "epoch": 0.5578269392327079, + "grad_norm": 3.932467663326973, + "learning_rate": 4.3123910300781e-06, + "loss": 1.0097, + "step": 747 + }, + { + "epoch": 0.5585736955101279, + "grad_norm": 3.220509235991095, + "learning_rate": 4.300406370242409e-06, + "loss": 0.9915, + "step": 748 + }, + { + "epoch": 0.5593204517875479, + "grad_norm": 3.1936031478949873, + "learning_rate": 4.2884258086335755e-06, + "loss": 0.958, + "step": 749 + }, + { + "epoch": 0.5600672080649678, + "grad_norm": 3.1854522468121935, + "learning_rate": 4.276449415433851e-06, + "loss": 0.9964, + "step": 750 + }, + { + "epoch": 0.5608139643423877, + "grad_norm": 2.6714557730618553, + "learning_rate": 4.264477260801072e-06, + "loss": 1.0101, + "step": 751 + }, + { + "epoch": 0.5615607206198077, + "grad_norm": 3.647313021227066, + "learning_rate": 4.25250941486825e-06, + "loss": 0.9056, + "step": 752 + }, + { + "epoch": 0.5623074768972277, + "grad_norm": 3.3124446614726826, + "learning_rate": 4.2405459477431505e-06, + "loss": 0.9657, + "step": 753 + }, + { + "epoch": 0.5630542331746476, + "grad_norm": 3.4799141929813904, + "learning_rate": 4.228586929507892e-06, + "loss": 0.9853, + "step": 754 + }, + { + "epoch": 0.5638009894520676, + "grad_norm": 3.506153943735322, + "learning_rate": 4.216632430218528e-06, + "loss": 0.9887, + "step": 755 + }, + { + "epoch": 0.5645477457294875, + "grad_norm": 2.5744780478609335, + "learning_rate": 4.204682519904641e-06, + "loss": 0.936, + "step": 756 + }, + { + "epoch": 0.5645477457294875, + "eval_loss": 0.9495623707771301, + "eval_runtime": 160.7518, + "eval_samples_per_second": 112.154, + "eval_steps_per_second": 1.754, + "step": 756 + }, + { + "epoch": 0.5652945020069075, + "grad_norm": 3.098794189486562, + "learning_rate": 4.1927372685689315e-06, + "loss": 0.9661, + "step": 757 + }, + { + "epoch": 0.5660412582843275, + "grad_norm": 3.4619013278607578, + "learning_rate": 4.180796746186808e-06, + "loss": 0.9115, + "step": 758 + }, + { + "epoch": 0.5667880145617474, + "grad_norm": 3.5592551507334433, + "learning_rate": 4.168861022705976e-06, + "loss": 1.0279, + "step": 759 + }, + { + "epoch": 0.5675347708391674, + "grad_norm": 3.57134196506803, + "learning_rate": 4.1569301680460304e-06, + "loss": 0.9903, + "step": 760 + }, + { + "epoch": 0.5682815271165873, + "grad_norm": 2.9011475792179664, + "learning_rate": 4.145004252098044e-06, + "loss": 0.9266, + "step": 761 + }, + { + "epoch": 0.5690282833940072, + "grad_norm": 3.3121205852213564, + "learning_rate": 4.133083344724156e-06, + "loss": 0.9647, + "step": 762 + }, + { + "epoch": 0.5697750396714273, + "grad_norm": 4.324275423359752, + "learning_rate": 4.121167515757168e-06, + "loss": 0.926, + "step": 763 + }, + { + "epoch": 0.5705217959488472, + "grad_norm": 2.8518532935568475, + "learning_rate": 4.109256835000132e-06, + "loss": 1.0484, + "step": 764 + }, + { + "epoch": 0.5712685522262672, + "grad_norm": 2.8603023862843737, + "learning_rate": 4.097351372225943e-06, + "loss": 0.9625, + "step": 765 + }, + { + "epoch": 0.5720153085036871, + "grad_norm": 3.2495297372313936, + "learning_rate": 4.085451197176924e-06, + "loss": 0.9934, + "step": 766 + }, + { + "epoch": 0.572762064781107, + "grad_norm": 2.9168937333210736, + "learning_rate": 4.073556379564429e-06, + "loss": 0.9692, + "step": 767 + }, + { + "epoch": 0.5735088210585271, + "grad_norm": 3.4739754957829994, + "learning_rate": 4.061666989068423e-06, + "loss": 1.0596, + "step": 768 + }, + { + "epoch": 0.574255577335947, + "grad_norm": 2.524612817650384, + "learning_rate": 4.049783095337081e-06, + "loss": 0.934, + "step": 769 + }, + { + "epoch": 0.575002333613367, + "grad_norm": 3.1228879201643713, + "learning_rate": 4.037904767986378e-06, + "loss": 0.9707, + "step": 770 + }, + { + "epoch": 0.575002333613367, + "eval_loss": 0.9477403163909912, + "eval_runtime": 162.0633, + "eval_samples_per_second": 111.247, + "eval_steps_per_second": 1.74, + "step": 770 + }, + { + "epoch": 0.5757490898907869, + "grad_norm": 3.257360843044048, + "learning_rate": 4.026032076599681e-06, + "loss": 0.9881, + "step": 771 + }, + { + "epoch": 0.5764958461682068, + "grad_norm": 4.000425859190567, + "learning_rate": 4.014165090727341e-06, + "loss": 0.9536, + "step": 772 + }, + { + "epoch": 0.5772426024456269, + "grad_norm": 2.753962998604683, + "learning_rate": 4.002303879886288e-06, + "loss": 0.9246, + "step": 773 + }, + { + "epoch": 0.5779893587230468, + "grad_norm": 3.7186990398787967, + "learning_rate": 3.990448513559615e-06, + "loss": 1.0088, + "step": 774 + }, + { + "epoch": 0.5787361150004667, + "grad_norm": 3.158075187326005, + "learning_rate": 3.978599061196188e-06, + "loss": 0.9959, + "step": 775 + }, + { + "epoch": 0.5794828712778867, + "grad_norm": 4.303108732794814, + "learning_rate": 3.9667555922102214e-06, + "loss": 1.0725, + "step": 776 + }, + { + "epoch": 0.5802296275553066, + "grad_norm": 2.870021324090936, + "learning_rate": 3.954918175980882e-06, + "loss": 0.924, + "step": 777 + }, + { + "epoch": 0.5809763838327265, + "grad_norm": 4.264645613049714, + "learning_rate": 3.9430868818518786e-06, + "loss": 0.9837, + "step": 778 + }, + { + "epoch": 0.5817231401101466, + "grad_norm": 3.3495172772032986, + "learning_rate": 3.931261779131058e-06, + "loss": 0.9841, + "step": 779 + }, + { + "epoch": 0.5824698963875665, + "grad_norm": 3.246610836443149, + "learning_rate": 3.919442937089996e-06, + "loss": 0.9947, + "step": 780 + }, + { + "epoch": 0.5832166526649865, + "grad_norm": 2.7004006268478444, + "learning_rate": 3.9076304249635905e-06, + "loss": 0.9138, + "step": 781 + }, + { + "epoch": 0.5839634089424064, + "grad_norm": 2.7834796179870325, + "learning_rate": 3.895824311949665e-06, + "loss": 1.0329, + "step": 782 + }, + { + "epoch": 0.5847101652198263, + "grad_norm": 2.8998183145518124, + "learning_rate": 3.884024667208556e-06, + "loss": 1.0747, + "step": 783 + }, + { + "epoch": 0.5854569214972464, + "grad_norm": 3.5440597227303194, + "learning_rate": 3.872231559862702e-06, + "loss": 0.9257, + "step": 784 + }, + { + "epoch": 0.5854569214972464, + "eval_loss": 0.9469641447067261, + "eval_runtime": 160.547, + "eval_samples_per_second": 112.297, + "eval_steps_per_second": 1.756, + "step": 784 + }, + { + "epoch": 0.5862036777746663, + "grad_norm": 3.3382847763731767, + "learning_rate": 3.860445058996255e-06, + "loss": 0.9998, + "step": 785 + }, + { + "epoch": 0.5869504340520862, + "grad_norm": 2.6762939914119226, + "learning_rate": 3.848665233654658e-06, + "loss": 0.9951, + "step": 786 + }, + { + "epoch": 0.5876971903295062, + "grad_norm": 2.4239663023709093, + "learning_rate": 3.836892152844251e-06, + "loss": 1.0196, + "step": 787 + }, + { + "epoch": 0.5884439466069261, + "grad_norm": 2.7049762394604313, + "learning_rate": 3.825125885531869e-06, + "loss": 0.9878, + "step": 788 + }, + { + "epoch": 0.5891907028843462, + "grad_norm": 3.486650028911791, + "learning_rate": 3.813366500644426e-06, + "loss": 0.977, + "step": 789 + }, + { + "epoch": 0.5899374591617661, + "grad_norm": 3.9630866861155116, + "learning_rate": 3.8016140670685263e-06, + "loss": 0.9626, + "step": 790 + }, + { + "epoch": 0.590684215439186, + "grad_norm": 4.033104870432513, + "learning_rate": 3.789868653650046e-06, + "loss": 0.9761, + "step": 791 + }, + { + "epoch": 0.591430971716606, + "grad_norm": 3.346871826227189, + "learning_rate": 3.7781303291937453e-06, + "loss": 1.0169, + "step": 792 + }, + { + "epoch": 0.5921777279940259, + "grad_norm": 3.936340048603582, + "learning_rate": 3.7663991624628495e-06, + "loss": 0.8851, + "step": 793 + }, + { + "epoch": 0.592924484271446, + "grad_norm": 3.6485292207922058, + "learning_rate": 3.7546752221786553e-06, + "loss": 0.8865, + "step": 794 + }, + { + "epoch": 0.5936712405488659, + "grad_norm": 3.6264257513729343, + "learning_rate": 3.7429585770201314e-06, + "loss": 0.9835, + "step": 795 + }, + { + "epoch": 0.5944179968262858, + "grad_norm": 4.573602430354129, + "learning_rate": 3.7312492956235058e-06, + "loss": 1.0293, + "step": 796 + }, + { + "epoch": 0.5951647531037058, + "grad_norm": 2.85790348242624, + "learning_rate": 3.719547446581876e-06, + "loss": 0.976, + "step": 797 + }, + { + "epoch": 0.5959115093811257, + "grad_norm": 2.8881328636872623, + "learning_rate": 3.7078530984447956e-06, + "loss": 0.9718, + "step": 798 + }, + { + "epoch": 0.5959115093811257, + "eval_loss": 0.9462713599205017, + "eval_runtime": 160.8542, + "eval_samples_per_second": 112.083, + "eval_steps_per_second": 1.753, + "step": 798 + }, + { + "epoch": 0.5966582656585457, + "grad_norm": 3.2246121361724254, + "learning_rate": 3.6961663197178767e-06, + "loss": 0.8918, + "step": 799 + }, + { + "epoch": 0.5974050219359657, + "grad_norm": 4.065115346169568, + "learning_rate": 3.6844871788623946e-06, + "loss": 0.9987, + "step": 800 + }, + { + "epoch": 0.5981517782133856, + "grad_norm": 3.1428749321319236, + "learning_rate": 3.6728157442948786e-06, + "loss": 1.0329, + "step": 801 + }, + { + "epoch": 0.5988985344908055, + "grad_norm": 3.2108025912704763, + "learning_rate": 3.6611520843867155e-06, + "loss": 0.9817, + "step": 802 + }, + { + "epoch": 0.5996452907682255, + "grad_norm": 3.3376030924698674, + "learning_rate": 3.649496267463749e-06, + "loss": 0.9244, + "step": 803 + }, + { + "epoch": 0.6003920470456455, + "grad_norm": 2.797596396920881, + "learning_rate": 3.6378483618058774e-06, + "loss": 0.9516, + "step": 804 + }, + { + "epoch": 0.6011388033230655, + "grad_norm": 2.6634774502196605, + "learning_rate": 3.626208435646652e-06, + "loss": 0.9045, + "step": 805 + }, + { + "epoch": 0.6018855596004854, + "grad_norm": 4.020178611133311, + "learning_rate": 3.6145765571728863e-06, + "loss": 0.997, + "step": 806 + }, + { + "epoch": 0.6026323158779053, + "grad_norm": 3.4517830596099497, + "learning_rate": 3.6029527945242436e-06, + "loss": 0.9844, + "step": 807 + }, + { + "epoch": 0.6033790721553253, + "grad_norm": 3.408570855279641, + "learning_rate": 3.5913372157928515e-06, + "loss": 0.9571, + "step": 808 + }, + { + "epoch": 0.6041258284327453, + "grad_norm": 3.102469639035291, + "learning_rate": 3.5797298890228903e-06, + "loss": 0.9399, + "step": 809 + }, + { + "epoch": 0.6048725847101653, + "grad_norm": 4.116854993159461, + "learning_rate": 3.5681308822102046e-06, + "loss": 1.0615, + "step": 810 + }, + { + "epoch": 0.6056193409875852, + "grad_norm": 3.3466880091581372, + "learning_rate": 3.5565402633018963e-06, + "loss": 1.0105, + "step": 811 + }, + { + "epoch": 0.6063660972650051, + "grad_norm": 2.8545068302805126, + "learning_rate": 3.5449581001959327e-06, + "loss": 1.0286, + "step": 812 + }, + { + "epoch": 0.6063660972650051, + "eval_loss": 0.9455689191818237, + "eval_runtime": 160.9732, + "eval_samples_per_second": 112.0, + "eval_steps_per_second": 1.752, + "step": 812 + }, + { + "epoch": 0.6071128535424251, + "grad_norm": 3.155802958052963, + "learning_rate": 3.5333844607407497e-06, + "loss": 0.924, + "step": 813 + }, + { + "epoch": 0.607859609819845, + "grad_norm": 2.9985600832229333, + "learning_rate": 3.521819412734846e-06, + "loss": 1.0208, + "step": 814 + }, + { + "epoch": 0.608606366097265, + "grad_norm": 3.456904877508385, + "learning_rate": 3.5102630239263986e-06, + "loss": 0.9405, + "step": 815 + }, + { + "epoch": 0.609353122374685, + "grad_norm": 3.506919400122488, + "learning_rate": 3.4987153620128534e-06, + "loss": 1.0131, + "step": 816 + }, + { + "epoch": 0.6100998786521049, + "grad_norm": 3.562709368000536, + "learning_rate": 3.487176494640533e-06, + "loss": 1.0204, + "step": 817 + }, + { + "epoch": 0.6108466349295248, + "grad_norm": 3.029728546928659, + "learning_rate": 3.4756464894042475e-06, + "loss": 0.9355, + "step": 818 + }, + { + "epoch": 0.6115933912069448, + "grad_norm": 4.701806928246403, + "learning_rate": 3.464125413846886e-06, + "loss": 1.0966, + "step": 819 + }, + { + "epoch": 0.6123401474843648, + "grad_norm": 4.345487604862701, + "learning_rate": 3.4526133354590342e-06, + "loss": 1.0304, + "step": 820 + }, + { + "epoch": 0.6130869037617848, + "grad_norm": 2.9882707953224097, + "learning_rate": 3.4411103216785645e-06, + "loss": 1.0434, + "step": 821 + }, + { + "epoch": 0.6138336600392047, + "grad_norm": 3.6824324317588997, + "learning_rate": 3.4296164398902576e-06, + "loss": 1.0781, + "step": 822 + }, + { + "epoch": 0.6145804163166246, + "grad_norm": 3.1022094976858012, + "learning_rate": 3.418131757425389e-06, + "loss": 0.9307, + "step": 823 + }, + { + "epoch": 0.6153271725940446, + "grad_norm": 6.184516709697887, + "learning_rate": 3.4066563415613523e-06, + "loss": 0.9767, + "step": 824 + }, + { + "epoch": 0.6160739288714646, + "grad_norm": 3.18927582757545, + "learning_rate": 3.395190259521254e-06, + "loss": 0.9867, + "step": 825 + }, + { + "epoch": 0.6168206851488846, + "grad_norm": 3.103890715286569, + "learning_rate": 3.3837335784735244e-06, + "loss": 0.9872, + "step": 826 + }, + { + "epoch": 0.6168206851488846, + "eval_loss": 0.9444602727890015, + "eval_runtime": 160.7087, + "eval_samples_per_second": 112.184, + "eval_steps_per_second": 1.755, + "step": 826 + }, + { + "epoch": 0.6175674414263045, + "grad_norm": 4.650015908882197, + "learning_rate": 3.37228636553152e-06, + "loss": 0.9987, + "step": 827 + }, + { + "epoch": 0.6183141977037244, + "grad_norm": 3.5399264590081265, + "learning_rate": 3.360848687753138e-06, + "loss": 0.9564, + "step": 828 + }, + { + "epoch": 0.6190609539811444, + "grad_norm": 3.0564069474101756, + "learning_rate": 3.349420612140412e-06, + "loss": 0.936, + "step": 829 + }, + { + "epoch": 0.6198077102585644, + "grad_norm": 3.6031607198080886, + "learning_rate": 3.33800220563913e-06, + "loss": 0.9446, + "step": 830 + }, + { + "epoch": 0.6205544665359843, + "grad_norm": 3.4094903377690953, + "learning_rate": 3.3265935351384386e-06, + "loss": 0.9977, + "step": 831 + }, + { + "epoch": 0.6213012228134043, + "grad_norm": 2.848149256988196, + "learning_rate": 3.3151946674704487e-06, + "loss": 0.9697, + "step": 832 + }, + { + "epoch": 0.6220479790908242, + "grad_norm": 3.0079233857559977, + "learning_rate": 3.3038056694098485e-06, + "loss": 0.9622, + "step": 833 + }, + { + "epoch": 0.6227947353682441, + "grad_norm": 3.518051937950098, + "learning_rate": 3.2924266076735094e-06, + "loss": 0.9171, + "step": 834 + }, + { + "epoch": 0.6235414916456642, + "grad_norm": 3.3430473288801013, + "learning_rate": 3.281057548920091e-06, + "loss": 0.9487, + "step": 835 + }, + { + "epoch": 0.6242882479230841, + "grad_norm": 3.258832229112217, + "learning_rate": 3.2696985597496633e-06, + "loss": 0.9402, + "step": 836 + }, + { + "epoch": 0.6250350042005041, + "grad_norm": 4.025430224245148, + "learning_rate": 3.258349706703302e-06, + "loss": 1.035, + "step": 837 + }, + { + "epoch": 0.625781760477924, + "grad_norm": 3.6604686310111374, + "learning_rate": 3.24701105626271e-06, + "loss": 0.9754, + "step": 838 + }, + { + "epoch": 0.6265285167553439, + "grad_norm": 3.25423601241418, + "learning_rate": 3.2356826748498182e-06, + "loss": 0.984, + "step": 839 + }, + { + "epoch": 0.627275273032764, + "grad_norm": 3.28584168969409, + "learning_rate": 3.2243646288264073e-06, + "loss": 0.9791, + "step": 840 + }, + { + "epoch": 0.627275273032764, + "eval_loss": 0.9437100291252136, + "eval_runtime": 160.7821, + "eval_samples_per_second": 112.133, + "eval_steps_per_second": 1.754, + "step": 840 + }, + { + "epoch": 0.6280220293101839, + "grad_norm": 4.366912401633733, + "learning_rate": 3.2130569844937097e-06, + "loss": 0.9599, + "step": 841 + }, + { + "epoch": 0.6287687855876039, + "grad_norm": 2.9219395535212587, + "learning_rate": 3.2017598080920224e-06, + "loss": 0.9939, + "step": 842 + }, + { + "epoch": 0.6295155418650238, + "grad_norm": 3.786606223724064, + "learning_rate": 3.1904731658003264e-06, + "loss": 0.9879, + "step": 843 + }, + { + "epoch": 0.6302622981424437, + "grad_norm": 3.3886784078360432, + "learning_rate": 3.1791971237358893e-06, + "loss": 0.9483, + "step": 844 + }, + { + "epoch": 0.6310090544198638, + "grad_norm": 2.644489445271042, + "learning_rate": 3.1679317479538864e-06, + "loss": 0.9832, + "step": 845 + }, + { + "epoch": 0.6317558106972837, + "grad_norm": 3.2481495332858237, + "learning_rate": 3.1566771044470057e-06, + "loss": 1.0207, + "step": 846 + }, + { + "epoch": 0.6325025669747036, + "grad_norm": 3.053150214876181, + "learning_rate": 3.1454332591450697e-06, + "loss": 1.0517, + "step": 847 + }, + { + "epoch": 0.6332493232521236, + "grad_norm": 3.1143024638740635, + "learning_rate": 3.1342002779146398e-06, + "loss": 0.8988, + "step": 848 + }, + { + "epoch": 0.6339960795295435, + "grad_norm": 3.123010178193279, + "learning_rate": 3.122978226558641e-06, + "loss": 0.9868, + "step": 849 + }, + { + "epoch": 0.6347428358069636, + "grad_norm": 3.1659905184153048, + "learning_rate": 3.1117671708159665e-06, + "loss": 1.0189, + "step": 850 + }, + { + "epoch": 0.6354895920843835, + "grad_norm": 2.875486846492392, + "learning_rate": 3.1005671763611003e-06, + "loss": 0.9028, + "step": 851 + }, + { + "epoch": 0.6362363483618034, + "grad_norm": 2.9689142045380406, + "learning_rate": 3.0893783088037264e-06, + "loss": 0.8993, + "step": 852 + }, + { + "epoch": 0.6369831046392234, + "grad_norm": 3.6622092513517517, + "learning_rate": 3.078200633688352e-06, + "loss": 0.9756, + "step": 853 + }, + { + "epoch": 0.6377298609166433, + "grad_norm": 3.8793883255926582, + "learning_rate": 3.0670342164939126e-06, + "loss": 0.9594, + "step": 854 + }, + { + "epoch": 0.6377298609166433, + "eval_loss": 0.9425017237663269, + "eval_runtime": 160.7149, + "eval_samples_per_second": 112.18, + "eval_steps_per_second": 1.755, + "step": 854 + }, + { + "epoch": 0.6384766171940632, + "grad_norm": 2.9732643744244034, + "learning_rate": 3.0558791226333974e-06, + "loss": 0.9466, + "step": 855 + }, + { + "epoch": 0.6392233734714833, + "grad_norm": 2.548903115004312, + "learning_rate": 3.044735417453466e-06, + "loss": 0.9401, + "step": 856 + }, + { + "epoch": 0.6399701297489032, + "grad_norm": 2.9378623619360753, + "learning_rate": 3.033603166234059e-06, + "loss": 0.9978, + "step": 857 + }, + { + "epoch": 0.6407168860263232, + "grad_norm": 3.3994193331015143, + "learning_rate": 3.0224824341880226e-06, + "loss": 0.9302, + "step": 858 + }, + { + "epoch": 0.6414636423037431, + "grad_norm": 3.7318362244634526, + "learning_rate": 3.0113732864607236e-06, + "loss": 0.9188, + "step": 859 + }, + { + "epoch": 0.642210398581163, + "grad_norm": 3.6257765842788543, + "learning_rate": 3.000275788129662e-06, + "loss": 0.9515, + "step": 860 + }, + { + "epoch": 0.6429571548585831, + "grad_norm": 2.741192045611565, + "learning_rate": 2.9891900042041043e-06, + "loss": 0.9686, + "step": 861 + }, + { + "epoch": 0.643703911136003, + "grad_norm": 3.262238253576203, + "learning_rate": 2.978115999624686e-06, + "loss": 0.9904, + "step": 862 + }, + { + "epoch": 0.6444506674134229, + "grad_norm": 3.0017525383643124, + "learning_rate": 2.967053839263046e-06, + "loss": 0.9432, + "step": 863 + }, + { + "epoch": 0.6451974236908429, + "grad_norm": 3.6774257261294023, + "learning_rate": 2.956003587921433e-06, + "loss": 0.9398, + "step": 864 + }, + { + "epoch": 0.6459441799682628, + "grad_norm": 3.027953563829782, + "learning_rate": 2.9449653103323405e-06, + "loss": 0.9681, + "step": 865 + }, + { + "epoch": 0.6466909362456829, + "grad_norm": 3.112438742457364, + "learning_rate": 2.9339390711581105e-06, + "loss": 0.9468, + "step": 866 + }, + { + "epoch": 0.6474376925231028, + "grad_norm": 3.5341048864406948, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.9825, + "step": 867 + }, + { + "epoch": 0.6481844488005227, + "grad_norm": 3.54521516556626, + "learning_rate": 2.9119229663506417e-06, + "loss": 1.0349, + "step": 868 + }, + { + "epoch": 0.6481844488005227, + "eval_loss": 0.9415974617004395, + "eval_runtime": 162.2767, + "eval_samples_per_second": 111.1, + "eval_steps_per_second": 1.738, + "step": 868 + }, + { + "epoch": 0.6489312050779427, + "grad_norm": 3.1153407411887657, + "learning_rate": 2.900933229687978e-06, + "loss": 0.9653, + "step": 869 + }, + { + "epoch": 0.6496779613553626, + "grad_norm": 3.3514637459968695, + "learning_rate": 2.889955789380572e-06, + "loss": 0.9628, + "step": 870 + }, + { + "epoch": 0.6504247176327826, + "grad_norm": 3.085482591143154, + "learning_rate": 2.8789907097343818e-06, + "loss": 0.9068, + "step": 871 + }, + { + "epoch": 0.6511714739102026, + "grad_norm": 3.5723275185915515, + "learning_rate": 2.868038054982962e-06, + "loss": 0.9639, + "step": 872 + }, + { + "epoch": 0.6519182301876225, + "grad_norm": 2.8915352489194492, + "learning_rate": 2.8570978892870777e-06, + "loss": 1.0256, + "step": 873 + }, + { + "epoch": 0.6526649864650425, + "grad_norm": 3.091290299462198, + "learning_rate": 2.8461702767343336e-06, + "loss": 0.9228, + "step": 874 + }, + { + "epoch": 0.6534117427424624, + "grad_norm": 3.248482495382901, + "learning_rate": 2.8352552813388035e-06, + "loss": 0.9985, + "step": 875 + }, + { + "epoch": 0.6541584990198824, + "grad_norm": 3.6787026511903984, + "learning_rate": 2.824352967040642e-06, + "loss": 0.9921, + "step": 876 + }, + { + "epoch": 0.6549052552973024, + "grad_norm": 3.4197970247343727, + "learning_rate": 2.8134633977057236e-06, + "loss": 0.8962, + "step": 877 + }, + { + "epoch": 0.6556520115747223, + "grad_norm": 2.914535165179358, + "learning_rate": 2.802586637125258e-06, + "loss": 0.859, + "step": 878 + }, + { + "epoch": 0.6563987678521422, + "grad_norm": 2.9645614071642097, + "learning_rate": 2.791722749015424e-06, + "loss": 1.0076, + "step": 879 + }, + { + "epoch": 0.6571455241295622, + "grad_norm": 3.3043036104064014, + "learning_rate": 2.7808717970169928e-06, + "loss": 1.022, + "step": 880 + }, + { + "epoch": 0.6578922804069822, + "grad_norm": 3.9485650303396382, + "learning_rate": 2.770033844694954e-06, + "loss": 0.9906, + "step": 881 + }, + { + "epoch": 0.6586390366844022, + "grad_norm": 4.366695146187443, + "learning_rate": 2.7592089555381486e-06, + "loss": 1.081, + "step": 882 + }, + { + "epoch": 0.6586390366844022, + "eval_loss": 0.94098299741745, + "eval_runtime": 160.6764, + "eval_samples_per_second": 112.207, + "eval_steps_per_second": 1.755, + "step": 882 + }, + { + "epoch": 0.6593857929618221, + "grad_norm": 3.0599765753650563, + "learning_rate": 2.748397192958893e-06, + "loss": 0.9745, + "step": 883 + }, + { + "epoch": 0.660132549239242, + "grad_norm": 3.0490735460024525, + "learning_rate": 2.7375986202926003e-06, + "loss": 0.9087, + "step": 884 + }, + { + "epoch": 0.660879305516662, + "grad_norm": 2.944396457362802, + "learning_rate": 2.7268133007974284e-06, + "loss": 0.9665, + "step": 885 + }, + { + "epoch": 0.661626061794082, + "grad_norm": 4.033811803016879, + "learning_rate": 2.716041297653891e-06, + "loss": 1.0236, + "step": 886 + }, + { + "epoch": 0.662372818071502, + "grad_norm": 3.4943770273065016, + "learning_rate": 2.705282673964495e-06, + "loss": 0.9446, + "step": 887 + }, + { + "epoch": 0.6631195743489219, + "grad_norm": 4.178024868422968, + "learning_rate": 2.69453749275337e-06, + "loss": 0.9134, + "step": 888 + }, + { + "epoch": 0.6638663306263418, + "grad_norm": 2.9796556875857503, + "learning_rate": 2.6838058169659076e-06, + "loss": 0.9714, + "step": 889 + }, + { + "epoch": 0.6646130869037618, + "grad_norm": 2.5919847870009023, + "learning_rate": 2.67308770946837e-06, + "loss": 0.9558, + "step": 890 + }, + { + "epoch": 0.6653598431811817, + "grad_norm": 3.2509492718731527, + "learning_rate": 2.6623832330475454e-06, + "loss": 0.9933, + "step": 891 + }, + { + "epoch": 0.6661065994586017, + "grad_norm": 2.998873106447495, + "learning_rate": 2.651692450410372e-06, + "loss": 1.005, + "step": 892 + }, + { + "epoch": 0.6668533557360217, + "grad_norm": 3.561009888811348, + "learning_rate": 2.6410154241835663e-06, + "loss": 0.9905, + "step": 893 + }, + { + "epoch": 0.6676001120134416, + "grad_norm": 3.0540541341319694, + "learning_rate": 2.630352216913258e-06, + "loss": 0.8646, + "step": 894 + }, + { + "epoch": 0.6683468682908615, + "grad_norm": 2.823799259947434, + "learning_rate": 2.6197028910646304e-06, + "loss": 1.01, + "step": 895 + }, + { + "epoch": 0.6690936245682815, + "grad_norm": 2.711618389928562, + "learning_rate": 2.609067509021549e-06, + "loss": 0.9544, + "step": 896 + }, + { + "epoch": 0.6690936245682815, + "eval_loss": 0.9402115345001221, + "eval_runtime": 162.2582, + "eval_samples_per_second": 111.113, + "eval_steps_per_second": 1.738, + "step": 896 + }, + { + "epoch": 0.6698403808457015, + "grad_norm": 3.009980995035587, + "learning_rate": 2.5984461330861864e-06, + "loss": 1.024, + "step": 897 + }, + { + "epoch": 0.6705871371231215, + "grad_norm": 3.2368826166146385, + "learning_rate": 2.58783882547868e-06, + "loss": 0.9605, + "step": 898 + }, + { + "epoch": 0.6713338934005414, + "grad_norm": 4.00112194900498, + "learning_rate": 2.57724564833675e-06, + "loss": 0.99, + "step": 899 + }, + { + "epoch": 0.6720806496779613, + "grad_norm": 3.4500845570375462, + "learning_rate": 2.566666663715337e-06, + "loss": 1.0087, + "step": 900 + }, + { + "epoch": 0.6728274059553813, + "grad_norm": 2.8366354860399525, + "learning_rate": 2.5561019335862435e-06, + "loss": 0.8821, + "step": 901 + }, + { + "epoch": 0.6735741622328013, + "grad_norm": 7.210495835093528, + "learning_rate": 2.545551519837777e-06, + "loss": 1.0363, + "step": 902 + }, + { + "epoch": 0.6743209185102212, + "grad_norm": 3.348557095005959, + "learning_rate": 2.5350154842743643e-06, + "loss": 0.9791, + "step": 903 + }, + { + "epoch": 0.6750676747876412, + "grad_norm": 2.988330744084953, + "learning_rate": 2.524493888616214e-06, + "loss": 0.9316, + "step": 904 + }, + { + "epoch": 0.6758144310650611, + "grad_norm": 3.8502952989746797, + "learning_rate": 2.5139867944989483e-06, + "loss": 0.9556, + "step": 905 + }, + { + "epoch": 0.676561187342481, + "grad_norm": 3.0740946428548774, + "learning_rate": 2.503494263473233e-06, + "loss": 0.9857, + "step": 906 + }, + { + "epoch": 0.6773079436199011, + "grad_norm": 2.639674000747994, + "learning_rate": 2.4930163570044245e-06, + "loss": 0.897, + "step": 907 + }, + { + "epoch": 0.678054699897321, + "grad_norm": 3.76069599349086, + "learning_rate": 2.482553136472211e-06, + "loss": 0.9361, + "step": 908 + }, + { + "epoch": 0.678801456174741, + "grad_norm": 2.99525819871286, + "learning_rate": 2.4721046631702478e-06, + "loss": 0.9624, + "step": 909 + }, + { + "epoch": 0.6795482124521609, + "grad_norm": 3.3991168090917254, + "learning_rate": 2.461670998305802e-06, + "loss": 1.0163, + "step": 910 + }, + { + "epoch": 0.6795482124521609, + "eval_loss": 0.9402364492416382, + "eval_runtime": 160.891, + "eval_samples_per_second": 112.057, + "eval_steps_per_second": 1.753, + "step": 910 + }, + { + "epoch": 0.6802949687295808, + "grad_norm": 5.192382465431668, + "learning_rate": 2.451252202999389e-06, + "loss": 0.9618, + "step": 911 + }, + { + "epoch": 0.6810417250070009, + "grad_norm": 2.994654361310334, + "learning_rate": 2.440848338284427e-06, + "loss": 0.9835, + "step": 912 + }, + { + "epoch": 0.6817884812844208, + "grad_norm": 3.856560498633684, + "learning_rate": 2.4304594651068626e-06, + "loss": 1.0109, + "step": 913 + }, + { + "epoch": 0.6825352375618408, + "grad_norm": 4.578051753619612, + "learning_rate": 2.420085644324824e-06, + "loss": 1.0303, + "step": 914 + }, + { + "epoch": 0.6832819938392607, + "grad_norm": 3.4159235584433425, + "learning_rate": 2.409726936708263e-06, + "loss": 0.9201, + "step": 915 + }, + { + "epoch": 0.6840287501166806, + "grad_norm": 3.266824512755902, + "learning_rate": 2.3993834029385976e-06, + "loss": 0.9419, + "step": 916 + }, + { + "epoch": 0.6847755063941007, + "grad_norm": 3.1027816566962696, + "learning_rate": 2.3890551036083564e-06, + "loss": 0.989, + "step": 917 + }, + { + "epoch": 0.6855222626715206, + "grad_norm": 2.7420536258047963, + "learning_rate": 2.378742099220829e-06, + "loss": 0.9537, + "step": 918 + }, + { + "epoch": 0.6862690189489405, + "grad_norm": 3.4229182865230983, + "learning_rate": 2.3684444501897012e-06, + "loss": 1.0261, + "step": 919 + }, + { + "epoch": 0.6870157752263605, + "grad_norm": 2.848588746926309, + "learning_rate": 2.3581622168387107e-06, + "loss": 0.9573, + "step": 920 + }, + { + "epoch": 0.6877625315037804, + "grad_norm": 3.740798933248872, + "learning_rate": 2.3478954594012884e-06, + "loss": 0.9586, + "step": 921 + }, + { + "epoch": 0.6885092877812005, + "grad_norm": 4.476847035187571, + "learning_rate": 2.337644238020207e-06, + "loss": 0.9325, + "step": 922 + }, + { + "epoch": 0.6892560440586204, + "grad_norm": 3.797199429587135, + "learning_rate": 2.32740861274723e-06, + "loss": 0.9058, + "step": 923 + }, + { + "epoch": 0.6900028003360403, + "grad_norm": 2.787060332572636, + "learning_rate": 2.3171886435427567e-06, + "loss": 0.9898, + "step": 924 + }, + { + "epoch": 0.6900028003360403, + "eval_loss": 0.9391377568244934, + "eval_runtime": 160.7788, + "eval_samples_per_second": 112.135, + "eval_steps_per_second": 1.754, + "step": 924 + }, + { + "epoch": 0.6907495566134603, + "grad_norm": 3.3922204747022375, + "learning_rate": 2.3069843902754767e-06, + "loss": 1.0498, + "step": 925 + }, + { + "epoch": 0.6914963128908802, + "grad_norm": 3.0040350675291787, + "learning_rate": 2.296795912722014e-06, + "loss": 0.9636, + "step": 926 + }, + { + "epoch": 0.6922430691683003, + "grad_norm": 3.0670888522236286, + "learning_rate": 2.286623270566572e-06, + "loss": 0.95, + "step": 927 + }, + { + "epoch": 0.6929898254457202, + "grad_norm": 2.992366541085708, + "learning_rate": 2.2764665234006008e-06, + "loss": 0.983, + "step": 928 + }, + { + "epoch": 0.6937365817231401, + "grad_norm": 3.862457594099046, + "learning_rate": 2.2663257307224308e-06, + "loss": 0.9717, + "step": 929 + }, + { + "epoch": 0.6944833380005601, + "grad_norm": 2.865765904294805, + "learning_rate": 2.2562009519369314e-06, + "loss": 0.9047, + "step": 930 + }, + { + "epoch": 0.69523009427798, + "grad_norm": 2.6036045157584264, + "learning_rate": 2.246092246355163e-06, + "loss": 0.8313, + "step": 931 + }, + { + "epoch": 0.6959768505553999, + "grad_norm": 3.031699920926545, + "learning_rate": 2.2359996731940348e-06, + "loss": 0.9678, + "step": 932 + }, + { + "epoch": 0.69672360683282, + "grad_norm": 2.740843180643503, + "learning_rate": 2.22592329157594e-06, + "loss": 0.9964, + "step": 933 + }, + { + "epoch": 0.6974703631102399, + "grad_norm": 2.861698814059541, + "learning_rate": 2.215863160528429e-06, + "loss": 0.9205, + "step": 934 + }, + { + "epoch": 0.6982171193876598, + "grad_norm": 2.757313651567981, + "learning_rate": 2.205819338983859e-06, + "loss": 0.9189, + "step": 935 + }, + { + "epoch": 0.6989638756650798, + "grad_norm": 3.2909156447917725, + "learning_rate": 2.195791885779041e-06, + "loss": 0.9368, + "step": 936 + }, + { + "epoch": 0.6997106319424997, + "grad_norm": 6.203638391772724, + "learning_rate": 2.1857808596548992e-06, + "loss": 0.9975, + "step": 937 + }, + { + "epoch": 0.7004573882199198, + "grad_norm": 3.0377699161386076, + "learning_rate": 2.1757863192561356e-06, + "loss": 0.9513, + "step": 938 + }, + { + "epoch": 0.7004573882199198, + "eval_loss": 0.9378783106803894, + "eval_runtime": 160.7274, + "eval_samples_per_second": 112.171, + "eval_steps_per_second": 1.755, + "step": 938 + }, + { + "epoch": 0.7012041444973397, + "grad_norm": 3.8530318792360383, + "learning_rate": 2.165808323130868e-06, + "loss": 0.9743, + "step": 939 + }, + { + "epoch": 0.7019509007747596, + "grad_norm": 3.764384225357672, + "learning_rate": 2.1558469297303025e-06, + "loss": 0.8979, + "step": 940 + }, + { + "epoch": 0.7026976570521796, + "grad_norm": 3.607231090891381, + "learning_rate": 2.1459021974083905e-06, + "loss": 0.8938, + "step": 941 + }, + { + "epoch": 0.7034444133295995, + "grad_norm": 2.850451936817253, + "learning_rate": 2.135974184421477e-06, + "loss": 0.9309, + "step": 942 + }, + { + "epoch": 0.7041911696070196, + "grad_norm": 3.4414586735950548, + "learning_rate": 2.1260629489279662e-06, + "loss": 0.9612, + "step": 943 + }, + { + "epoch": 0.7049379258844395, + "grad_norm": 3.306379000135249, + "learning_rate": 2.1161685489879784e-06, + "loss": 0.9423, + "step": 944 + }, + { + "epoch": 0.7056846821618594, + "grad_norm": 3.3569042890381877, + "learning_rate": 2.106291042563019e-06, + "loss": 1.0236, + "step": 945 + }, + { + "epoch": 0.7064314384392794, + "grad_norm": 3.3135012728201017, + "learning_rate": 2.096430487515618e-06, + "loss": 0.9959, + "step": 946 + }, + { + "epoch": 0.7071781947166993, + "grad_norm": 3.038549894496397, + "learning_rate": 2.086586941609011e-06, + "loss": 1.0162, + "step": 947 + }, + { + "epoch": 0.7079249509941193, + "grad_norm": 3.08173088117554, + "learning_rate": 2.076760462506798e-06, + "loss": 0.9131, + "step": 948 + }, + { + "epoch": 0.7086717072715393, + "grad_norm": 3.028394966652236, + "learning_rate": 2.0669511077725945e-06, + "loss": 0.9259, + "step": 949 + }, + { + "epoch": 0.7094184635489592, + "grad_norm": 3.3989843796174934, + "learning_rate": 2.0571589348697045e-06, + "loss": 0.9344, + "step": 950 + }, + { + "epoch": 0.7101652198263791, + "grad_norm": 2.975011090701034, + "learning_rate": 2.04738400116078e-06, + "loss": 0.9218, + "step": 951 + }, + { + "epoch": 0.7109119761037991, + "grad_norm": 3.265896334464912, + "learning_rate": 2.037626363907485e-06, + "loss": 0.9765, + "step": 952 + }, + { + "epoch": 0.7109119761037991, + "eval_loss": 0.9376602172851562, + "eval_runtime": 160.7633, + "eval_samples_per_second": 112.146, + "eval_steps_per_second": 1.754, + "step": 952 + }, + { + "epoch": 0.7116587323812191, + "grad_norm": 3.6778027198208982, + "learning_rate": 2.0278860802701616e-06, + "loss": 0.9278, + "step": 953 + }, + { + "epoch": 0.7124054886586391, + "grad_norm": 2.6384835692393542, + "learning_rate": 2.0181632073074925e-06, + "loss": 0.9659, + "step": 954 + }, + { + "epoch": 0.713152244936059, + "grad_norm": 3.786013581216713, + "learning_rate": 2.0084578019761738e-06, + "loss": 0.9555, + "step": 955 + }, + { + "epoch": 0.7138990012134789, + "grad_norm": 3.4033801567287436, + "learning_rate": 1.9987699211305696e-06, + "loss": 0.8764, + "step": 956 + }, + { + "epoch": 0.7146457574908989, + "grad_norm": 3.026170739363414, + "learning_rate": 1.9890996215223885e-06, + "loss": 0.9646, + "step": 957 + }, + { + "epoch": 0.7153925137683189, + "grad_norm": 2.8698966567286646, + "learning_rate": 1.979446959800347e-06, + "loss": 0.958, + "step": 958 + }, + { + "epoch": 0.7161392700457389, + "grad_norm": 3.094820756318611, + "learning_rate": 1.9698119925098398e-06, + "loss": 1.0365, + "step": 959 + }, + { + "epoch": 0.7168860263231588, + "grad_norm": 3.283111017009602, + "learning_rate": 1.9601947760926044e-06, + "loss": 0.9294, + "step": 960 + }, + { + "epoch": 0.7176327826005787, + "grad_norm": 3.3801881029401977, + "learning_rate": 1.9505953668863996e-06, + "loss": 0.9826, + "step": 961 + }, + { + "epoch": 0.7183795388779987, + "grad_norm": 2.6507459684657437, + "learning_rate": 1.9410138211246644e-06, + "loss": 0.9045, + "step": 962 + }, + { + "epoch": 0.7191262951554187, + "grad_norm": 3.20895770074743, + "learning_rate": 1.9314501949361946e-06, + "loss": 0.9533, + "step": 963 + }, + { + "epoch": 0.7198730514328386, + "grad_norm": 2.9463529533656967, + "learning_rate": 1.9219045443448133e-06, + "loss": 0.9577, + "step": 964 + }, + { + "epoch": 0.7206198077102586, + "grad_norm": 3.0591917801650887, + "learning_rate": 1.912376925269041e-06, + "loss": 0.9461, + "step": 965 + }, + { + "epoch": 0.7213665639876785, + "grad_norm": 3.333232970293983, + "learning_rate": 1.9028673935217723e-06, + "loss": 0.8945, + "step": 966 + }, + { + "epoch": 0.7213665639876785, + "eval_loss": 0.9377442002296448, + "eval_runtime": 162.3492, + "eval_samples_per_second": 111.051, + "eval_steps_per_second": 1.737, + "step": 966 + }, + { + "epoch": 0.7221133202650984, + "grad_norm": 2.964178856719797, + "learning_rate": 1.893376004809942e-06, + "loss": 0.9926, + "step": 967 + }, + { + "epoch": 0.7228600765425184, + "grad_norm": 2.9396645690446093, + "learning_rate": 1.8839028147342087e-06, + "loss": 0.996, + "step": 968 + }, + { + "epoch": 0.7236068328199384, + "grad_norm": 2.9374240680791277, + "learning_rate": 1.8744478787886188e-06, + "loss": 0.9515, + "step": 969 + }, + { + "epoch": 0.7243535890973584, + "grad_norm": 3.1383072383295123, + "learning_rate": 1.8650112523602832e-06, + "loss": 0.9053, + "step": 970 + }, + { + "epoch": 0.7251003453747783, + "grad_norm": 2.9457028560535043, + "learning_rate": 1.8555929907290627e-06, + "loss": 0.8926, + "step": 971 + }, + { + "epoch": 0.7258471016521982, + "grad_norm": 3.6033895225192185, + "learning_rate": 1.846193149067232e-06, + "loss": 1.0745, + "step": 972 + }, + { + "epoch": 0.7265938579296182, + "grad_norm": 2.905441286184863, + "learning_rate": 1.8368117824391623e-06, + "loss": 0.961, + "step": 973 + }, + { + "epoch": 0.7273406142070382, + "grad_norm": 3.823010945665864, + "learning_rate": 1.827448945800997e-06, + "loss": 0.9202, + "step": 974 + }, + { + "epoch": 0.7280873704844582, + "grad_norm": 2.9910986917683755, + "learning_rate": 1.8181046940003366e-06, + "loss": 0.9435, + "step": 975 + }, + { + "epoch": 0.7288341267618781, + "grad_norm": 3.862933976303629, + "learning_rate": 1.808779081775901e-06, + "loss": 1.0576, + "step": 976 + }, + { + "epoch": 0.729580883039298, + "grad_norm": 4.606305556163073, + "learning_rate": 1.799472163757226e-06, + "loss": 1.0184, + "step": 977 + }, + { + "epoch": 0.730327639316718, + "grad_norm": 5.027834361171428, + "learning_rate": 1.7901839944643373e-06, + "loss": 1.0321, + "step": 978 + }, + { + "epoch": 0.731074395594138, + "grad_norm": 2.9581566301260147, + "learning_rate": 1.780914628307428e-06, + "loss": 0.9176, + "step": 979 + }, + { + "epoch": 0.7318211518715579, + "grad_norm": 4.036645532947177, + "learning_rate": 1.7716641195865408e-06, + "loss": 1.0221, + "step": 980 + }, + { + "epoch": 0.7318211518715579, + "eval_loss": 0.9362857937812805, + "eval_runtime": 160.7578, + "eval_samples_per_second": 112.15, + "eval_steps_per_second": 1.754, + "step": 980 + }, + { + "epoch": 0.7325679081489779, + "grad_norm": 3.986822927862001, + "learning_rate": 1.762432522491258e-06, + "loss": 1.043, + "step": 981 + }, + { + "epoch": 0.7333146644263978, + "grad_norm": 3.585322403590216, + "learning_rate": 1.7532198911003677e-06, + "loss": 0.9715, + "step": 982 + }, + { + "epoch": 0.7340614207038177, + "grad_norm": 3.452113127334812, + "learning_rate": 1.7440262793815615e-06, + "loss": 0.966, + "step": 983 + }, + { + "epoch": 0.7348081769812378, + "grad_norm": 3.0003018548266818, + "learning_rate": 1.7348517411911176e-06, + "loss": 0.8986, + "step": 984 + }, + { + "epoch": 0.7355549332586577, + "grad_norm": 3.7074201960046462, + "learning_rate": 1.7256963302735752e-06, + "loss": 0.9799, + "step": 985 + }, + { + "epoch": 0.7363016895360777, + "grad_norm": 3.0268210048371826, + "learning_rate": 1.716560100261428e-06, + "loss": 1.004, + "step": 986 + }, + { + "epoch": 0.7370484458134976, + "grad_norm": 4.555026395818409, + "learning_rate": 1.7074431046748075e-06, + "loss": 0.9813, + "step": 987 + }, + { + "epoch": 0.7377952020909175, + "grad_norm": 2.854485460589325, + "learning_rate": 1.6983453969211706e-06, + "loss": 0.9223, + "step": 988 + }, + { + "epoch": 0.7385419583683376, + "grad_norm": 3.780899514822457, + "learning_rate": 1.6892670302949842e-06, + "loss": 1.0049, + "step": 989 + }, + { + "epoch": 0.7392887146457575, + "grad_norm": 2.708058621198111, + "learning_rate": 1.680208057977415e-06, + "loss": 0.9578, + "step": 990 + }, + { + "epoch": 0.7400354709231775, + "grad_norm": 5.261839966075888, + "learning_rate": 1.6711685330360212e-06, + "loss": 0.9491, + "step": 991 + }, + { + "epoch": 0.7407822272005974, + "grad_norm": 3.554553077382045, + "learning_rate": 1.6621485084244327e-06, + "loss": 0.9547, + "step": 992 + }, + { + "epoch": 0.7415289834780173, + "grad_norm": 3.4576454634816116, + "learning_rate": 1.6531480369820497e-06, + "loss": 0.9735, + "step": 993 + }, + { + "epoch": 0.7422757397554374, + "grad_norm": 2.7953413513430068, + "learning_rate": 1.6441671714337288e-06, + "loss": 0.9573, + "step": 994 + }, + { + "epoch": 0.7422757397554374, + "eval_loss": 0.9361117482185364, + "eval_runtime": 162.2503, + "eval_samples_per_second": 111.118, + "eval_steps_per_second": 1.738, + "step": 994 + }, + { + "epoch": 0.7430224960328573, + "grad_norm": 3.5618656933093877, + "learning_rate": 1.635205964389474e-06, + "loss": 0.9789, + "step": 995 + }, + { + "epoch": 0.7437692523102772, + "grad_norm": 3.508297212134497, + "learning_rate": 1.626264468344132e-06, + "loss": 0.9089, + "step": 996 + }, + { + "epoch": 0.7445160085876972, + "grad_norm": 2.8461158774574113, + "learning_rate": 1.6173427356770778e-06, + "loss": 0.9407, + "step": 997 + }, + { + "epoch": 0.7452627648651171, + "grad_norm": 2.8680121302683914, + "learning_rate": 1.6084408186519195e-06, + "loss": 0.9893, + "step": 998 + }, + { + "epoch": 0.7460095211425372, + "grad_norm": 2.6994982967303023, + "learning_rate": 1.599558769416179e-06, + "loss": 0.9235, + "step": 999 + }, + { + "epoch": 0.7467562774199571, + "grad_norm": 2.946565912220613, + "learning_rate": 1.5906966400009944e-06, + "loss": 0.9575, + "step": 1000 + }, + { + "epoch": 0.747503033697377, + "grad_norm": 2.924215952122836, + "learning_rate": 1.5818544823208126e-06, + "loss": 0.8939, + "step": 1001 + }, + { + "epoch": 0.748249789974797, + "grad_norm": 3.0590249491881147, + "learning_rate": 1.573032348173087e-06, + "loss": 1.0089, + "step": 1002 + }, + { + "epoch": 0.7489965462522169, + "grad_norm": 3.196914793142116, + "learning_rate": 1.5642302892379708e-06, + "loss": 0.9879, + "step": 1003 + }, + { + "epoch": 0.7497433025296368, + "grad_norm": 3.794505980226742, + "learning_rate": 1.555448357078021e-06, + "loss": 1.0676, + "step": 1004 + }, + { + "epoch": 0.7504900588070569, + "grad_norm": 3.347035703229042, + "learning_rate": 1.5466866031378874e-06, + "loss": 0.9242, + "step": 1005 + }, + { + "epoch": 0.7512368150844768, + "grad_norm": 5.075461756876524, + "learning_rate": 1.5379450787440163e-06, + "loss": 1.0153, + "step": 1006 + }, + { + "epoch": 0.7519835713618968, + "grad_norm": 3.532477622458222, + "learning_rate": 1.5292238351043503e-06, + "loss": 0.9921, + "step": 1007 + }, + { + "epoch": 0.7527303276393167, + "grad_norm": 2.756518632689471, + "learning_rate": 1.5205229233080266e-06, + "loss": 0.9969, + "step": 1008 + }, + { + "epoch": 0.7527303276393167, + "eval_loss": 0.935323178768158, + "eval_runtime": 160.7419, + "eval_samples_per_second": 112.161, + "eval_steps_per_second": 1.754, + "step": 1008 + }, + { + "epoch": 0.7534770839167366, + "grad_norm": 3.276940054933946, + "learning_rate": 1.511842394325077e-06, + "loss": 0.9367, + "step": 1009 + }, + { + "epoch": 0.7542238401941567, + "grad_norm": 4.197551947365772, + "learning_rate": 1.5031822990061318e-06, + "loss": 1.0045, + "step": 1010 + }, + { + "epoch": 0.7549705964715766, + "grad_norm": 2.742225216417972, + "learning_rate": 1.494542688082123e-06, + "loss": 0.9001, + "step": 1011 + }, + { + "epoch": 0.7557173527489965, + "grad_norm": 3.1759063839874093, + "learning_rate": 1.485923612163982e-06, + "loss": 0.9752, + "step": 1012 + }, + { + "epoch": 0.7564641090264165, + "grad_norm": 3.234441153665573, + "learning_rate": 1.4773251217423424e-06, + "loss": 0.9566, + "step": 1013 + }, + { + "epoch": 0.7572108653038364, + "grad_norm": 3.328867377350979, + "learning_rate": 1.468747267187256e-06, + "loss": 0.8425, + "step": 1014 + }, + { + "epoch": 0.7579576215812565, + "grad_norm": 3.4507904655828963, + "learning_rate": 1.4601900987478834e-06, + "loss": 1.0111, + "step": 1015 + }, + { + "epoch": 0.7587043778586764, + "grad_norm": 3.0879533408630526, + "learning_rate": 1.451653666552208e-06, + "loss": 0.884, + "step": 1016 + }, + { + "epoch": 0.7594511341360963, + "grad_norm": 2.9599334649521345, + "learning_rate": 1.4431380206067374e-06, + "loss": 0.9592, + "step": 1017 + }, + { + "epoch": 0.7601978904135163, + "grad_norm": 3.355339344519482, + "learning_rate": 1.4346432107962205e-06, + "loss": 0.9678, + "step": 1018 + }, + { + "epoch": 0.7609446466909362, + "grad_norm": 4.7621784133806795, + "learning_rate": 1.4261692868833376e-06, + "loss": 0.9767, + "step": 1019 + }, + { + "epoch": 0.7616914029683562, + "grad_norm": 3.0462672117608225, + "learning_rate": 1.4177162985084242e-06, + "loss": 0.9758, + "step": 1020 + }, + { + "epoch": 0.7624381592457762, + "grad_norm": 3.3235265510570327, + "learning_rate": 1.4092842951891788e-06, + "loss": 0.9605, + "step": 1021 + }, + { + "epoch": 0.7631849155231961, + "grad_norm": 2.9873139379742044, + "learning_rate": 1.400873326320364e-06, + "loss": 0.947, + "step": 1022 + }, + { + "epoch": 0.7631849155231961, + "eval_loss": 0.934901237487793, + "eval_runtime": 160.8713, + "eval_samples_per_second": 112.071, + "eval_steps_per_second": 1.753, + "step": 1022 + }, + { + "epoch": 0.763931671800616, + "grad_norm": 2.925828560741243, + "learning_rate": 1.3924834411735238e-06, + "loss": 0.9255, + "step": 1023 + }, + { + "epoch": 0.764678428078036, + "grad_norm": 2.929366653908739, + "learning_rate": 1.3841146888966944e-06, + "loss": 0.9649, + "step": 1024 + }, + { + "epoch": 0.765425184355456, + "grad_norm": 3.5848412522150093, + "learning_rate": 1.3757671185141136e-06, + "loss": 0.9942, + "step": 1025 + }, + { + "epoch": 0.766171940632876, + "grad_norm": 4.313625493258559, + "learning_rate": 1.367440778925938e-06, + "loss": 0.9989, + "step": 1026 + }, + { + "epoch": 0.7669186969102959, + "grad_norm": 3.1883660056342444, + "learning_rate": 1.35913571890795e-06, + "loss": 0.934, + "step": 1027 + }, + { + "epoch": 0.7676654531877158, + "grad_norm": 3.0451342971999034, + "learning_rate": 1.350851987111283e-06, + "loss": 0.9309, + "step": 1028 + }, + { + "epoch": 0.7684122094651358, + "grad_norm": 3.0995695469892826, + "learning_rate": 1.3425896320621224e-06, + "loss": 0.9187, + "step": 1029 + }, + { + "epoch": 0.7691589657425558, + "grad_norm": 2.9172531099918624, + "learning_rate": 1.3343487021614315e-06, + "loss": 0.9461, + "step": 1030 + }, + { + "epoch": 0.7699057220199758, + "grad_norm": 2.7992115604306993, + "learning_rate": 1.3261292456846648e-06, + "loss": 0.9552, + "step": 1031 + }, + { + "epoch": 0.7706524782973957, + "grad_norm": 2.895791747367878, + "learning_rate": 1.3179313107814844e-06, + "loss": 0.9338, + "step": 1032 + }, + { + "epoch": 0.7713992345748156, + "grad_norm": 3.185964164923729, + "learning_rate": 1.3097549454754782e-06, + "loss": 0.9724, + "step": 1033 + }, + { + "epoch": 0.7721459908522356, + "grad_norm": 2.5940064617026564, + "learning_rate": 1.3016001976638836e-06, + "loss": 0.8629, + "step": 1034 + }, + { + "epoch": 0.7728927471296556, + "grad_norm": 10.058975733269536, + "learning_rate": 1.2934671151172974e-06, + "loss": 0.9243, + "step": 1035 + }, + { + "epoch": 0.7736395034070755, + "grad_norm": 3.365992421531039, + "learning_rate": 1.2853557454794025e-06, + "loss": 1.0129, + "step": 1036 + }, + { + "epoch": 0.7736395034070755, + "eval_loss": 0.934323787689209, + "eval_runtime": 162.2911, + "eval_samples_per_second": 111.091, + "eval_steps_per_second": 1.738, + "step": 1036 + }, + { + "epoch": 0.7743862596844955, + "grad_norm": 3.4360489381196446, + "learning_rate": 1.2772661362666877e-06, + "loss": 1.0044, + "step": 1037 + }, + { + "epoch": 0.7751330159619154, + "grad_norm": 4.245525265730096, + "learning_rate": 1.2691983348681698e-06, + "loss": 0.9676, + "step": 1038 + }, + { + "epoch": 0.7758797722393354, + "grad_norm": 3.7645844451140666, + "learning_rate": 1.2611523885451137e-06, + "loss": 0.9585, + "step": 1039 + }, + { + "epoch": 0.7766265285167554, + "grad_norm": 2.9642595691153444, + "learning_rate": 1.2531283444307567e-06, + "loss": 1.0301, + "step": 1040 + }, + { + "epoch": 0.7773732847941753, + "grad_norm": 2.6311114422503605, + "learning_rate": 1.2451262495300366e-06, + "loss": 1.0166, + "step": 1041 + }, + { + "epoch": 0.7781200410715953, + "grad_norm": 3.151248153435886, + "learning_rate": 1.2371461507193077e-06, + "loss": 0.9414, + "step": 1042 + }, + { + "epoch": 0.7788667973490152, + "grad_norm": 3.2520539000528057, + "learning_rate": 1.2291880947460732e-06, + "loss": 0.9074, + "step": 1043 + }, + { + "epoch": 0.7796135536264351, + "grad_norm": 3.2890092101452435, + "learning_rate": 1.2212521282287093e-06, + "loss": 0.8918, + "step": 1044 + }, + { + "epoch": 0.7803603099038551, + "grad_norm": 3.2370242322883875, + "learning_rate": 1.213338297656191e-06, + "loss": 0.9752, + "step": 1045 + }, + { + "epoch": 0.7811070661812751, + "grad_norm": 2.7758770607541368, + "learning_rate": 1.2054466493878209e-06, + "loss": 0.9171, + "step": 1046 + }, + { + "epoch": 0.7818538224586951, + "grad_norm": 3.605541137271554, + "learning_rate": 1.1975772296529564e-06, + "loss": 0.9332, + "step": 1047 + }, + { + "epoch": 0.782600578736115, + "grad_norm": 3.041323832919354, + "learning_rate": 1.1897300845507447e-06, + "loss": 0.9344, + "step": 1048 + }, + { + "epoch": 0.7833473350135349, + "grad_norm": 2.8405142198832998, + "learning_rate": 1.1819052600498444e-06, + "loss": 0.9004, + "step": 1049 + }, + { + "epoch": 0.7840940912909549, + "grad_norm": 3.021597526668893, + "learning_rate": 1.1741028019881546e-06, + "loss": 0.9212, + "step": 1050 + }, + { + "epoch": 0.7840940912909549, + "eval_loss": 0.9340171217918396, + "eval_runtime": 162.2347, + "eval_samples_per_second": 111.129, + "eval_steps_per_second": 1.738, + "step": 1050 + }, + { + "epoch": 0.7848408475683749, + "grad_norm": 2.950883129014722, + "learning_rate": 1.166322756072562e-06, + "loss": 0.9675, + "step": 1051 + }, + { + "epoch": 0.7855876038457948, + "grad_norm": 3.0912726001846256, + "learning_rate": 1.1585651678786558e-06, + "loss": 0.8971, + "step": 1052 + }, + { + "epoch": 0.7863343601232148, + "grad_norm": 4.15327113314584, + "learning_rate": 1.1508300828504682e-06, + "loss": 0.9149, + "step": 1053 + }, + { + "epoch": 0.7870811164006347, + "grad_norm": 3.2238088469540704, + "learning_rate": 1.1431175463002114e-06, + "loss": 0.9713, + "step": 1054 + }, + { + "epoch": 0.7878278726780547, + "grad_norm": 3.0917040248252428, + "learning_rate": 1.1354276034080059e-06, + "loss": 0.8971, + "step": 1055 + }, + { + "epoch": 0.7885746289554747, + "grad_norm": 4.274054201982966, + "learning_rate": 1.1277602992216142e-06, + "loss": 0.9776, + "step": 1056 + }, + { + "epoch": 0.7893213852328946, + "grad_norm": 3.4775465819659312, + "learning_rate": 1.1201156786561884e-06, + "loss": 1.0144, + "step": 1057 + }, + { + "epoch": 0.7900681415103146, + "grad_norm": 3.353681585305471, + "learning_rate": 1.1124937864939956e-06, + "loss": 0.9548, + "step": 1058 + }, + { + "epoch": 0.7908148977877345, + "grad_norm": 3.129257365590115, + "learning_rate": 1.1048946673841598e-06, + "loss": 1.0154, + "step": 1059 + }, + { + "epoch": 0.7915616540651544, + "grad_norm": 3.590240334184257, + "learning_rate": 1.0973183658424008e-06, + "loss": 0.9403, + "step": 1060 + }, + { + "epoch": 0.7923084103425745, + "grad_norm": 2.8470937895204407, + "learning_rate": 1.0897649262507753e-06, + "loss": 0.9681, + "step": 1061 + }, + { + "epoch": 0.7930551666199944, + "grad_norm": 3.532189828005365, + "learning_rate": 1.0822343928574087e-06, + "loss": 0.9324, + "step": 1062 + }, + { + "epoch": 0.7938019228974144, + "grad_norm": 3.1397557159820395, + "learning_rate": 1.0747268097762454e-06, + "loss": 0.9094, + "step": 1063 + }, + { + "epoch": 0.7945486791748343, + "grad_norm": 6.086890250561432, + "learning_rate": 1.0672422209867879e-06, + "loss": 1.0163, + "step": 1064 + }, + { + "epoch": 0.7945486791748343, + "eval_loss": 0.933886706829071, + "eval_runtime": 162.281, + "eval_samples_per_second": 111.097, + "eval_steps_per_second": 1.738, + "step": 1064 + }, + { + "epoch": 0.7952954354522542, + "grad_norm": 3.169410119671673, + "learning_rate": 1.0597806703338354e-06, + "loss": 0.9663, + "step": 1065 + }, + { + "epoch": 0.7960421917296743, + "grad_norm": 3.3940195467110765, + "learning_rate": 1.0523422015272299e-06, + "loss": 0.9519, + "step": 1066 + }, + { + "epoch": 0.7967889480070942, + "grad_norm": 4.508947232616116, + "learning_rate": 1.0449268581416012e-06, + "loss": 0.904, + "step": 1067 + }, + { + "epoch": 0.7975357042845141, + "grad_norm": 2.632774375597297, + "learning_rate": 1.0375346836161071e-06, + "loss": 0.9245, + "step": 1068 + }, + { + "epoch": 0.7982824605619341, + "grad_norm": 2.9785159713949714, + "learning_rate": 1.0301657212541854e-06, + "loss": 0.9514, + "step": 1069 + }, + { + "epoch": 0.799029216839354, + "grad_norm": 3.155524617990433, + "learning_rate": 1.022820014223293e-06, + "loss": 0.8659, + "step": 1070 + }, + { + "epoch": 0.7997759731167741, + "grad_norm": 2.805966276939344, + "learning_rate": 1.0154976055546627e-06, + "loss": 1.0431, + "step": 1071 + }, + { + "epoch": 0.800522729394194, + "grad_norm": 3.8141120841423706, + "learning_rate": 1.0081985381430392e-06, + "loss": 0.9828, + "step": 1072 + }, + { + "epoch": 0.8012694856716139, + "grad_norm": 3.2436963757912824, + "learning_rate": 1.0009228547464373e-06, + "loss": 0.9252, + "step": 1073 + }, + { + "epoch": 0.8020162419490339, + "grad_norm": 3.034785333690608, + "learning_rate": 9.936705979858863e-07, + "loss": 0.9408, + "step": 1074 + }, + { + "epoch": 0.8027629982264538, + "grad_norm": 3.247230477369662, + "learning_rate": 9.86441810345183e-07, + "loss": 0.9407, + "step": 1075 + }, + { + "epoch": 0.8035097545038739, + "grad_norm": 3.1230396825638223, + "learning_rate": 9.792365341706395e-07, + "loss": 1.0738, + "step": 1076 + }, + { + "epoch": 0.8042565107812938, + "grad_norm": 3.634705534638435, + "learning_rate": 9.720548116708434e-07, + "loss": 1.0531, + "step": 1077 + }, + { + "epoch": 0.8050032670587137, + "grad_norm": 3.2524650618531403, + "learning_rate": 9.648966849163987e-07, + "loss": 0.9508, + "step": 1078 + }, + { + "epoch": 0.8050032670587137, + "eval_loss": 0.9330756068229675, + "eval_runtime": 162.464, + "eval_samples_per_second": 110.972, + "eval_steps_per_second": 1.736, + "step": 1078 + }, + { + "epoch": 0.8057500233361337, + "grad_norm": 3.195766017025719, + "learning_rate": 9.577621958396876e-07, + "loss": 0.9247, + "step": 1079 + }, + { + "epoch": 0.8064967796135536, + "grad_norm": 3.231704715181529, + "learning_rate": 9.506513862346223e-07, + "loss": 0.9718, + "step": 1080 + }, + { + "epoch": 0.8072435358909735, + "grad_norm": 3.08352329883453, + "learning_rate": 9.435642977564002e-07, + "loss": 0.953, + "step": 1081 + }, + { + "epoch": 0.8079902921683936, + "grad_norm": 2.7239297991167146, + "learning_rate": 9.365009719212609e-07, + "loss": 1.019, + "step": 1082 + }, + { + "epoch": 0.8087370484458135, + "grad_norm": 3.4569217007757103, + "learning_rate": 9.294614501062393e-07, + "loss": 1.0053, + "step": 1083 + }, + { + "epoch": 0.8094838047232334, + "grad_norm": 2.8952768629298182, + "learning_rate": 9.224457735489312e-07, + "loss": 0.879, + "step": 1084 + }, + { + "epoch": 0.8102305610006534, + "grad_norm": 2.635860772081649, + "learning_rate": 9.154539833472442e-07, + "loss": 0.9415, + "step": 1085 + }, + { + "epoch": 0.8109773172780733, + "grad_norm": 2.981500012106527, + "learning_rate": 9.08486120459155e-07, + "loss": 0.8901, + "step": 1086 + }, + { + "epoch": 0.8117240735554934, + "grad_norm": 3.2151674731774955, + "learning_rate": 9.015422257024814e-07, + "loss": 1.0142, + "step": 1087 + }, + { + "epoch": 0.8124708298329133, + "grad_norm": 4.460127069540021, + "learning_rate": 8.946223397546311e-07, + "loss": 0.9761, + "step": 1088 + }, + { + "epoch": 0.8132175861103332, + "grad_norm": 2.64438098456573, + "learning_rate": 8.877265031523685e-07, + "loss": 0.9587, + "step": 1089 + }, + { + "epoch": 0.8139643423877532, + "grad_norm": 3.075964264839053, + "learning_rate": 8.80854756291576e-07, + "loss": 1.0468, + "step": 1090 + }, + { + "epoch": 0.8147110986651731, + "grad_norm": 3.701739504153379, + "learning_rate": 8.740071394270217e-07, + "loss": 0.905, + "step": 1091 + }, + { + "epoch": 0.8154578549425932, + "grad_norm": 2.9113831337724507, + "learning_rate": 8.671836926721172e-07, + "loss": 0.9129, + "step": 1092 + }, + { + "epoch": 0.8154578549425932, + "eval_loss": 0.9328471422195435, + "eval_runtime": 162.392, + "eval_samples_per_second": 111.021, + "eval_steps_per_second": 1.737, + "step": 1092 + }, + { + "epoch": 0.8162046112200131, + "grad_norm": 4.310707752332849, + "learning_rate": 8.603844559986823e-07, + "loss": 1.0048, + "step": 1093 + }, + { + "epoch": 0.816951367497433, + "grad_norm": 3.8711564841693384, + "learning_rate": 8.536094692367197e-07, + "loss": 0.9851, + "step": 1094 + }, + { + "epoch": 0.817698123774853, + "grad_norm": 8.753075618875652, + "learning_rate": 8.468587720741728e-07, + "loss": 0.9883, + "step": 1095 + }, + { + "epoch": 0.8184448800522729, + "grad_norm": 2.8672638891822864, + "learning_rate": 8.401324040566955e-07, + "loss": 0.9291, + "step": 1096 + }, + { + "epoch": 0.8191916363296929, + "grad_norm": 4.776201464319421, + "learning_rate": 8.334304045874248e-07, + "loss": 0.9755, + "step": 1097 + }, + { + "epoch": 0.8199383926071129, + "grad_norm": 3.16525486972403, + "learning_rate": 8.267528129267438e-07, + "loss": 0.9925, + "step": 1098 + }, + { + "epoch": 0.8206851488845328, + "grad_norm": 4.511439354536366, + "learning_rate": 8.20099668192052e-07, + "loss": 0.9539, + "step": 1099 + }, + { + "epoch": 0.8214319051619527, + "grad_norm": 3.4286525245441797, + "learning_rate": 8.134710093575444e-07, + "loss": 0.9686, + "step": 1100 + }, + { + "epoch": 0.8221786614393727, + "grad_norm": 3.0049333808435343, + "learning_rate": 8.068668752539726e-07, + "loss": 0.9661, + "step": 1101 + }, + { + "epoch": 0.8229254177167927, + "grad_norm": 4.385250881338943, + "learning_rate": 8.002873045684245e-07, + "loss": 0.9325, + "step": 1102 + }, + { + "epoch": 0.8236721739942127, + "grad_norm": 3.027199330607413, + "learning_rate": 7.937323358440935e-07, + "loss": 0.9295, + "step": 1103 + }, + { + "epoch": 0.8244189302716326, + "grad_norm": 3.4801606475057505, + "learning_rate": 7.872020074800585e-07, + "loss": 1.0889, + "step": 1104 + }, + { + "epoch": 0.8251656865490525, + "grad_norm": 2.881651694848292, + "learning_rate": 7.80696357731049e-07, + "loss": 0.9259, + "step": 1105 + }, + { + "epoch": 0.8259124428264725, + "grad_norm": 3.232131277331923, + "learning_rate": 7.742154247072287e-07, + "loss": 0.9543, + "step": 1106 + }, + { + "epoch": 0.8259124428264725, + "eval_loss": 0.9323558211326599, + "eval_runtime": 160.8356, + "eval_samples_per_second": 112.096, + "eval_steps_per_second": 1.753, + "step": 1106 + }, + { + "epoch": 0.8266591991038925, + "grad_norm": 3.839039683871901, + "learning_rate": 7.677592463739741e-07, + "loss": 1.0328, + "step": 1107 + }, + { + "epoch": 0.8274059553813125, + "grad_norm": 3.451018921091918, + "learning_rate": 7.613278605516455e-07, + "loss": 0.9324, + "step": 1108 + }, + { + "epoch": 0.8281527116587324, + "grad_norm": 2.741822965758025, + "learning_rate": 7.549213049153687e-07, + "loss": 0.9556, + "step": 1109 + }, + { + "epoch": 0.8288994679361523, + "grad_norm": 2.809367958739757, + "learning_rate": 7.485396169948133e-07, + "loss": 1.0265, + "step": 1110 + }, + { + "epoch": 0.8296462242135723, + "grad_norm": 2.5599878133704914, + "learning_rate": 7.421828341739751e-07, + "loss": 0.888, + "step": 1111 + }, + { + "epoch": 0.8303929804909923, + "grad_norm": 3.482845460987069, + "learning_rate": 7.358509936909541e-07, + "loss": 1.025, + "step": 1112 + }, + { + "epoch": 0.8311397367684122, + "grad_norm": 3.736730021811191, + "learning_rate": 7.295441326377384e-07, + "loss": 0.9231, + "step": 1113 + }, + { + "epoch": 0.8318864930458322, + "grad_norm": 2.945367690866313, + "learning_rate": 7.232622879599882e-07, + "loss": 0.8896, + "step": 1114 + }, + { + "epoch": 0.8326332493232521, + "grad_norm": 2.786535060510927, + "learning_rate": 7.170054964568146e-07, + "loss": 0.9609, + "step": 1115 + }, + { + "epoch": 0.833380005600672, + "grad_norm": 4.311286527154838, + "learning_rate": 7.107737947805688e-07, + "loss": 1.0491, + "step": 1116 + }, + { + "epoch": 0.834126761878092, + "grad_norm": 3.356639280458624, + "learning_rate": 7.045672194366238e-07, + "loss": 0.8749, + "step": 1117 + }, + { + "epoch": 0.834873518155512, + "grad_norm": 2.703485397651996, + "learning_rate": 6.983858067831645e-07, + "loss": 0.9012, + "step": 1118 + }, + { + "epoch": 0.835620274432932, + "grad_norm": 4.065892457563659, + "learning_rate": 6.922295930309691e-07, + "loss": 1.0221, + "step": 1119 + }, + { + "epoch": 0.8363670307103519, + "grad_norm": 3.418730924372397, + "learning_rate": 6.860986142432057e-07, + "loss": 1.0233, + "step": 1120 + }, + { + "epoch": 0.8363670307103519, + "eval_loss": 0.9319419264793396, + "eval_runtime": 162.3687, + "eval_samples_per_second": 111.037, + "eval_steps_per_second": 1.737, + "step": 1120 + }, + { + "epoch": 0.8371137869877718, + "grad_norm": 2.996912276789265, + "learning_rate": 6.799929063352112e-07, + "loss": 0.9296, + "step": 1121 + }, + { + "epoch": 0.8378605432651918, + "grad_norm": 2.7030140370045483, + "learning_rate": 6.739125050742873e-07, + "loss": 1.014, + "step": 1122 + }, + { + "epoch": 0.8386072995426118, + "grad_norm": 2.7714835166723937, + "learning_rate": 6.678574460794879e-07, + "loss": 0.9642, + "step": 1123 + }, + { + "epoch": 0.8393540558200318, + "grad_norm": 3.0335494350345162, + "learning_rate": 6.618277648214127e-07, + "loss": 0.9585, + "step": 1124 + }, + { + "epoch": 0.8401008120974517, + "grad_norm": 3.05369699501868, + "learning_rate": 6.558234966219984e-07, + "loss": 0.9921, + "step": 1125 + }, + { + "epoch": 0.8408475683748716, + "grad_norm": 3.2366643164869218, + "learning_rate": 6.498446766543098e-07, + "loss": 0.969, + "step": 1126 + }, + { + "epoch": 0.8415943246522916, + "grad_norm": 3.3113556891155964, + "learning_rate": 6.438913399423396e-07, + "loss": 0.9841, + "step": 1127 + }, + { + "epoch": 0.8423410809297116, + "grad_norm": 3.742348768556645, + "learning_rate": 6.379635213607971e-07, + "loss": 0.9123, + "step": 1128 + }, + { + "epoch": 0.8430878372071315, + "grad_norm": 4.013813043282346, + "learning_rate": 6.320612556349027e-07, + "loss": 1.0005, + "step": 1129 + }, + { + "epoch": 0.8438345934845515, + "grad_norm": 3.235695639624682, + "learning_rate": 6.261845773401936e-07, + "loss": 0.9728, + "step": 1130 + }, + { + "epoch": 0.8445813497619714, + "grad_norm": 3.279666788848323, + "learning_rate": 6.203335209023137e-07, + "loss": 0.9955, + "step": 1131 + }, + { + "epoch": 0.8453281060393913, + "grad_norm": 2.985404353225797, + "learning_rate": 6.145081205968123e-07, + "loss": 0.9569, + "step": 1132 + }, + { + "epoch": 0.8460748623168114, + "grad_norm": 2.9264881627568933, + "learning_rate": 6.087084105489449e-07, + "loss": 0.97, + "step": 1133 + }, + { + "epoch": 0.8468216185942313, + "grad_norm": 2.8871274133914526, + "learning_rate": 6.029344247334773e-07, + "loss": 0.9063, + "step": 1134 + }, + { + "epoch": 0.8468216185942313, + "eval_loss": 0.9319166541099548, + "eval_runtime": 160.895, + "eval_samples_per_second": 112.054, + "eval_steps_per_second": 1.753, + "step": 1134 + }, + { + "epoch": 0.8475683748716513, + "grad_norm": 2.9254630933971604, + "learning_rate": 5.971861969744758e-07, + "loss": 0.9854, + "step": 1135 + }, + { + "epoch": 0.8483151311490712, + "grad_norm": 3.9276232939207363, + "learning_rate": 5.914637609451191e-07, + "loss": 1.0273, + "step": 1136 + }, + { + "epoch": 0.8490618874264911, + "grad_norm": 2.507415577305572, + "learning_rate": 5.857671501675005e-07, + "loss": 0.9728, + "step": 1137 + }, + { + "epoch": 0.8498086437039112, + "grad_norm": 2.4460330720059504, + "learning_rate": 5.800963980124241e-07, + "loss": 0.9021, + "step": 1138 + }, + { + "epoch": 0.8505553999813311, + "grad_norm": 3.0103053437385334, + "learning_rate": 5.744515376992155e-07, + "loss": 0.9459, + "step": 1139 + }, + { + "epoch": 0.851302156258751, + "grad_norm": 2.817309734726002, + "learning_rate": 5.688326022955276e-07, + "loss": 0.95, + "step": 1140 + }, + { + "epoch": 0.852048912536171, + "grad_norm": 2.6530611977707976, + "learning_rate": 5.632396247171429e-07, + "loss": 0.9195, + "step": 1141 + }, + { + "epoch": 0.8527956688135909, + "grad_norm": 3.2183899358377213, + "learning_rate": 5.576726377277803e-07, + "loss": 0.9738, + "step": 1142 + }, + { + "epoch": 0.853542425091011, + "grad_norm": 2.7141680726405926, + "learning_rate": 5.521316739389116e-07, + "loss": 0.9675, + "step": 1143 + }, + { + "epoch": 0.8542891813684309, + "grad_norm": 2.7689523546663146, + "learning_rate": 5.46616765809559e-07, + "loss": 0.9566, + "step": 1144 + }, + { + "epoch": 0.8550359376458508, + "grad_norm": 2.6466811955985095, + "learning_rate": 5.411279456461133e-07, + "loss": 0.865, + "step": 1145 + }, + { + "epoch": 0.8557826939232708, + "grad_norm": 3.5195514432600206, + "learning_rate": 5.3566524560214e-07, + "loss": 0.923, + "step": 1146 + }, + { + "epoch": 0.8565294502006907, + "grad_norm": 2.871952018242148, + "learning_rate": 5.302286976781956e-07, + "loss": 1.0447, + "step": 1147 + }, + { + "epoch": 0.8572762064781108, + "grad_norm": 2.925114725852367, + "learning_rate": 5.248183337216328e-07, + "loss": 0.9123, + "step": 1148 + }, + { + "epoch": 0.8572762064781108, + "eval_loss": 0.9314769506454468, + "eval_runtime": 162.7458, + "eval_samples_per_second": 110.78, + "eval_steps_per_second": 1.733, + "step": 1148 + }, + { + "epoch": 0.8580229627555307, + "grad_norm": 4.717832820933871, + "learning_rate": 5.194341854264206e-07, + "loss": 1.0074, + "step": 1149 + }, + { + "epoch": 0.8587697190329506, + "grad_norm": 3.809263009430757, + "learning_rate": 5.140762843329583e-07, + "loss": 0.9953, + "step": 1150 + }, + { + "epoch": 0.8595164753103706, + "grad_norm": 4.127384933862416, + "learning_rate": 5.087446618278858e-07, + "loss": 0.9496, + "step": 1151 + }, + { + "epoch": 0.8602632315877905, + "grad_norm": 4.437603789678461, + "learning_rate": 5.034393491439044e-07, + "loss": 0.9705, + "step": 1152 + }, + { + "epoch": 0.8610099878652105, + "grad_norm": 3.243478071729358, + "learning_rate": 4.981603773595911e-07, + "loss": 0.9363, + "step": 1153 + }, + { + "epoch": 0.8617567441426305, + "grad_norm": 2.6993410282554007, + "learning_rate": 4.929077773992186e-07, + "loss": 0.9113, + "step": 1154 + }, + { + "epoch": 0.8625035004200504, + "grad_norm": 2.91054239299717, + "learning_rate": 4.87681580032573e-07, + "loss": 0.9461, + "step": 1155 + }, + { + "epoch": 0.8632502566974704, + "grad_norm": 2.568086078013279, + "learning_rate": 4.824818158747718e-07, + "loss": 0.8377, + "step": 1156 + }, + { + "epoch": 0.8639970129748903, + "grad_norm": 3.029881375353677, + "learning_rate": 4.773085153860912e-07, + "loss": 0.9166, + "step": 1157 + }, + { + "epoch": 0.8647437692523102, + "grad_norm": 2.88437306298097, + "learning_rate": 4.7216170887177834e-07, + "loss": 0.9333, + "step": 1158 + }, + { + "epoch": 0.8654905255297303, + "grad_norm": 3.137367070847363, + "learning_rate": 4.6704142648188013e-07, + "loss": 0.9337, + "step": 1159 + }, + { + "epoch": 0.8662372818071502, + "grad_norm": 2.9359952034209273, + "learning_rate": 4.619476982110649e-07, + "loss": 0.9736, + "step": 1160 + }, + { + "epoch": 0.8669840380845701, + "grad_norm": 3.1468262493105184, + "learning_rate": 4.568805538984461e-07, + "loss": 0.9122, + "step": 1161 + }, + { + "epoch": 0.8677307943619901, + "grad_norm": 3.5463405622760105, + "learning_rate": 4.5184002322740784e-07, + "loss": 1.0057, + "step": 1162 + }, + { + "epoch": 0.8677307943619901, + "eval_loss": 0.9312469363212585, + "eval_runtime": 162.7083, + "eval_samples_per_second": 110.806, + "eval_steps_per_second": 1.733, + "step": 1162 + }, + { + "epoch": 0.86847755063941, + "grad_norm": 2.9402861286207607, + "learning_rate": 4.468261357254339e-07, + "loss": 0.918, + "step": 1163 + }, + { + "epoch": 0.8692243069168301, + "grad_norm": 3.5252882533020564, + "learning_rate": 4.41838920763929e-07, + "loss": 0.9697, + "step": 1164 + }, + { + "epoch": 0.86997106319425, + "grad_norm": 3.682368504764222, + "learning_rate": 4.368784075580512e-07, + "loss": 0.9509, + "step": 1165 + }, + { + "epoch": 0.8707178194716699, + "grad_norm": 3.3322856389909865, + "learning_rate": 4.319446251665388e-07, + "loss": 1.0236, + "step": 1166 + }, + { + "epoch": 0.8714645757490899, + "grad_norm": 3.4771965818234882, + "learning_rate": 4.2703760249154124e-07, + "loss": 0.9486, + "step": 1167 + }, + { + "epoch": 0.8722113320265098, + "grad_norm": 3.069323768435232, + "learning_rate": 4.221573682784486e-07, + "loss": 0.9591, + "step": 1168 + }, + { + "epoch": 0.8729580883039298, + "grad_norm": 4.090015934591448, + "learning_rate": 4.1730395111572397e-07, + "loss": 0.969, + "step": 1169 + }, + { + "epoch": 0.8737048445813498, + "grad_norm": 6.2126368267999705, + "learning_rate": 4.124773794347375e-07, + "loss": 0.9801, + "step": 1170 + }, + { + "epoch": 0.8744516008587697, + "grad_norm": 2.655406724824259, + "learning_rate": 4.0767768150959785e-07, + "loss": 0.9159, + "step": 1171 + }, + { + "epoch": 0.8751983571361897, + "grad_norm": 2.840595662469952, + "learning_rate": 4.0290488545698224e-07, + "loss": 0.8925, + "step": 1172 + }, + { + "epoch": 0.8759451134136096, + "grad_norm": 3.2666873166974373, + "learning_rate": 3.9815901923598354e-07, + "loss": 0.9983, + "step": 1173 + }, + { + "epoch": 0.8766918696910296, + "grad_norm": 2.899555759428235, + "learning_rate": 3.934401106479352e-07, + "loss": 0.983, + "step": 1174 + }, + { + "epoch": 0.8774386259684496, + "grad_norm": 3.082400103066319, + "learning_rate": 3.8874818733625363e-07, + "loss": 0.9847, + "step": 1175 + }, + { + "epoch": 0.8781853822458695, + "grad_norm": 3.4527125973541892, + "learning_rate": 3.8408327678627343e-07, + "loss": 0.9698, + "step": 1176 + }, + { + "epoch": 0.8781853822458695, + "eval_loss": 0.9311810731887817, + "eval_runtime": 161.1374, + "eval_samples_per_second": 111.886, + "eval_steps_per_second": 1.75, + "step": 1176 + }, + { + "epoch": 0.8789321385232894, + "grad_norm": 3.090415752994308, + "learning_rate": 3.79445406325093e-07, + "loss": 1.0307, + "step": 1177 + }, + { + "epoch": 0.8796788948007094, + "grad_norm": 2.6147140868395784, + "learning_rate": 3.7483460312140343e-07, + "loss": 0.9311, + "step": 1178 + }, + { + "epoch": 0.8804256510781294, + "grad_norm": 2.7086102542703174, + "learning_rate": 3.702508941853383e-07, + "loss": 0.9121, + "step": 1179 + }, + { + "epoch": 0.8811724073555494, + "grad_norm": 3.1100117614192557, + "learning_rate": 3.6569430636831496e-07, + "loss": 1.0066, + "step": 1180 + }, + { + "epoch": 0.8819191636329693, + "grad_norm": 3.5375133182103884, + "learning_rate": 3.611648663628725e-07, + "loss": 1.0192, + "step": 1181 + }, + { + "epoch": 0.8826659199103892, + "grad_norm": 3.4809829662795053, + "learning_rate": 3.566626007025159e-07, + "loss": 0.9227, + "step": 1182 + }, + { + "epoch": 0.8834126761878092, + "grad_norm": 2.807725591167475, + "learning_rate": 3.5218753576156837e-07, + "loss": 0.9324, + "step": 1183 + }, + { + "epoch": 0.8841594324652292, + "grad_norm": 4.1307538335322995, + "learning_rate": 3.477396977550052e-07, + "loss": 1.0402, + "step": 1184 + }, + { + "epoch": 0.8849061887426491, + "grad_norm": 3.1065655720227943, + "learning_rate": 3.433191127383079e-07, + "loss": 0.9984, + "step": 1185 + }, + { + "epoch": 0.8856529450200691, + "grad_norm": 3.0423671480418006, + "learning_rate": 3.3892580660731146e-07, + "loss": 0.9477, + "step": 1186 + }, + { + "epoch": 0.886399701297489, + "grad_norm": 5.506314111297366, + "learning_rate": 3.3455980509804865e-07, + "loss": 0.9375, + "step": 1187 + }, + { + "epoch": 0.887146457574909, + "grad_norm": 2.7257294061538526, + "learning_rate": 3.302211337866029e-07, + "loss": 0.9616, + "step": 1188 + }, + { + "epoch": 0.887893213852329, + "grad_norm": 4.286522315516212, + "learning_rate": 3.2590981808895637e-07, + "loss": 1.002, + "step": 1189 + }, + { + "epoch": 0.8886399701297489, + "grad_norm": 3.266886884058907, + "learning_rate": 3.21625883260841e-07, + "loss": 0.9467, + "step": 1190 + }, + { + "epoch": 0.8886399701297489, + "eval_loss": 0.9309723377227783, + "eval_runtime": 161.2639, + "eval_samples_per_second": 111.798, + "eval_steps_per_second": 1.749, + "step": 1190 + }, + { + "epoch": 0.8893867264071689, + "grad_norm": 2.594459317115802, + "learning_rate": 3.173693543975931e-07, + "loss": 0.9563, + "step": 1191 + }, + { + "epoch": 0.8901334826845888, + "grad_norm": 3.4125681478868297, + "learning_rate": 3.1314025643400246e-07, + "loss": 0.9761, + "step": 1192 + }, + { + "epoch": 0.8908802389620087, + "grad_norm": 3.2367487026935855, + "learning_rate": 3.089386141441714e-07, + "loss": 1.0675, + "step": 1193 + }, + { + "epoch": 0.8916269952394287, + "grad_norm": 2.7617284333366316, + "learning_rate": 3.0476445214136343e-07, + "loss": 0.9114, + "step": 1194 + }, + { + "epoch": 0.8923737515168487, + "grad_norm": 3.4059049786673707, + "learning_rate": 3.0061779487786325e-07, + "loss": 1.0092, + "step": 1195 + }, + { + "epoch": 0.8931205077942687, + "grad_norm": 2.9466488189376094, + "learning_rate": 2.9649866664483387e-07, + "loss": 1.0071, + "step": 1196 + }, + { + "epoch": 0.8938672640716886, + "grad_norm": 3.2214019544717356, + "learning_rate": 2.9240709157217107e-07, + "loss": 0.9705, + "step": 1197 + }, + { + "epoch": 0.8946140203491085, + "grad_norm": 2.92519808633413, + "learning_rate": 2.883430936283649e-07, + "loss": 0.9843, + "step": 1198 + }, + { + "epoch": 0.8953607766265285, + "grad_norm": 3.173147787422756, + "learning_rate": 2.8430669662035784e-07, + "loss": 0.961, + "step": 1199 + }, + { + "epoch": 0.8961075329039485, + "grad_norm": 3.73113933358305, + "learning_rate": 2.802979241934067e-07, + "loss": 0.9631, + "step": 1200 + }, + { + "epoch": 0.8968542891813684, + "grad_norm": 2.8388033769300582, + "learning_rate": 2.7631679983094293e-07, + "loss": 0.9686, + "step": 1201 + }, + { + "epoch": 0.8976010454587884, + "grad_norm": 2.580484676835834, + "learning_rate": 2.72363346854434e-07, + "loss": 0.8967, + "step": 1202 + }, + { + "epoch": 0.8983478017362083, + "grad_norm": 2.6372158270212616, + "learning_rate": 2.684375884232493e-07, + "loss": 0.8848, + "step": 1203 + }, + { + "epoch": 0.8990945580136283, + "grad_norm": 5.045405465113533, + "learning_rate": 2.645395475345236e-07, + "loss": 0.9653, + "step": 1204 + }, + { + "epoch": 0.8990945580136283, + "eval_loss": 0.9308164119720459, + "eval_runtime": 160.9645, + "eval_samples_per_second": 112.006, + "eval_steps_per_second": 1.752, + "step": 1204 + }, + { + "epoch": 0.8998413142910483, + "grad_norm": 3.81237492723543, + "learning_rate": 2.6066924702302044e-07, + "loss": 0.985, + "step": 1205 + }, + { + "epoch": 0.9005880705684682, + "grad_norm": 4.444299985817935, + "learning_rate": 2.568267095610022e-07, + "loss": 0.9718, + "step": 1206 + }, + { + "epoch": 0.9013348268458882, + "grad_norm": 3.213116580736095, + "learning_rate": 2.530119576580936e-07, + "loss": 0.95, + "step": 1207 + }, + { + "epoch": 0.9020815831233081, + "grad_norm": 3.304199593315397, + "learning_rate": 2.492250136611513e-07, + "loss": 0.9401, + "step": 1208 + }, + { + "epoch": 0.902828339400728, + "grad_norm": 3.216958801550224, + "learning_rate": 2.454658997541326e-07, + "loss": 0.9949, + "step": 1209 + }, + { + "epoch": 0.9035750956781481, + "grad_norm": 2.8366418091093633, + "learning_rate": 2.417346379579671e-07, + "loss": 0.9415, + "step": 1210 + }, + { + "epoch": 0.904321851955568, + "grad_norm": 3.4418483482994353, + "learning_rate": 2.380312501304255e-07, + "loss": 0.9263, + "step": 1211 + }, + { + "epoch": 0.905068608232988, + "grad_norm": 3.2162625122987283, + "learning_rate": 2.343557579659922e-07, + "loss": 0.9481, + "step": 1212 + }, + { + "epoch": 0.9058153645104079, + "grad_norm": 3.6606526579447363, + "learning_rate": 2.3070818299573972e-07, + "loss": 0.9496, + "step": 1213 + }, + { + "epoch": 0.9065621207878278, + "grad_norm": 3.413782160357342, + "learning_rate": 2.2708854658720135e-07, + "loss": 0.8925, + "step": 1214 + }, + { + "epoch": 0.9073088770652479, + "grad_norm": 3.37342109636671, + "learning_rate": 2.2349686994424303e-07, + "loss": 0.9775, + "step": 1215 + }, + { + "epoch": 0.9080556333426678, + "grad_norm": 3.226917932046761, + "learning_rate": 2.1993317410694605e-07, + "loss": 0.9228, + "step": 1216 + }, + { + "epoch": 0.9088023896200877, + "grad_norm": 2.9170592051096937, + "learning_rate": 2.1639747995147843e-07, + "loss": 0.9238, + "step": 1217 + }, + { + "epoch": 0.9095491458975077, + "grad_norm": 4.372205178863148, + "learning_rate": 2.1288980818997272e-07, + "loss": 0.974, + "step": 1218 + }, + { + "epoch": 0.9095491458975077, + "eval_loss": 0.9307811260223389, + "eval_runtime": 162.4851, + "eval_samples_per_second": 110.958, + "eval_steps_per_second": 1.736, + "step": 1218 + }, + { + "epoch": 0.9102959021749276, + "grad_norm": 2.9674581524340713, + "learning_rate": 2.094101793704073e-07, + "loss": 0.9195, + "step": 1219 + }, + { + "epoch": 0.9110426584523477, + "grad_norm": 2.9235998255992537, + "learning_rate": 2.0595861387648574e-07, + "loss": 0.9142, + "step": 1220 + }, + { + "epoch": 0.9117894147297676, + "grad_norm": 3.485012716463773, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.9069, + "step": 1221 + }, + { + "epoch": 0.9125361710071875, + "grad_norm": 2.6461707104119876, + "learning_rate": 1.9913975357828408e-07, + "loss": 0.9067, + "step": 1222 + }, + { + "epoch": 0.9132829272846075, + "grad_norm": 2.661675283327674, + "learning_rate": 1.957724987189602e-07, + "loss": 0.9367, + "step": 1223 + }, + { + "epoch": 0.9140296835620274, + "grad_norm": 3.1094316780050812, + "learning_rate": 1.9243338707495618e-07, + "loss": 0.8989, + "step": 1224 + }, + { + "epoch": 0.9147764398394475, + "grad_norm": 3.183968339471646, + "learning_rate": 1.8912243820682296e-07, + "loss": 0.9342, + "step": 1225 + }, + { + "epoch": 0.9155231961168674, + "grad_norm": 2.9166944722749273, + "learning_rate": 1.858396715101346e-07, + "loss": 1.0136, + "step": 1226 + }, + { + "epoch": 0.9162699523942873, + "grad_norm": 4.554593719123295, + "learning_rate": 1.8258510621537219e-07, + "loss": 0.9789, + "step": 1227 + }, + { + "epoch": 0.9170167086717073, + "grad_norm": 2.8500992040594473, + "learning_rate": 1.7935876138781284e-07, + "loss": 0.9422, + "step": 1228 + }, + { + "epoch": 0.9177634649491272, + "grad_norm": 3.314999491197659, + "learning_rate": 1.7616065592742038e-07, + "loss": 0.9746, + "step": 1229 + }, + { + "epoch": 0.9185102212265472, + "grad_norm": 2.785252191895322, + "learning_rate": 1.729908085687293e-07, + "loss": 0.8996, + "step": 1230 + }, + { + "epoch": 0.9192569775039672, + "grad_norm": 3.312621983480616, + "learning_rate": 1.698492378807387e-07, + "loss": 0.9983, + "step": 1231 + }, + { + "epoch": 0.9200037337813871, + "grad_norm": 2.970201392187938, + "learning_rate": 1.6673596226680356e-07, + "loss": 0.9679, + "step": 1232 + }, + { + "epoch": 0.9200037337813871, + "eval_loss": 0.930705726146698, + "eval_runtime": 162.5052, + "eval_samples_per_second": 110.944, + "eval_steps_per_second": 1.735, + "step": 1232 + }, + { + "epoch": 0.920750490058807, + "grad_norm": 2.766948354777863, + "learning_rate": 1.6365099996452416e-07, + "loss": 0.9815, + "step": 1233 + }, + { + "epoch": 0.921497246336227, + "grad_norm": 3.775583043641789, + "learning_rate": 1.6059436904564296e-07, + "loss": 0.9999, + "step": 1234 + }, + { + "epoch": 0.9222440026136469, + "grad_norm": 3.1430081879332823, + "learning_rate": 1.575660874159346e-07, + "loss": 0.969, + "step": 1235 + }, + { + "epoch": 0.922990758891067, + "grad_norm": 3.861696256410228, + "learning_rate": 1.545661728151071e-07, + "loss": 1.0212, + "step": 1236 + }, + { + "epoch": 0.9237375151684869, + "grad_norm": 3.6672560176616704, + "learning_rate": 1.515946428166909e-07, + "loss": 0.9356, + "step": 1237 + }, + { + "epoch": 0.9244842714459068, + "grad_norm": 4.106696856846996, + "learning_rate": 1.4865151482793938e-07, + "loss": 0.9449, + "step": 1238 + }, + { + "epoch": 0.9252310277233268, + "grad_norm": 2.9579773642607066, + "learning_rate": 1.4573680608972796e-07, + "loss": 0.9129, + "step": 1239 + }, + { + "epoch": 0.9259777840007467, + "grad_norm": 2.808101044256339, + "learning_rate": 1.4285053367645074e-07, + "loss": 0.866, + "step": 1240 + }, + { + "epoch": 0.9267245402781668, + "grad_norm": 2.9339064632805307, + "learning_rate": 1.3999271449592177e-07, + "loss": 0.9948, + "step": 1241 + }, + { + "epoch": 0.9274712965555867, + "grad_norm": 3.0554176273960563, + "learning_rate": 1.371633652892762e-07, + "loss": 1.0511, + "step": 1242 + }, + { + "epoch": 0.9282180528330066, + "grad_norm": 5.510569175235229, + "learning_rate": 1.3436250263087204e-07, + "loss": 0.928, + "step": 1243 + }, + { + "epoch": 0.9289648091104266, + "grad_norm": 2.844624274973105, + "learning_rate": 1.3159014292819126e-07, + "loss": 0.9032, + "step": 1244 + }, + { + "epoch": 0.9297115653878465, + "grad_norm": 4.434412677362633, + "learning_rate": 1.2884630242174734e-07, + "loss": 0.9385, + "step": 1245 + }, + { + "epoch": 0.9304583216652665, + "grad_norm": 3.4872159250577295, + "learning_rate": 1.2613099718498556e-07, + "loss": 0.9364, + "step": 1246 + }, + { + "epoch": 0.9304583216652665, + "eval_loss": 0.9305473566055298, + "eval_runtime": 162.4805, + "eval_samples_per_second": 110.961, + "eval_steps_per_second": 1.736, + "step": 1246 + }, + { + "epoch": 0.9312050779426865, + "grad_norm": 3.5713420230443695, + "learning_rate": 1.234442431241939e-07, + "loss": 1.0143, + "step": 1247 + }, + { + "epoch": 0.9319518342201064, + "grad_norm": 2.8182072549192214, + "learning_rate": 1.207860559784052e-07, + "loss": 0.9826, + "step": 1248 + }, + { + "epoch": 0.9326985904975263, + "grad_norm": 2.875098039824205, + "learning_rate": 1.181564513193073e-07, + "loss": 0.9561, + "step": 1249 + }, + { + "epoch": 0.9334453467749463, + "grad_norm": 3.08129402935908, + "learning_rate": 1.1555544455115253e-07, + "loss": 0.9792, + "step": 1250 + }, + { + "epoch": 0.9341921030523663, + "grad_norm": 4.008794109759551, + "learning_rate": 1.1298305091066664e-07, + "loss": 1.0236, + "step": 1251 + }, + { + "epoch": 0.9349388593297863, + "grad_norm": 3.25803536742016, + "learning_rate": 1.1043928546695782e-07, + "loss": 0.9582, + "step": 1252 + }, + { + "epoch": 0.9356856156072062, + "grad_norm": 2.8272859290809005, + "learning_rate": 1.0792416312143172e-07, + "loss": 0.939, + "step": 1253 + }, + { + "epoch": 0.9364323718846261, + "grad_norm": 3.27251051941531, + "learning_rate": 1.0543769860769992e-07, + "loss": 0.9998, + "step": 1254 + }, + { + "epoch": 0.9371791281620461, + "grad_norm": 3.3364183862421113, + "learning_rate": 1.029799064914988e-07, + "loss": 0.9757, + "step": 1255 + }, + { + "epoch": 0.9379258844394661, + "grad_norm": 3.3619896026786265, + "learning_rate": 1.0055080117060079e-07, + "loss": 1.0142, + "step": 1256 + }, + { + "epoch": 0.938672640716886, + "grad_norm": 3.0948786475102508, + "learning_rate": 9.81503968747305e-08, + "loss": 0.9619, + "step": 1257 + }, + { + "epoch": 0.939419396994306, + "grad_norm": 3.0788291331198674, + "learning_rate": 9.577870766547981e-08, + "loss": 0.9774, + "step": 1258 + }, + { + "epoch": 0.9401661532717259, + "grad_norm": 3.2586272595359516, + "learning_rate": 9.34357474362313e-08, + "loss": 0.9681, + "step": 1259 + }, + { + "epoch": 0.9409129095491459, + "grad_norm": 3.0686211394939424, + "learning_rate": 9.112152991206991e-08, + "loss": 0.992, + "step": 1260 + }, + { + "epoch": 0.9409129095491459, + "eval_loss": 0.9305338859558105, + "eval_runtime": 161.1412, + "eval_samples_per_second": 111.883, + "eval_steps_per_second": 1.75, + "step": 1260 + }, + { + "epoch": 0.9416596658265659, + "grad_norm": 2.8839831606268143, + "learning_rate": 8.883606864970585e-08, + "loss": 0.8647, + "step": 1261 + }, + { + "epoch": 0.9424064221039858, + "grad_norm": 3.5045862915012886, + "learning_rate": 8.657937703739516e-08, + "loss": 0.9615, + "step": 1262 + }, + { + "epoch": 0.9431531783814058, + "grad_norm": 3.013831723584786, + "learning_rate": 8.435146829486263e-08, + "loss": 0.9761, + "step": 1263 + }, + { + "epoch": 0.9438999346588257, + "grad_norm": 3.05235918180062, + "learning_rate": 8.215235547321897e-08, + "loss": 0.9317, + "step": 1264 + }, + { + "epoch": 0.9446466909362456, + "grad_norm": 2.8893059761689592, + "learning_rate": 7.998205145489157e-08, + "loss": 0.9059, + "step": 1265 + }, + { + "epoch": 0.9453934472136657, + "grad_norm": 2.9521748418786355, + "learning_rate": 7.784056895354386e-08, + "loss": 0.9681, + "step": 1266 + }, + { + "epoch": 0.9461402034910856, + "grad_norm": 3.1653409809050665, + "learning_rate": 7.572792051400325e-08, + "loss": 0.9913, + "step": 1267 + }, + { + "epoch": 0.9468869597685056, + "grad_norm": 2.6124719184273477, + "learning_rate": 7.364411851218667e-08, + "loss": 0.8977, + "step": 1268 + }, + { + "epoch": 0.9476337160459255, + "grad_norm": 2.858057980612677, + "learning_rate": 7.158917515502739e-08, + "loss": 0.9498, + "step": 1269 + }, + { + "epoch": 0.9483804723233454, + "grad_norm": 3.549096737039596, + "learning_rate": 6.95631024804061e-08, + "loss": 0.9432, + "step": 1270 + }, + { + "epoch": 0.9491272286007654, + "grad_norm": 3.1098491003954307, + "learning_rate": 6.75659123570771e-08, + "loss": 0.9664, + "step": 1271 + }, + { + "epoch": 0.9498739848781854, + "grad_norm": 3.4670315690611377, + "learning_rate": 6.559761648460117e-08, + "loss": 0.8994, + "step": 1272 + }, + { + "epoch": 0.9506207411556054, + "grad_norm": 4.325331230417902, + "learning_rate": 6.365822639327724e-08, + "loss": 0.9636, + "step": 1273 + }, + { + "epoch": 0.9513674974330253, + "grad_norm": 2.9355017549957942, + "learning_rate": 6.174775344407246e-08, + "loss": 0.9531, + "step": 1274 + }, + { + "epoch": 0.9513674974330253, + "eval_loss": 0.9305031299591064, + "eval_runtime": 162.3818, + "eval_samples_per_second": 111.028, + "eval_steps_per_second": 1.737, + "step": 1274 + }, + { + "epoch": 0.9521142537104452, + "grad_norm": 2.7787168628182286, + "learning_rate": 5.986620882855676e-08, + "loss": 0.9989, + "step": 1275 + }, + { + "epoch": 0.9528610099878652, + "grad_norm": 3.250198943576934, + "learning_rate": 5.801360356883945e-08, + "loss": 1.0186, + "step": 1276 + }, + { + "epoch": 0.9536077662652852, + "grad_norm": 3.5045478545742372, + "learning_rate": 5.618994851750104e-08, + "loss": 1.0243, + "step": 1277 + }, + { + "epoch": 0.9543545225427051, + "grad_norm": 3.596766310602903, + "learning_rate": 5.439525435753157e-08, + "loss": 1.0089, + "step": 1278 + }, + { + "epoch": 0.9551012788201251, + "grad_norm": 3.2110723577196394, + "learning_rate": 5.262953160226958e-08, + "loss": 1.0486, + "step": 1279 + }, + { + "epoch": 0.955848035097545, + "grad_norm": 2.945046395031802, + "learning_rate": 5.089279059533658e-08, + "loss": 0.8746, + "step": 1280 + }, + { + "epoch": 0.956594791374965, + "grad_norm": 2.742862157777454, + "learning_rate": 4.91850415105799e-08, + "loss": 0.905, + "step": 1281 + }, + { + "epoch": 0.957341547652385, + "grad_norm": 2.816790640635472, + "learning_rate": 4.7506294352011596e-08, + "loss": 0.9554, + "step": 1282 + }, + { + "epoch": 0.9580883039298049, + "grad_norm": 3.3497034429748975, + "learning_rate": 4.5856558953750744e-08, + "loss": 0.8932, + "step": 1283 + }, + { + "epoch": 0.9588350602072249, + "grad_norm": 2.9107805279966543, + "learning_rate": 4.423584497996458e-08, + "loss": 0.9526, + "step": 1284 + }, + { + "epoch": 0.9595818164846448, + "grad_norm": 2.700904755255844, + "learning_rate": 4.2644161924811353e-08, + "loss": 0.9566, + "step": 1285 + }, + { + "epoch": 0.9603285727620647, + "grad_norm": 3.2097128106381225, + "learning_rate": 4.108151911238922e-08, + "loss": 0.9119, + "step": 1286 + }, + { + "epoch": 0.9610753290394848, + "grad_norm": 3.1787175007546455, + "learning_rate": 3.9547925696675206e-08, + "loss": 0.9617, + "step": 1287 + }, + { + "epoch": 0.9618220853169047, + "grad_norm": 3.468752659560656, + "learning_rate": 3.804339066147467e-08, + "loss": 1.004, + "step": 1288 + }, + { + "epoch": 0.9618220853169047, + "eval_loss": 0.9304465055465698, + "eval_runtime": 162.5341, + "eval_samples_per_second": 110.924, + "eval_steps_per_second": 1.735, + "step": 1288 + }, + { + "epoch": 0.9625688415943247, + "grad_norm": 2.8644336249094513, + "learning_rate": 3.656792282037136e-08, + "loss": 0.8925, + "step": 1289 + }, + { + "epoch": 0.9633155978717446, + "grad_norm": 3.0822614830737627, + "learning_rate": 3.512153081667135e-08, + "loss": 0.9553, + "step": 1290 + }, + { + "epoch": 0.9640623541491645, + "grad_norm": 2.907482583857242, + "learning_rate": 3.370422312335309e-08, + "loss": 0.971, + "step": 1291 + }, + { + "epoch": 0.9648091104265846, + "grad_norm": 3.070952453759052, + "learning_rate": 3.2316008043020154e-08, + "loss": 0.9381, + "step": 1292 + }, + { + "epoch": 0.9655558667040045, + "grad_norm": 3.397673851079207, + "learning_rate": 3.095689370785249e-08, + "loss": 0.9627, + "step": 1293 + }, + { + "epoch": 0.9663026229814244, + "grad_norm": 2.9351868857737684, + "learning_rate": 2.9626888079554716e-08, + "loss": 0.9772, + "step": 1294 + }, + { + "epoch": 0.9670493792588444, + "grad_norm": 2.8830349043131154, + "learning_rate": 2.8325998949314536e-08, + "loss": 0.9592, + "step": 1295 + }, + { + "epoch": 0.9677961355362643, + "grad_norm": 2.7889585719099848, + "learning_rate": 2.705423393775386e-08, + "loss": 0.9692, + "step": 1296 + }, + { + "epoch": 0.9685428918136844, + "grad_norm": 3.3872739189532446, + "learning_rate": 2.5811600494885512e-08, + "loss": 0.9343, + "step": 1297 + }, + { + "epoch": 0.9692896480911043, + "grad_norm": 3.735163743338174, + "learning_rate": 2.4598105900069392e-08, + "loss": 0.9593, + "step": 1298 + }, + { + "epoch": 0.9700364043685242, + "grad_norm": 2.71350691275283, + "learning_rate": 2.341375726197026e-08, + "loss": 0.9819, + "step": 1299 + }, + { + "epoch": 0.9707831606459442, + "grad_norm": 2.6806573316875517, + "learning_rate": 2.2258561518513912e-08, + "loss": 0.9477, + "step": 1300 + }, + { + "epoch": 0.9715299169233641, + "grad_norm": 3.2230529938354278, + "learning_rate": 2.1132525436849406e-08, + "loss": 0.9044, + "step": 1301 + }, + { + "epoch": 0.9722766732007841, + "grad_norm": 3.2452809812911276, + "learning_rate": 2.003565561330856e-08, + "loss": 0.8966, + "step": 1302 + }, + { + "epoch": 0.9722766732007841, + "eval_loss": 0.9304633736610413, + "eval_runtime": 162.5458, + "eval_samples_per_second": 110.916, + "eval_steps_per_second": 1.735, + "step": 1302 + }, + { + "epoch": 0.9730234294782041, + "grad_norm": 3.169797354119332, + "learning_rate": 1.896795847336541e-08, + "loss": 0.9213, + "step": 1303 + }, + { + "epoch": 0.973770185755624, + "grad_norm": 2.9915639532159046, + "learning_rate": 1.792944027160237e-08, + "loss": 0.8673, + "step": 1304 + }, + { + "epoch": 0.974516942033044, + "grad_norm": 4.0354586375745765, + "learning_rate": 1.6920107091668582e-08, + "loss": 0.9383, + "step": 1305 + }, + { + "epoch": 0.9752636983104639, + "grad_norm": 2.8461272392012247, + "learning_rate": 1.593996484624938e-08, + "loss": 0.9598, + "step": 1306 + }, + { + "epoch": 0.9760104545878838, + "grad_norm": 3.0983794245126086, + "learning_rate": 1.4989019277028004e-08, + "loss": 1.0039, + "step": 1307 + }, + { + "epoch": 0.9767572108653039, + "grad_norm": 3.1350242555646743, + "learning_rate": 1.4067275954653403e-08, + "loss": 0.9235, + "step": 1308 + }, + { + "epoch": 0.9775039671427238, + "grad_norm": 2.9339669005824467, + "learning_rate": 1.3174740278708575e-08, + "loss": 0.9089, + "step": 1309 + }, + { + "epoch": 0.9782507234201437, + "grad_norm": 4.474824857246958, + "learning_rate": 1.2311417477676168e-08, + "loss": 0.9183, + "step": 1310 + }, + { + "epoch": 0.9789974796975637, + "grad_norm": 2.989598754747387, + "learning_rate": 1.1477312608910162e-08, + "loss": 0.9991, + "step": 1311 + }, + { + "epoch": 0.9797442359749836, + "grad_norm": 2.9625742101217014, + "learning_rate": 1.0672430558605895e-08, + "loss": 0.953, + "step": 1312 + }, + { + "epoch": 0.9804909922524037, + "grad_norm": 2.951799943908215, + "learning_rate": 9.89677604177064e-09, + "loss": 0.9771, + "step": 1313 + }, + { + "epoch": 0.9812377485298236, + "grad_norm": 3.131290656868523, + "learning_rate": 9.150353602197516e-09, + "loss": 0.8718, + "step": 1314 + }, + { + "epoch": 0.9819845048072435, + "grad_norm": 2.953246971653502, + "learning_rate": 8.433167612436066e-09, + "loss": 0.9561, + "step": 1315 + }, + { + "epoch": 0.9827312610846635, + "grad_norm": 3.994087349668577, + "learning_rate": 7.745222273770059e-09, + "loss": 0.9905, + "step": 1316 + }, + { + "epoch": 0.9827312610846635, + "eval_loss": 0.9304625988006592, + "eval_runtime": 162.5204, + "eval_samples_per_second": 110.934, + "eval_steps_per_second": 1.735, + "step": 1316 + }, + { + "epoch": 0.9834780173620834, + "grad_norm": 2.854502234217566, + "learning_rate": 7.0865216161902785e-09, + "loss": 1.0256, + "step": 1317 + }, + { + "epoch": 0.9842247736395034, + "grad_norm": 3.1821866741449867, + "learning_rate": 6.457069498372326e-09, + "loss": 0.9699, + "step": 1318 + }, + { + "epoch": 0.9849715299169234, + "grad_norm": 2.869146882764378, + "learning_rate": 5.856869607652749e-09, + "loss": 0.9833, + "step": 1319 + }, + { + "epoch": 0.9857182861943433, + "grad_norm": 3.1328413430563975, + "learning_rate": 5.285925460009056e-09, + "loss": 0.9053, + "step": 1320 + }, + { + "epoch": 0.9864650424717633, + "grad_norm": 2.6428303935877917, + "learning_rate": 4.744240400038624e-09, + "loss": 0.9862, + "step": 1321 + }, + { + "epoch": 0.9872117987491832, + "grad_norm": 3.6370165390584277, + "learning_rate": 4.231817600938159e-09, + "loss": 0.9705, + "step": 1322 + }, + { + "epoch": 0.9879585550266032, + "grad_norm": 3.206324861128839, + "learning_rate": 3.748660064484821e-09, + "loss": 0.9382, + "step": 1323 + }, + { + "epoch": 0.9887053113040232, + "grad_norm": 3.3127354281132635, + "learning_rate": 3.2947706210217923e-09, + "loss": 0.9847, + "step": 1324 + }, + { + "epoch": 0.9894520675814431, + "grad_norm": 2.5984978136593284, + "learning_rate": 2.8701519294371815e-09, + "loss": 0.9211, + "step": 1325 + }, + { + "epoch": 0.990198823858863, + "grad_norm": 2.806011670037097, + "learning_rate": 2.4748064771529247e-09, + "loss": 0.9384, + "step": 1326 + }, + { + "epoch": 0.990945580136283, + "grad_norm": 3.06945560096709, + "learning_rate": 2.1087365801053526e-09, + "loss": 0.9233, + "step": 1327 + }, + { + "epoch": 0.991692336413703, + "grad_norm": 3.170045754610596, + "learning_rate": 1.7719443827368677e-09, + "loss": 1.0009, + "step": 1328 + }, + { + "epoch": 0.992439092691123, + "grad_norm": 2.5106733677134887, + "learning_rate": 1.4644318579798422e-09, + "loss": 0.8646, + "step": 1329 + }, + { + "epoch": 0.9931858489685429, + "grad_norm": 2.6476074361565733, + "learning_rate": 1.186200807245519e-09, + "loss": 1.007, + "step": 1330 + }, + { + "epoch": 0.9931858489685429, + "eval_loss": 0.9304366111755371, + "eval_runtime": 160.9597, + "eval_samples_per_second": 112.009, + "eval_steps_per_second": 1.752, + "step": 1330 + }, + { + "epoch": 0.9939326052459628, + "grad_norm": 3.095198427340493, + "learning_rate": 9.372528604134623e-10, + "loss": 0.9486, + "step": 1331 + }, + { + "epoch": 0.9946793615233828, + "grad_norm": 3.941739091074789, + "learning_rate": 7.17589475824898e-10, + "loss": 0.9998, + "step": 1332 + }, + { + "epoch": 0.9954261178008028, + "grad_norm": 2.6143928805877543, + "learning_rate": 5.272119402693898e-10, + "loss": 0.9061, + "step": 1333 + }, + { + "epoch": 0.9961728740782227, + "grad_norm": 3.161138570652923, + "learning_rate": 3.6612136898039885e-10, + "loss": 0.9688, + "step": 1334 + }, + { + "epoch": 0.9969196303556427, + "grad_norm": 4.104053917839182, + "learning_rate": 2.3431870562917735e-10, + "loss": 1.0086, + "step": 1335 + }, + { + "epoch": 0.9976663866330626, + "grad_norm": 2.9990664641234006, + "learning_rate": 1.3180472231588694e-10, + "loss": 1.0231, + "step": 1336 + }, + { + "epoch": 0.9984131429104826, + "grad_norm": 2.6250712172326782, + "learning_rate": 5.858001956904335e-11, + "loss": 0.9374, + "step": 1337 + }, + { + "epoch": 0.9991598991879026, + "grad_norm": 2.8904822230789065, + "learning_rate": 1.464502633996556e-11, + "loss": 0.87, + "step": 1338 + }, + { + "epoch": 0.9999066554653225, + "grad_norm": 2.4805900744497618, + "learning_rate": 0.0, + "loss": 0.8749, + "step": 1339 + }, + { + "epoch": 0.9999066554653225, + "step": 1339, + "total_flos": 1690624755499008.0, + "train_loss": 1.0187733439120719, + "train_runtime": 28145.7978, + "train_samples_per_second": 12.179, + "train_steps_per_second": 0.048 + } + ], + "logging_steps": 1.0, + "max_steps": 1339, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 134, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1690624755499008.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..2328f5b --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b6e78dbf46da3dc16aa0fed6b448c38181be5e4fb53d4e900ba4d1bd90f2ed7 +size 7096