From ace6e29fbee90a94ac75ddfbf87b31dccd1e1bf0 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sat, 20 Jun 2026 17:34:31 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: edbeeching/Qwen3-4B-Thinking-2507-SFT-tr5 Source: Original Platform --- .gitattributes | 36 + README.md | 61 + all_results.json | 9 + chat_template.jinja | 86 + config.json | 71 + generation_config.json | 9 + model.safetensors | 3 + tokenizer.json | 3 + tokenizer_config.json | 29 + train_results.json | 9 + trainer_state.json | 5003 ++++++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 12 files changed, 5322 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..bceacc1 --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +--- +base_model: Qwen/Qwen3-4B-Thinking-2507 +library_name: transformers +model_name: Qwen3-4B-Thinking-2507-SFT-tr5 +tags: +- generated_from_trainer +- trackio +- trl-internal +- trackio:https://huggingface.co/spaces/hf-imo-colab/trackio-distillation-sft +- sft +- trl +licence: license +--- + +# Model Card for Qwen3-4B-Thinking-2507-SFT-tr5 + +This model is a fine-tuned version of [Qwen/Qwen3-4B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="edbeeching/Qwen3-4B-Thinking-2507-SFT-tr5", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/huggingface/imo-distillation/runs/18h3l5nj) + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.27.0.dev0 +- Transformers: 5.3.0.dev0 +- Pytorch: 2.10.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..7378daa --- /dev/null +++ b/all_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 4.6268656716417915, + "total_flos": 829282868854784.0, + "train_loss": 0.02679153286641644, + "train_runtime": 1845.6941, + "train_samples": 4281, + "train_samples_per_second": 10.749, + "train_steps_per_second": 0.336 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..2e2f69c --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,86 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e406718 --- /dev/null +++ b/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 262144, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 5000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.3.0.dev0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..7469251 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "do_sample": true, + "eos_token_id": 151645, + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.3.0.dev0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..7c6de8a --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15018ea756d7ac7854eb2316b328ee60b3436037367e47ec08133d1ca52efdd0 +size 8044982080 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..501f11b --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 262144, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..7378daa --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 4.6268656716417915, + "total_flos": 829282868854784.0, + "train_loss": 0.02679153286641644, + "train_runtime": 1845.6941, + "train_samples": 4281, + "train_samples_per_second": 10.749, + "train_steps_per_second": 0.336 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..26396d8 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,5003 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.6268656716417915, + "eval_steps": 500, + "global_step": 620, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007462686567164179, + "grad_norm": 7.744417324010015, + "learning_rate": 0.0, + "loss": 0.8532977104187012, + "num_tokens": 940199.0, + "step": 1 + }, + { + "epoch": 0.014925373134328358, + "grad_norm": 7.534592349315775, + "learning_rate": 1.5789473684210526e-06, + "loss": 0.870805561542511, + "num_tokens": 1940958.0, + "step": 2 + }, + { + "epoch": 0.022388059701492536, + "grad_norm": 7.301629258268991, + "learning_rate": 3.157894736842105e-06, + "loss": 0.8422647714614868, + "num_tokens": 2857380.0, + "step": 3 + }, + { + "epoch": 0.029850746268656716, + "grad_norm": 6.897211503812214, + "learning_rate": 4.736842105263158e-06, + "loss": 0.8292515277862549, + "num_tokens": 3696403.0, + "step": 4 + }, + { + "epoch": 0.03731343283582089, + "grad_norm": 4.507105826947362, + "learning_rate": 6.31578947368421e-06, + "loss": 0.7875182628631592, + "num_tokens": 4528235.0, + "step": 5 + }, + { + "epoch": 0.04477611940298507, + "grad_norm": 2.305703326092364, + "learning_rate": 7.894736842105263e-06, + "loss": 0.7126146554946899, + "num_tokens": 5554672.0, + "step": 6 + }, + { + "epoch": 0.05223880597014925, + "grad_norm": 1.921229796923974, + "learning_rate": 9.473684210526315e-06, + "loss": 0.6916477680206299, + "num_tokens": 6423132.0, + "step": 7 + }, + { + "epoch": 0.05970149253731343, + "grad_norm": 2.0416384054663053, + "learning_rate": 1.1052631578947368e-05, + "loss": 0.6471172571182251, + "num_tokens": 7201644.0, + "step": 8 + }, + { + "epoch": 0.06716417910447761, + "grad_norm": 2.239115666581825, + "learning_rate": 1.263157894736842e-05, + "loss": 0.6206663846969604, + "num_tokens": 8128715.0, + "step": 9 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 2.1623779063703825, + "learning_rate": 1.4210526315789473e-05, + "loss": 0.6805848479270935, + "num_tokens": 9074027.0, + "step": 10 + }, + { + "epoch": 0.08208955223880597, + "grad_norm": 1.357823714265532, + "learning_rate": 1.5789473684210526e-05, + "loss": 0.5907301902770996, + "num_tokens": 9950641.0, + "step": 11 + }, + { + "epoch": 0.08955223880597014, + "grad_norm": 1.2436697073761152, + "learning_rate": 1.736842105263158e-05, + "loss": 0.6134575009346008, + "num_tokens": 10885057.0, + "step": 12 + }, + { + "epoch": 0.09701492537313433, + "grad_norm": 1.0124569249363744, + "learning_rate": 1.894736842105263e-05, + "loss": 0.5782807469367981, + "num_tokens": 11697963.0, + "step": 13 + }, + { + "epoch": 0.1044776119402985, + "grad_norm": 0.7569239607127325, + "learning_rate": 2.0526315789473685e-05, + "loss": 0.5509419441223145, + "num_tokens": 12632602.0, + "step": 14 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.5817534527891748, + "learning_rate": 2.2105263157894736e-05, + "loss": 0.532228410243988, + "num_tokens": 13568889.0, + "step": 15 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 0.7057373397195236, + "learning_rate": 2.368421052631579e-05, + "loss": 0.5408649444580078, + "num_tokens": 14534242.0, + "step": 16 + }, + { + "epoch": 0.12686567164179105, + "grad_norm": 0.6046224645124905, + "learning_rate": 2.526315789473684e-05, + "loss": 0.5322834253311157, + "num_tokens": 15435946.0, + "step": 17 + }, + { + "epoch": 0.13432835820895522, + "grad_norm": 0.49331973808673285, + "learning_rate": 2.6842105263157896e-05, + "loss": 0.5015720725059509, + "num_tokens": 16352267.0, + "step": 18 + }, + { + "epoch": 0.1417910447761194, + "grad_norm": 0.5002360432437354, + "learning_rate": 2.8421052631578946e-05, + "loss": 0.507352888584137, + "num_tokens": 17277422.0, + "step": 19 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.5278153522403675, + "learning_rate": 3e-05, + "loss": 0.5295838713645935, + "num_tokens": 18270697.0, + "step": 20 + }, + { + "epoch": 0.15671641791044777, + "grad_norm": 0.504428457093509, + "learning_rate": 2.9999815560649025e-05, + "loss": 0.4966413080692291, + "num_tokens": 19308555.0, + "step": 21 + }, + { + "epoch": 0.16417910447761194, + "grad_norm": 0.46271144160592165, + "learning_rate": 2.9999262247635783e-05, + "loss": 0.47084784507751465, + "num_tokens": 20162797.0, + "step": 22 + }, + { + "epoch": 0.17164179104477612, + "grad_norm": 0.4874776182121212, + "learning_rate": 2.9998340076079188e-05, + "loss": 0.4917251765727997, + "num_tokens": 20981106.0, + "step": 23 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 0.4431461732541396, + "learning_rate": 2.9997049071176987e-05, + "loss": 0.4785962998867035, + "num_tokens": 21858000.0, + "step": 24 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.387861494751977, + "learning_rate": 2.9995389268205035e-05, + "loss": 0.4448994994163513, + "num_tokens": 22793285.0, + "step": 25 + }, + { + "epoch": 0.19402985074626866, + "grad_norm": 0.4318496289646886, + "learning_rate": 2.9993360712516377e-05, + "loss": 0.5124952793121338, + "num_tokens": 23723801.0, + "step": 26 + }, + { + "epoch": 0.20149253731343283, + "grad_norm": 0.37498700509106286, + "learning_rate": 2.999096345953997e-05, + "loss": 0.4689701795578003, + "num_tokens": 24725740.0, + "step": 27 + }, + { + "epoch": 0.208955223880597, + "grad_norm": 0.46758132187976276, + "learning_rate": 2.9988197574779187e-05, + "loss": 0.5058130621910095, + "num_tokens": 25730725.0, + "step": 28 + }, + { + "epoch": 0.21641791044776118, + "grad_norm": 0.4541769806648773, + "learning_rate": 2.998506313381003e-05, + "loss": 0.4760160744190216, + "num_tokens": 26557776.0, + "step": 29 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.4285196017188533, + "learning_rate": 2.998156022227906e-05, + "loss": 0.46224498748779297, + "num_tokens": 27504251.0, + "step": 30 + }, + { + "epoch": 0.23134328358208955, + "grad_norm": 0.47777177042995506, + "learning_rate": 2.9977688935901042e-05, + "loss": 0.45890241861343384, + "num_tokens": 28534541.0, + "step": 31 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 0.4677469755237667, + "learning_rate": 2.997344938045636e-05, + "loss": 0.48648908734321594, + "num_tokens": 29341502.0, + "step": 32 + }, + { + "epoch": 0.2462686567164179, + "grad_norm": 0.417614303033266, + "learning_rate": 2.99688416717881e-05, + "loss": 0.4909588694572449, + "num_tokens": 30211822.0, + "step": 33 + }, + { + "epoch": 0.2537313432835821, + "grad_norm": 0.41160216727121024, + "learning_rate": 2.9963865935798904e-05, + "loss": 0.470625102519989, + "num_tokens": 31102775.0, + "step": 34 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.3679274109247649, + "learning_rate": 2.995852230844751e-05, + "loss": 0.45474812388420105, + "num_tokens": 31898227.0, + "step": 35 + }, + { + "epoch": 0.26865671641791045, + "grad_norm": 0.38798639827332265, + "learning_rate": 2.9952810935745055e-05, + "loss": 0.44148534536361694, + "num_tokens": 32541892.0, + "step": 36 + }, + { + "epoch": 0.27611940298507465, + "grad_norm": 0.3837062370088247, + "learning_rate": 2.9946731973751076e-05, + "loss": 0.47073429822921753, + "num_tokens": 33543040.0, + "step": 37 + }, + { + "epoch": 0.2835820895522388, + "grad_norm": 0.35454060994897946, + "learning_rate": 2.9940285588569244e-05, + "loss": 0.4520432949066162, + "num_tokens": 34505224.0, + "step": 38 + }, + { + "epoch": 0.291044776119403, + "grad_norm": 0.47976493136494663, + "learning_rate": 2.993347195634284e-05, + "loss": 0.490145206451416, + "num_tokens": 35411826.0, + "step": 39 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.3473597577972683, + "learning_rate": 2.992629126324992e-05, + "loss": 0.48773276805877686, + "num_tokens": 36307345.0, + "step": 40 + }, + { + "epoch": 0.30597014925373134, + "grad_norm": 0.3849018293538417, + "learning_rate": 2.9918743705498237e-05, + "loss": 0.4593764543533325, + "num_tokens": 37196875.0, + "step": 41 + }, + { + "epoch": 0.31343283582089554, + "grad_norm": 0.32778453758129855, + "learning_rate": 2.9910829489319903e-05, + "loss": 0.4493025243282318, + "num_tokens": 38112193.0, + "step": 42 + }, + { + "epoch": 0.3208955223880597, + "grad_norm": 0.38874846877448016, + "learning_rate": 2.9902548830965703e-05, + "loss": 0.44223347306251526, + "num_tokens": 38855918.0, + "step": 43 + }, + { + "epoch": 0.3283582089552239, + "grad_norm": 0.3600626273878142, + "learning_rate": 2.9893901956699236e-05, + "loss": 0.4619264602661133, + "num_tokens": 39833215.0, + "step": 44 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.4252489485804584, + "learning_rate": 2.9884889102790703e-05, + "loss": 0.47333118319511414, + "num_tokens": 40760145.0, + "step": 45 + }, + { + "epoch": 0.34328358208955223, + "grad_norm": 0.44355227925846324, + "learning_rate": 2.9875510515510472e-05, + "loss": 0.4684419631958008, + "num_tokens": 41745749.0, + "step": 46 + }, + { + "epoch": 0.35074626865671643, + "grad_norm": 0.40090080870615863, + "learning_rate": 2.986576645112232e-05, + "loss": 0.45152851939201355, + "num_tokens": 42686630.0, + "step": 47 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 0.3968769871577976, + "learning_rate": 2.9855657175876453e-05, + "loss": 0.46956488490104675, + "num_tokens": 43510586.0, + "step": 48 + }, + { + "epoch": 0.3656716417910448, + "grad_norm": 0.34818131786283646, + "learning_rate": 2.9845182966002236e-05, + "loss": 0.43737900257110596, + "num_tokens": 44362248.0, + "step": 49 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.37174081071440807, + "learning_rate": 2.983434410770063e-05, + "loss": 0.4198949337005615, + "num_tokens": 45216722.0, + "step": 50 + }, + { + "epoch": 0.3805970149253731, + "grad_norm": 0.3771227721970023, + "learning_rate": 2.9823140897136368e-05, + "loss": 0.43871694803237915, + "num_tokens": 46010142.0, + "step": 51 + }, + { + "epoch": 0.3880597014925373, + "grad_norm": 0.40827091621620715, + "learning_rate": 2.981157364042988e-05, + "loss": 0.43513864278793335, + "num_tokens": 46858670.0, + "step": 52 + }, + { + "epoch": 0.39552238805970147, + "grad_norm": 0.37817640704637673, + "learning_rate": 2.9799642653648915e-05, + "loss": 0.4714231491088867, + "num_tokens": 47836905.0, + "step": 53 + }, + { + "epoch": 0.40298507462686567, + "grad_norm": 0.4521767636779311, + "learning_rate": 2.9787348262799917e-05, + "loss": 0.46958601474761963, + "num_tokens": 48836237.0, + "step": 54 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.3996917386608209, + "learning_rate": 2.9774690803819092e-05, + "loss": 0.4700014591217041, + "num_tokens": 49860153.0, + "step": 55 + }, + { + "epoch": 0.417910447761194, + "grad_norm": 0.3486187929603249, + "learning_rate": 2.976167062256327e-05, + "loss": 0.4338191747665405, + "num_tokens": 50786110.0, + "step": 56 + }, + { + "epoch": 0.4253731343283582, + "grad_norm": 0.32937938308672743, + "learning_rate": 2.9748288074800414e-05, + "loss": 0.41941165924072266, + "num_tokens": 51790390.0, + "step": 57 + }, + { + "epoch": 0.43283582089552236, + "grad_norm": 0.34924110154041565, + "learning_rate": 2.9734543526199922e-05, + "loss": 0.457973837852478, + "num_tokens": 52742397.0, + "step": 58 + }, + { + "epoch": 0.44029850746268656, + "grad_norm": 0.410698318336265, + "learning_rate": 2.9720437352322618e-05, + "loss": 0.47605207562446594, + "num_tokens": 53673114.0, + "step": 59 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.32459119196055575, + "learning_rate": 2.9705969938610523e-05, + "loss": 0.4107760787010193, + "num_tokens": 54566889.0, + "step": 60 + }, + { + "epoch": 0.4552238805970149, + "grad_norm": 0.356687155861432, + "learning_rate": 2.9691141680376277e-05, + "loss": 0.4515986740589142, + "num_tokens": 55460491.0, + "step": 61 + }, + { + "epoch": 0.4626865671641791, + "grad_norm": 0.34110692115474495, + "learning_rate": 2.9675952982792383e-05, + "loss": 0.4474300444126129, + "num_tokens": 56520990.0, + "step": 62 + }, + { + "epoch": 0.4701492537313433, + "grad_norm": 0.4001841133306117, + "learning_rate": 2.9660404260880092e-05, + "loss": 0.446544885635376, + "num_tokens": 57422206.0, + "step": 63 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 0.3580957652668953, + "learning_rate": 2.964449593949811e-05, + "loss": 0.47310975193977356, + "num_tokens": 58260720.0, + "step": 64 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.4285689076758677, + "learning_rate": 2.9628228453330938e-05, + "loss": 0.4611589014530182, + "num_tokens": 59123617.0, + "step": 65 + }, + { + "epoch": 0.4925373134328358, + "grad_norm": 0.37944004231248546, + "learning_rate": 2.9611602246877044e-05, + "loss": 0.43839746713638306, + "num_tokens": 60033505.0, + "step": 66 + }, + { + "epoch": 0.5, + "grad_norm": 0.4058895912331529, + "learning_rate": 2.9594617774436683e-05, + "loss": 0.4206322133541107, + "num_tokens": 60974452.0, + "step": 67 + }, + { + "epoch": 0.5074626865671642, + "grad_norm": 0.3573247245646442, + "learning_rate": 2.957727550009949e-05, + "loss": 0.4404195547103882, + "num_tokens": 61913977.0, + "step": 68 + }, + { + "epoch": 0.5149253731343284, + "grad_norm": 0.3517188453992546, + "learning_rate": 2.9559575897731815e-05, + "loss": 0.4638599753379822, + "num_tokens": 62800879.0, + "step": 69 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.38237698336509157, + "learning_rate": 2.9541519450963753e-05, + "loss": 0.4506247341632843, + "num_tokens": 63823183.0, + "step": 70 + }, + { + "epoch": 0.5298507462686567, + "grad_norm": 0.34718947718626825, + "learning_rate": 2.9523106653175947e-05, + "loss": 0.43822404742240906, + "num_tokens": 64700087.0, + "step": 71 + }, + { + "epoch": 0.5373134328358209, + "grad_norm": 0.38571581429627394, + "learning_rate": 2.9504338007486096e-05, + "loss": 0.41369548439979553, + "num_tokens": 65533415.0, + "step": 72 + }, + { + "epoch": 0.5447761194029851, + "grad_norm": 0.4179062195109546, + "learning_rate": 2.948521402673521e-05, + "loss": 0.4132109582424164, + "num_tokens": 66411589.0, + "step": 73 + }, + { + "epoch": 0.5522388059701493, + "grad_norm": 0.3884838709698021, + "learning_rate": 2.9465735233473607e-05, + "loss": 0.4519786536693573, + "num_tokens": 67203675.0, + "step": 74 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.37292728239052736, + "learning_rate": 2.9445902159946608e-05, + "loss": 0.4415651559829712, + "num_tokens": 68056574.0, + "step": 75 + }, + { + "epoch": 0.5671641791044776, + "grad_norm": 0.3498292196022338, + "learning_rate": 2.942571534808003e-05, + "loss": 0.4234054684638977, + "num_tokens": 68907426.0, + "step": 76 + }, + { + "epoch": 0.5746268656716418, + "grad_norm": 0.3282849607678631, + "learning_rate": 2.9405175349465346e-05, + "loss": 0.43461883068084717, + "num_tokens": 69817179.0, + "step": 77 + }, + { + "epoch": 0.582089552238806, + "grad_norm": 0.35116807662765037, + "learning_rate": 2.938428272534464e-05, + "loss": 0.45615193247795105, + "num_tokens": 70803003.0, + "step": 78 + }, + { + "epoch": 0.5895522388059702, + "grad_norm": 0.3329914149262814, + "learning_rate": 2.9363038046595242e-05, + "loss": 0.41635048389434814, + "num_tokens": 71708353.0, + "step": 79 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.38045704306263595, + "learning_rate": 2.9341441893714155e-05, + "loss": 0.43726855516433716, + "num_tokens": 72587706.0, + "step": 80 + }, + { + "epoch": 0.6044776119402985, + "grad_norm": 0.35924349265893135, + "learning_rate": 2.9319494856802178e-05, + "loss": 0.4230448007583618, + "num_tokens": 73605832.0, + "step": 81 + }, + { + "epoch": 0.6119402985074627, + "grad_norm": 0.38619247621111746, + "learning_rate": 2.9297197535547806e-05, + "loss": 0.4357215464115143, + "num_tokens": 74512496.0, + "step": 82 + }, + { + "epoch": 0.6194029850746269, + "grad_norm": 0.39604323481100373, + "learning_rate": 2.9274550539210795e-05, + "loss": 0.4608227014541626, + "num_tokens": 75428481.0, + "step": 83 + }, + { + "epoch": 0.6268656716417911, + "grad_norm": 0.37443244592149005, + "learning_rate": 2.925155448660557e-05, + "loss": 0.4334092140197754, + "num_tokens": 76292706.0, + "step": 84 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.331225928550454, + "learning_rate": 2.9228210006084278e-05, + "loss": 0.42100948095321655, + "num_tokens": 77209633.0, + "step": 85 + }, + { + "epoch": 0.6417910447761194, + "grad_norm": 0.3733284174499636, + "learning_rate": 2.9204517735519638e-05, + "loss": 0.42018914222717285, + "num_tokens": 78063420.0, + "step": 86 + }, + { + "epoch": 0.6492537313432836, + "grad_norm": 0.32698643343045436, + "learning_rate": 2.91804783222875e-05, + "loss": 0.4293556809425354, + "num_tokens": 78870397.0, + "step": 87 + }, + { + "epoch": 0.6567164179104478, + "grad_norm": 0.3702901089838646, + "learning_rate": 2.915609242324917e-05, + "loss": 0.43072593212127686, + "num_tokens": 79871666.0, + "step": 88 + }, + { + "epoch": 0.664179104477612, + "grad_norm": 0.37258166652948627, + "learning_rate": 2.913136070473344e-05, + "loss": 0.42400825023651123, + "num_tokens": 80712206.0, + "step": 89 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.3474241060283533, + "learning_rate": 2.9106283842518404e-05, + "loss": 0.4022632837295532, + "num_tokens": 81538216.0, + "step": 90 + }, + { + "epoch": 0.6791044776119403, + "grad_norm": 0.3241352741578233, + "learning_rate": 2.9080862521812974e-05, + "loss": 0.4167214035987854, + "num_tokens": 82585839.0, + "step": 91 + }, + { + "epoch": 0.6865671641791045, + "grad_norm": 0.3810979193242223, + "learning_rate": 2.9055097437238178e-05, + "loss": 0.424973726272583, + "num_tokens": 83427449.0, + "step": 92 + }, + { + "epoch": 0.6940298507462687, + "grad_norm": 0.3649031636927641, + "learning_rate": 2.9028989292808156e-05, + "loss": 0.4390385150909424, + "num_tokens": 84449388.0, + "step": 93 + }, + { + "epoch": 0.7014925373134329, + "grad_norm": 0.28994598738953636, + "learning_rate": 2.9002538801910943e-05, + "loss": 0.4120522141456604, + "num_tokens": 85256514.0, + "step": 94 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.33708866035801577, + "learning_rate": 2.897574668728896e-05, + "loss": 0.4396127164363861, + "num_tokens": 86165960.0, + "step": 95 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 0.33927189407215896, + "learning_rate": 2.894861368101929e-05, + "loss": 0.4281761050224304, + "num_tokens": 86982659.0, + "step": 96 + }, + { + "epoch": 0.7238805970149254, + "grad_norm": 0.31050046707178475, + "learning_rate": 2.892114052449363e-05, + "loss": 0.42657923698425293, + "num_tokens": 87931000.0, + "step": 97 + }, + { + "epoch": 0.7313432835820896, + "grad_norm": 0.3648336319576507, + "learning_rate": 2.8893327968398085e-05, + "loss": 0.4396938681602478, + "num_tokens": 88689701.0, + "step": 98 + }, + { + "epoch": 0.7388059701492538, + "grad_norm": 0.32123414861291977, + "learning_rate": 2.886517677269263e-05, + "loss": 0.4277549386024475, + "num_tokens": 89547645.0, + "step": 99 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.35178070684423185, + "learning_rate": 2.883668770659033e-05, + "loss": 0.42951005697250366, + "num_tokens": 90297517.0, + "step": 100 + }, + { + "epoch": 0.753731343283582, + "grad_norm": 0.3404454736543532, + "learning_rate": 2.8807861548536364e-05, + "loss": 0.42362749576568604, + "num_tokens": 91186856.0, + "step": 101 + }, + { + "epoch": 0.7611940298507462, + "grad_norm": 0.3294687134617137, + "learning_rate": 2.8778699086186704e-05, + "loss": 0.43012386560440063, + "num_tokens": 91987232.0, + "step": 102 + }, + { + "epoch": 0.7686567164179104, + "grad_norm": 0.3613861468433532, + "learning_rate": 2.8749201116386635e-05, + "loss": 0.46676358580589294, + "num_tokens": 92898696.0, + "step": 103 + }, + { + "epoch": 0.7761194029850746, + "grad_norm": 0.32525203161057137, + "learning_rate": 2.871936844514895e-05, + "loss": 0.4271778464317322, + "num_tokens": 93791120.0, + "step": 104 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.3645223492290418, + "learning_rate": 2.8689201887631954e-05, + "loss": 0.4019509553909302, + "num_tokens": 94639289.0, + "step": 105 + }, + { + "epoch": 0.7910447761194029, + "grad_norm": 0.35120891749306765, + "learning_rate": 2.8658702268117166e-05, + "loss": 0.47020262479782104, + "num_tokens": 95400207.0, + "step": 106 + }, + { + "epoch": 0.7985074626865671, + "grad_norm": 0.3897429998289724, + "learning_rate": 2.8627870419986818e-05, + "loss": 0.45215320587158203, + "num_tokens": 96227104.0, + "step": 107 + }, + { + "epoch": 0.8059701492537313, + "grad_norm": 0.3551261237711927, + "learning_rate": 2.859670718570107e-05, + "loss": 0.41790810227394104, + "num_tokens": 97056588.0, + "step": 108 + }, + { + "epoch": 0.8134328358208955, + "grad_norm": 0.4085760278992768, + "learning_rate": 2.8565213416774984e-05, + "loss": 0.43688803911209106, + "num_tokens": 97944111.0, + "step": 109 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.34538986828654805, + "learning_rate": 2.8533389973755266e-05, + "loss": 0.40269792079925537, + "num_tokens": 98816920.0, + "step": 110 + }, + { + "epoch": 0.8283582089552238, + "grad_norm": 0.3680387468305633, + "learning_rate": 2.8501237726196767e-05, + "loss": 0.4414367079734802, + "num_tokens": 99773832.0, + "step": 111 + }, + { + "epoch": 0.835820895522388, + "grad_norm": 0.3132024744536474, + "learning_rate": 2.846875755263869e-05, + "loss": 0.44121602177619934, + "num_tokens": 100805832.0, + "step": 112 + }, + { + "epoch": 0.8432835820895522, + "grad_norm": 0.3212832093670825, + "learning_rate": 2.843595034058062e-05, + "loss": 0.43163514137268066, + "num_tokens": 101747939.0, + "step": 113 + }, + { + "epoch": 0.8507462686567164, + "grad_norm": 0.3377699333103733, + "learning_rate": 2.8402816986458235e-05, + "loss": 0.45706361532211304, + "num_tokens": 102733715.0, + "step": 114 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.3020084528652058, + "learning_rate": 2.836935839561885e-05, + "loss": 0.40077459812164307, + "num_tokens": 103577969.0, + "step": 115 + }, + { + "epoch": 0.8656716417910447, + "grad_norm": 0.3487492894550424, + "learning_rate": 2.833557548229665e-05, + "loss": 0.4227057695388794, + "num_tokens": 104507837.0, + "step": 116 + }, + { + "epoch": 0.8731343283582089, + "grad_norm": 0.3476991142190051, + "learning_rate": 2.8301469169587724e-05, + "loss": 0.4556281566619873, + "num_tokens": 105482901.0, + "step": 117 + }, + { + "epoch": 0.8805970149253731, + "grad_norm": 0.328015796780554, + "learning_rate": 2.826704038942485e-05, + "loss": 0.42667752504348755, + "num_tokens": 106441176.0, + "step": 118 + }, + { + "epoch": 0.8880597014925373, + "grad_norm": 0.34794554907206476, + "learning_rate": 2.8232290082551994e-05, + "loss": 0.4443303048610687, + "num_tokens": 107265870.0, + "step": 119 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.3207190436944611, + "learning_rate": 2.819721919849865e-05, + "loss": 0.43958723545074463, + "num_tokens": 108146690.0, + "step": 120 + }, + { + "epoch": 0.9029850746268657, + "grad_norm": 0.36544124775156067, + "learning_rate": 2.8161828695553876e-05, + "loss": 0.4427248537540436, + "num_tokens": 109034402.0, + "step": 121 + }, + { + "epoch": 0.9104477611940298, + "grad_norm": 0.35253344355491567, + "learning_rate": 2.812611954074009e-05, + "loss": 0.4511459469795227, + "num_tokens": 109989572.0, + "step": 122 + }, + { + "epoch": 0.917910447761194, + "grad_norm": 0.3419502161009737, + "learning_rate": 2.8090092709786683e-05, + "loss": 0.45898139476776123, + "num_tokens": 110969334.0, + "step": 123 + }, + { + "epoch": 0.9253731343283582, + "grad_norm": 0.3994476102816512, + "learning_rate": 2.8053749187103323e-05, + "loss": 0.4459114372730255, + "num_tokens": 111844990.0, + "step": 124 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.37335441467558017, + "learning_rate": 2.801708996575309e-05, + "loss": 0.43445926904678345, + "num_tokens": 112800888.0, + "step": 125 + }, + { + "epoch": 0.9402985074626866, + "grad_norm": 0.3148170264896714, + "learning_rate": 2.7980116047425318e-05, + "loss": 0.4525066912174225, + "num_tokens": 113857610.0, + "step": 126 + }, + { + "epoch": 0.9477611940298507, + "grad_norm": 0.3540516068525593, + "learning_rate": 2.7942828442408225e-05, + "loss": 0.42399919033050537, + "num_tokens": 114800904.0, + "step": 127 + }, + { + "epoch": 0.9552238805970149, + "grad_norm": 0.329278225140609, + "learning_rate": 2.7905228169561314e-05, + "loss": 0.43032482266426086, + "num_tokens": 115759913.0, + "step": 128 + }, + { + "epoch": 0.9626865671641791, + "grad_norm": 0.3630319590251905, + "learning_rate": 2.786731625628754e-05, + "loss": 0.44865018129348755, + "num_tokens": 116624191.0, + "step": 129 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.34043245011086026, + "learning_rate": 2.7829093738505223e-05, + "loss": 0.4354362189769745, + "num_tokens": 117499418.0, + "step": 130 + }, + { + "epoch": 0.9776119402985075, + "grad_norm": 0.3222353349021393, + "learning_rate": 2.7790561660619757e-05, + "loss": 0.4167882204055786, + "num_tokens": 118329517.0, + "step": 131 + }, + { + "epoch": 0.9850746268656716, + "grad_norm": 0.33759104962145015, + "learning_rate": 2.7751721075495062e-05, + "loss": 0.4432622492313385, + "num_tokens": 119221343.0, + "step": 132 + }, + { + "epoch": 0.9925373134328358, + "grad_norm": 0.3033715752060908, + "learning_rate": 2.7712573044424797e-05, + "loss": 0.4342583119869232, + "num_tokens": 120123659.0, + "step": 133 + }, + { + "epoch": 1.0, + "grad_norm": 0.3157095104018222, + "learning_rate": 2.7673118637103414e-05, + "loss": 0.43080803751945496, + "num_tokens": 121054976.0, + "step": 134 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.4365911200440399, + "learning_rate": 2.7633358931596875e-05, + "loss": 0.39168182015419006, + "num_tokens": 121995409.0, + "step": 135 + }, + { + "epoch": 1.0149253731343284, + "grad_norm": 0.34729870538048124, + "learning_rate": 2.7593295014313222e-05, + "loss": 0.3802366852760315, + "num_tokens": 122823226.0, + "step": 136 + }, + { + "epoch": 1.0223880597014925, + "grad_norm": 0.36891237319998677, + "learning_rate": 2.755292797997288e-05, + "loss": 0.370537668466568, + "num_tokens": 123660597.0, + "step": 137 + }, + { + "epoch": 1.0298507462686568, + "grad_norm": 0.4473631385211834, + "learning_rate": 2.751225893157876e-05, + "loss": 0.3735314905643463, + "num_tokens": 124554146.0, + "step": 138 + }, + { + "epoch": 1.037313432835821, + "grad_norm": 0.3699400370687646, + "learning_rate": 2.7471288980386104e-05, + "loss": 0.3833698034286499, + "num_tokens": 125332236.0, + "step": 139 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.3383566258765907, + "learning_rate": 2.743001924587213e-05, + "loss": 0.36771178245544434, + "num_tokens": 126310236.0, + "step": 140 + }, + { + "epoch": 1.0522388059701493, + "grad_norm": 0.34187462479662406, + "learning_rate": 2.738845085570543e-05, + "loss": 0.37672191858291626, + "num_tokens": 127218706.0, + "step": 141 + }, + { + "epoch": 1.0597014925373134, + "grad_norm": 0.3483444921381018, + "learning_rate": 2.734658494571519e-05, + "loss": 0.38160958886146545, + "num_tokens": 128057825.0, + "step": 142 + }, + { + "epoch": 1.0671641791044777, + "grad_norm": 0.3614945464912025, + "learning_rate": 2.73044226598601e-05, + "loss": 0.37473732233047485, + "num_tokens": 128955736.0, + "step": 143 + }, + { + "epoch": 1.0746268656716418, + "grad_norm": 0.36395086131367427, + "learning_rate": 2.7261965150197148e-05, + "loss": 0.3781934380531311, + "num_tokens": 129777788.0, + "step": 144 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.38520112404335904, + "learning_rate": 2.7219213576850122e-05, + "loss": 0.37962204217910767, + "num_tokens": 130659960.0, + "step": 145 + }, + { + "epoch": 1.0895522388059702, + "grad_norm": 0.4078777110059471, + "learning_rate": 2.7176169107977898e-05, + "loss": 0.38424360752105713, + "num_tokens": 131550221.0, + "step": 146 + }, + { + "epoch": 1.0970149253731343, + "grad_norm": 0.36815785695334224, + "learning_rate": 2.713283291974253e-05, + "loss": 0.38741737604141235, + "num_tokens": 132486469.0, + "step": 147 + }, + { + "epoch": 1.1044776119402986, + "grad_norm": 0.3233706872052398, + "learning_rate": 2.7089206196277132e-05, + "loss": 0.36474981904029846, + "num_tokens": 133366950.0, + "step": 148 + }, + { + "epoch": 1.1119402985074627, + "grad_norm": 0.3322677249511474, + "learning_rate": 2.704529012965348e-05, + "loss": 0.3808598518371582, + "num_tokens": 134285043.0, + "step": 149 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.339456725306424, + "learning_rate": 2.7001085919849477e-05, + "loss": 0.35642245411872864, + "num_tokens": 135136228.0, + "step": 150 + }, + { + "epoch": 1.126865671641791, + "grad_norm": 0.3226137136335262, + "learning_rate": 2.6956594774716346e-05, + "loss": 0.3718845844268799, + "num_tokens": 136013129.0, + "step": 151 + }, + { + "epoch": 1.1343283582089552, + "grad_norm": 0.3327685091410092, + "learning_rate": 2.691181790994564e-05, + "loss": 0.3985145092010498, + "num_tokens": 136978716.0, + "step": 152 + }, + { + "epoch": 1.1417910447761195, + "grad_norm": 0.33120353900381816, + "learning_rate": 2.6866756549035997e-05, + "loss": 0.3699200451374054, + "num_tokens": 137957110.0, + "step": 153 + }, + { + "epoch": 1.1492537313432836, + "grad_norm": 0.3043199924636294, + "learning_rate": 2.6821411923259747e-05, + "loss": 0.3767678737640381, + "num_tokens": 138894209.0, + "step": 154 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.33866074681411823, + "learning_rate": 2.677578527162923e-05, + "loss": 0.3994665741920471, + "num_tokens": 139925878.0, + "step": 155 + }, + { + "epoch": 1.164179104477612, + "grad_norm": 0.3339471075019717, + "learning_rate": 2.672987784086297e-05, + "loss": 0.37443894147872925, + "num_tokens": 140844266.0, + "step": 156 + }, + { + "epoch": 1.171641791044776, + "grad_norm": 0.34303437215557886, + "learning_rate": 2.66836908853516e-05, + "loss": 0.37581557035446167, + "num_tokens": 141685264.0, + "step": 157 + }, + { + "epoch": 1.1791044776119404, + "grad_norm": 0.29691716745960073, + "learning_rate": 2.6637225667123567e-05, + "loss": 0.3585776090621948, + "num_tokens": 142607439.0, + "step": 158 + }, + { + "epoch": 1.1865671641791045, + "grad_norm": 0.36115800096975614, + "learning_rate": 2.659048345581068e-05, + "loss": 0.3523404598236084, + "num_tokens": 143442522.0, + "step": 159 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.3854051146002361, + "learning_rate": 2.654346552861341e-05, + "loss": 0.3825865685939789, + "num_tokens": 144356683.0, + "step": 160 + }, + { + "epoch": 1.2014925373134329, + "grad_norm": 0.37239720041712515, + "learning_rate": 2.6496173170265967e-05, + "loss": 0.38340622186660767, + "num_tokens": 145164747.0, + "step": 161 + }, + { + "epoch": 1.208955223880597, + "grad_norm": 0.32579199473078013, + "learning_rate": 2.6448607673001228e-05, + "loss": 0.37306541204452515, + "num_tokens": 145974438.0, + "step": 162 + }, + { + "epoch": 1.2164179104477613, + "grad_norm": 0.3018547733296397, + "learning_rate": 2.6400770336515403e-05, + "loss": 0.35844796895980835, + "num_tokens": 146897553.0, + "step": 163 + }, + { + "epoch": 1.2238805970149254, + "grad_norm": 0.3718792094427089, + "learning_rate": 2.6352662467932535e-05, + "loss": 0.4024726152420044, + "num_tokens": 147706235.0, + "step": 164 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.2976198356156792, + "learning_rate": 2.6304285381768785e-05, + "loss": 0.3483440577983856, + "num_tokens": 148638477.0, + "step": 165 + }, + { + "epoch": 1.2388059701492538, + "grad_norm": 0.40262421884910277, + "learning_rate": 2.6255640399896502e-05, + "loss": 0.37967991828918457, + "num_tokens": 149598765.0, + "step": 166 + }, + { + "epoch": 1.2462686567164178, + "grad_norm": 0.3506490786312828, + "learning_rate": 2.620672885150811e-05, + "loss": 0.3896668553352356, + "num_tokens": 150499813.0, + "step": 167 + }, + { + "epoch": 1.2537313432835822, + "grad_norm": 0.37867461356808607, + "learning_rate": 2.61575520730798e-05, + "loss": 0.3811056315898895, + "num_tokens": 151406909.0, + "step": 168 + }, + { + "epoch": 1.2611940298507462, + "grad_norm": 0.34652476711835556, + "learning_rate": 2.6108111408334992e-05, + "loss": 0.4021441638469696, + "num_tokens": 152358615.0, + "step": 169 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.3443693872016931, + "learning_rate": 2.6058408208207623e-05, + "loss": 0.3495699167251587, + "num_tokens": 153140245.0, + "step": 170 + }, + { + "epoch": 1.2761194029850746, + "grad_norm": 0.3199055027860086, + "learning_rate": 2.600844383080525e-05, + "loss": 0.39528757333755493, + "num_tokens": 154065403.0, + "step": 171 + }, + { + "epoch": 1.2835820895522387, + "grad_norm": 0.38260512944207237, + "learning_rate": 2.595821964137192e-05, + "loss": 0.3903374969959259, + "num_tokens": 155004060.0, + "step": 172 + }, + { + "epoch": 1.291044776119403, + "grad_norm": 0.32094103983604383, + "learning_rate": 2.590773701225089e-05, + "loss": 0.38557156920433044, + "num_tokens": 155894044.0, + "step": 173 + }, + { + "epoch": 1.2985074626865671, + "grad_norm": 0.3191554521618694, + "learning_rate": 2.585699732284708e-05, + "loss": 0.3759213089942932, + "num_tokens": 156891753.0, + "step": 174 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.32242700469585533, + "learning_rate": 2.580600195958945e-05, + "loss": 0.37213414907455444, + "num_tokens": 157781264.0, + "step": 175 + }, + { + "epoch": 1.3134328358208955, + "grad_norm": 0.3284517036436859, + "learning_rate": 2.5754752315893065e-05, + "loss": 0.378812313079834, + "num_tokens": 158729371.0, + "step": 176 + }, + { + "epoch": 1.3208955223880596, + "grad_norm": 0.3415023419708296, + "learning_rate": 2.5703249792121037e-05, + "loss": 0.3865644931793213, + "num_tokens": 159723929.0, + "step": 177 + }, + { + "epoch": 1.328358208955224, + "grad_norm": 0.3343050420961093, + "learning_rate": 2.5651495795546263e-05, + "loss": 0.4062744081020355, + "num_tokens": 160655042.0, + "step": 178 + }, + { + "epoch": 1.335820895522388, + "grad_norm": 0.5981815975526952, + "learning_rate": 2.5599491740312972e-05, + "loss": 0.37754061818122864, + "num_tokens": 161689806.0, + "step": 179 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.35234814112990026, + "learning_rate": 2.5547239047398078e-05, + "loss": 0.3692866861820221, + "num_tokens": 162672971.0, + "step": 180 + }, + { + "epoch": 1.3507462686567164, + "grad_norm": 0.32830063055017134, + "learning_rate": 2.5494739144572368e-05, + "loss": 0.35535305738449097, + "num_tokens": 163606727.0, + "step": 181 + }, + { + "epoch": 1.3582089552238805, + "grad_norm": 0.3451645432424477, + "learning_rate": 2.544199346636147e-05, + "loss": 0.38066795468330383, + "num_tokens": 164379724.0, + "step": 182 + }, + { + "epoch": 1.3656716417910448, + "grad_norm": 0.36363681244248197, + "learning_rate": 2.5389003454006667e-05, + "loss": 0.380257785320282, + "num_tokens": 165282114.0, + "step": 183 + }, + { + "epoch": 1.373134328358209, + "grad_norm": 0.3192538780146095, + "learning_rate": 2.533577055542551e-05, + "loss": 0.3674117922782898, + "num_tokens": 166184652.0, + "step": 184 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.33313618040811743, + "learning_rate": 2.5282296225172267e-05, + "loss": 0.36746978759765625, + "num_tokens": 167131883.0, + "step": 185 + }, + { + "epoch": 1.3880597014925373, + "grad_norm": 0.3670551777933176, + "learning_rate": 2.522858192439815e-05, + "loss": 0.40295130014419556, + "num_tokens": 168105786.0, + "step": 186 + }, + { + "epoch": 1.3955223880597014, + "grad_norm": 0.3475964519943968, + "learning_rate": 2.5174629120811432e-05, + "loss": 0.38296568393707275, + "num_tokens": 168981965.0, + "step": 187 + }, + { + "epoch": 1.4029850746268657, + "grad_norm": 0.3556039194849401, + "learning_rate": 2.512043928863731e-05, + "loss": 0.38510382175445557, + "num_tokens": 169813930.0, + "step": 188 + }, + { + "epoch": 1.4104477611940298, + "grad_norm": 0.32738176960414617, + "learning_rate": 2.5066013908577625e-05, + "loss": 0.356991708278656, + "num_tokens": 170803921.0, + "step": 189 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.3545590302027483, + "learning_rate": 2.501135446777042e-05, + "loss": 0.3816283941268921, + "num_tokens": 171568584.0, + "step": 190 + }, + { + "epoch": 1.4253731343283582, + "grad_norm": 0.33317616623937235, + "learning_rate": 2.4956462459749297e-05, + "loss": 0.36903613805770874, + "num_tokens": 172302686.0, + "step": 191 + }, + { + "epoch": 1.4328358208955223, + "grad_norm": 0.3581041627669198, + "learning_rate": 2.4901339384402598e-05, + "loss": 0.40988194942474365, + "num_tokens": 173251435.0, + "step": 192 + }, + { + "epoch": 1.4402985074626866, + "grad_norm": 0.3987362939905261, + "learning_rate": 2.4845986747932434e-05, + "loss": 0.3909692168235779, + "num_tokens": 174154926.0, + "step": 193 + }, + { + "epoch": 1.4477611940298507, + "grad_norm": 0.4522210758422187, + "learning_rate": 2.4790406062813526e-05, + "loss": 0.40102025866508484, + "num_tokens": 174968736.0, + "step": 194 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.3447348513379396, + "learning_rate": 2.4734598847751868e-05, + "loss": 0.3985745310783386, + "num_tokens": 175993671.0, + "step": 195 + }, + { + "epoch": 1.462686567164179, + "grad_norm": 0.30700265731423365, + "learning_rate": 2.4678566627643243e-05, + "loss": 0.37859317660331726, + "num_tokens": 176965410.0, + "step": 196 + }, + { + "epoch": 1.4701492537313432, + "grad_norm": 0.34463758170682973, + "learning_rate": 2.462231093353155e-05, + "loss": 0.4219540059566498, + "num_tokens": 177894815.0, + "step": 197 + }, + { + "epoch": 1.4776119402985075, + "grad_norm": 0.3101163888412067, + "learning_rate": 2.4565833302566967e-05, + "loss": 0.3521503210067749, + "num_tokens": 178840660.0, + "step": 198 + }, + { + "epoch": 1.4850746268656716, + "grad_norm": 0.34884755051979194, + "learning_rate": 2.4509135277963953e-05, + "loss": 0.3874298632144928, + "num_tokens": 179786009.0, + "step": 199 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.3068857508105448, + "learning_rate": 2.445221840895908e-05, + "loss": 0.34809160232543945, + "num_tokens": 180680467.0, + "step": 200 + }, + { + "epoch": 1.5, + "grad_norm": 0.33736718002624627, + "learning_rate": 2.43950842507687e-05, + "loss": 0.38442444801330566, + "num_tokens": 181598316.0, + "step": 201 + }, + { + "epoch": 1.5074626865671643, + "grad_norm": 0.3701582076339982, + "learning_rate": 2.4337734364546455e-05, + "loss": 0.38641679286956787, + "num_tokens": 182458909.0, + "step": 202 + }, + { + "epoch": 1.5149253731343284, + "grad_norm": 0.43633118208871485, + "learning_rate": 2.4280170317340602e-05, + "loss": 0.3791668117046356, + "num_tokens": 183258199.0, + "step": 203 + }, + { + "epoch": 1.5223880597014925, + "grad_norm": 0.3471858102435004, + "learning_rate": 2.4222393682051225e-05, + "loss": 0.38509491086006165, + "num_tokens": 184223376.0, + "step": 204 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.3457150792550615, + "learning_rate": 2.4164406037387226e-05, + "loss": 0.40659117698669434, + "num_tokens": 185129043.0, + "step": 205 + }, + { + "epoch": 1.537313432835821, + "grad_norm": 0.4042975807556774, + "learning_rate": 2.4106208967823205e-05, + "loss": 0.386791467666626, + "num_tokens": 186025421.0, + "step": 206 + }, + { + "epoch": 1.544776119402985, + "grad_norm": 0.32459079771864724, + "learning_rate": 2.4047804063556156e-05, + "loss": 0.3690309226512909, + "num_tokens": 186963319.0, + "step": 207 + }, + { + "epoch": 1.5522388059701493, + "grad_norm": 0.3392685539840793, + "learning_rate": 2.3989192920462032e-05, + "loss": 0.3927544951438904, + "num_tokens": 187973354.0, + "step": 208 + }, + { + "epoch": 1.5597014925373134, + "grad_norm": 0.32438229385759354, + "learning_rate": 2.3930377140052118e-05, + "loss": 0.3521687984466553, + "num_tokens": 188705328.0, + "step": 209 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.3418923460834205, + "learning_rate": 2.3871358329429282e-05, + "loss": 0.39543381333351135, + "num_tokens": 189538934.0, + "step": 210 + }, + { + "epoch": 1.5746268656716418, + "grad_norm": 0.32558989276658784, + "learning_rate": 2.3812138101244062e-05, + "loss": 0.3742252588272095, + "num_tokens": 190336903.0, + "step": 211 + }, + { + "epoch": 1.582089552238806, + "grad_norm": 0.35255218420418694, + "learning_rate": 2.37527180736506e-05, + "loss": 0.40875107049942017, + "num_tokens": 191168843.0, + "step": 212 + }, + { + "epoch": 1.5895522388059702, + "grad_norm": 0.31783452945012386, + "learning_rate": 2.3693099870262425e-05, + "loss": 0.3772295117378235, + "num_tokens": 192111363.0, + "step": 213 + }, + { + "epoch": 1.5970149253731343, + "grad_norm": 0.3156291557535895, + "learning_rate": 2.363328512010809e-05, + "loss": 0.39021003246307373, + "num_tokens": 193103746.0, + "step": 214 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.32761123022827565, + "learning_rate": 2.3573275457586658e-05, + "loss": 0.38943108916282654, + "num_tokens": 193981563.0, + "step": 215 + }, + { + "epoch": 1.6119402985074627, + "grad_norm": 0.3337068007026254, + "learning_rate": 2.3513072522423058e-05, + "loss": 0.3988877236843109, + "num_tokens": 194834592.0, + "step": 216 + }, + { + "epoch": 1.6194029850746268, + "grad_norm": 0.3080942622353808, + "learning_rate": 2.3452677959623254e-05, + "loss": 0.3594892621040344, + "num_tokens": 195762991.0, + "step": 217 + }, + { + "epoch": 1.626865671641791, + "grad_norm": 0.29606722446920497, + "learning_rate": 2.3392093419429313e-05, + "loss": 0.37819525599479675, + "num_tokens": 196736861.0, + "step": 218 + }, + { + "epoch": 1.6343283582089554, + "grad_norm": 0.31290197305267825, + "learning_rate": 2.333132055727431e-05, + "loss": 0.386009156703949, + "num_tokens": 197626724.0, + "step": 219 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.3200855389837665, + "learning_rate": 2.32703610337371e-05, + "loss": 0.40475738048553467, + "num_tokens": 198637589.0, + "step": 220 + }, + { + "epoch": 1.6492537313432836, + "grad_norm": 0.2866817447413364, + "learning_rate": 2.320921651449694e-05, + "loss": 0.39424002170562744, + "num_tokens": 199563255.0, + "step": 221 + }, + { + "epoch": 1.6567164179104479, + "grad_norm": 0.4885347371955867, + "learning_rate": 2.3147888670287962e-05, + "loss": 0.3826729953289032, + "num_tokens": 200461303.0, + "step": 222 + }, + { + "epoch": 1.664179104477612, + "grad_norm": 0.3109683935111661, + "learning_rate": 2.3086379176853553e-05, + "loss": 0.40459978580474854, + "num_tokens": 201369977.0, + "step": 223 + }, + { + "epoch": 1.671641791044776, + "grad_norm": 0.25809457402969005, + "learning_rate": 2.3024689714900524e-05, + "loss": 0.35879969596862793, + "num_tokens": 202278503.0, + "step": 224 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.3025695421124313, + "learning_rate": 2.296282197005322e-05, + "loss": 0.35284388065338135, + "num_tokens": 203242720.0, + "step": 225 + }, + { + "epoch": 1.6865671641791045, + "grad_norm": 0.30640518076000706, + "learning_rate": 2.2900777632807456e-05, + "loss": 0.37256160378456116, + "num_tokens": 204150301.0, + "step": 226 + }, + { + "epoch": 1.6940298507462686, + "grad_norm": 0.3140380062192946, + "learning_rate": 2.283855839848431e-05, + "loss": 0.37972885370254517, + "num_tokens": 205093558.0, + "step": 227 + }, + { + "epoch": 1.7014925373134329, + "grad_norm": 0.34946442818041484, + "learning_rate": 2.2776165967183807e-05, + "loss": 0.39244264364242554, + "num_tokens": 205970210.0, + "step": 228 + }, + { + "epoch": 1.7089552238805972, + "grad_norm": 0.32538438973624206, + "learning_rate": 2.2713602043738475e-05, + "loss": 0.39682289958000183, + "num_tokens": 206859291.0, + "step": 229 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.3124091616900136, + "learning_rate": 2.2650868337666746e-05, + "loss": 0.3859510123729706, + "num_tokens": 207786446.0, + "step": 230 + }, + { + "epoch": 1.7238805970149254, + "grad_norm": 0.3423691973747688, + "learning_rate": 2.2587966563126255e-05, + "loss": 0.3976070284843445, + "num_tokens": 208698287.0, + "step": 231 + }, + { + "epoch": 1.7313432835820897, + "grad_norm": 0.3134320041738064, + "learning_rate": 2.2524898438867004e-05, + "loss": 0.3667559325695038, + "num_tokens": 209548343.0, + "step": 232 + }, + { + "epoch": 1.7388059701492538, + "grad_norm": 0.33617188210180216, + "learning_rate": 2.2461665688184372e-05, + "loss": 0.3952285945415497, + "num_tokens": 210468969.0, + "step": 233 + }, + { + "epoch": 1.7462686567164178, + "grad_norm": 0.2947903980731328, + "learning_rate": 2.2398270038872083e-05, + "loss": 0.40012168884277344, + "num_tokens": 211457470.0, + "step": 234 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.3086024831747328, + "learning_rate": 2.233471322317492e-05, + "loss": 0.38004422187805176, + "num_tokens": 212347451.0, + "step": 235 + }, + { + "epoch": 1.7611940298507462, + "grad_norm": 0.29270776401429416, + "learning_rate": 2.227099697774146e-05, + "loss": 0.37762215733528137, + "num_tokens": 213190706.0, + "step": 236 + }, + { + "epoch": 1.7686567164179103, + "grad_norm": 0.35194638120625044, + "learning_rate": 2.2207123043576585e-05, + "loss": 0.3850764036178589, + "num_tokens": 214169074.0, + "step": 237 + }, + { + "epoch": 1.7761194029850746, + "grad_norm": 0.30551926833119664, + "learning_rate": 2.2143093165993916e-05, + "loss": 0.395663321018219, + "num_tokens": 215188427.0, + "step": 238 + }, + { + "epoch": 1.783582089552239, + "grad_norm": 0.33662162397203393, + "learning_rate": 2.2078909094568133e-05, + "loss": 0.3957657814025879, + "num_tokens": 216080767.0, + "step": 239 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.34783223375914446, + "learning_rate": 2.2014572583087155e-05, + "loss": 0.390730082988739, + "num_tokens": 216995394.0, + "step": 240 + }, + { + "epoch": 1.7985074626865671, + "grad_norm": 0.3130872530548468, + "learning_rate": 2.1950085389504232e-05, + "loss": 0.3682572841644287, + "num_tokens": 217866020.0, + "step": 241 + }, + { + "epoch": 1.8059701492537314, + "grad_norm": 0.3489897287487041, + "learning_rate": 2.18854492758899e-05, + "loss": 0.3791583180427551, + "num_tokens": 218680341.0, + "step": 242 + }, + { + "epoch": 1.8134328358208955, + "grad_norm": 0.28800056676846153, + "learning_rate": 2.182066600838384e-05, + "loss": 0.39488768577575684, + "num_tokens": 219550948.0, + "step": 243 + }, + { + "epoch": 1.8208955223880596, + "grad_norm": 0.35235893169992594, + "learning_rate": 2.1755737357146618e-05, + "loss": 0.37826257944107056, + "num_tokens": 220517125.0, + "step": 244 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.3108059485175432, + "learning_rate": 2.169066509631132e-05, + "loss": 0.3689156770706177, + "num_tokens": 221365026.0, + "step": 245 + }, + { + "epoch": 1.835820895522388, + "grad_norm": 0.27688834966994996, + "learning_rate": 2.162545100393505e-05, + "loss": 0.34449559450149536, + "num_tokens": 222233736.0, + "step": 246 + }, + { + "epoch": 1.8432835820895521, + "grad_norm": 0.3559202619871652, + "learning_rate": 2.1560096861950396e-05, + "loss": 0.41038885712623596, + "num_tokens": 223222809.0, + "step": 247 + }, + { + "epoch": 1.8507462686567164, + "grad_norm": 0.32206069093634854, + "learning_rate": 2.1494604456116695e-05, + "loss": 0.3931525945663452, + "num_tokens": 224116326.0, + "step": 248 + }, + { + "epoch": 1.8582089552238807, + "grad_norm": 0.32036384873450585, + "learning_rate": 2.1428975575971243e-05, + "loss": 0.3952087461948395, + "num_tokens": 225107686.0, + "step": 249 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.2750788343679779, + "learning_rate": 2.1363212014780432e-05, + "loss": 0.3948509097099304, + "num_tokens": 226126493.0, + "step": 250 + }, + { + "epoch": 1.873134328358209, + "grad_norm": 0.3546848770246566, + "learning_rate": 2.1297315569490704e-05, + "loss": 0.38538211584091187, + "num_tokens": 227002265.0, + "step": 251 + }, + { + "epoch": 1.8805970149253732, + "grad_norm": 0.31987168628076534, + "learning_rate": 2.123128804067949e-05, + "loss": 0.3849794268608093, + "num_tokens": 227879194.0, + "step": 252 + }, + { + "epoch": 1.8880597014925373, + "grad_norm": 0.35226690474895933, + "learning_rate": 2.1165131232505973e-05, + "loss": 0.40667471289634705, + "num_tokens": 228849840.0, + "step": 253 + }, + { + "epoch": 1.8955223880597014, + "grad_norm": 0.3517377549019829, + "learning_rate": 2.1098846952661833e-05, + "loss": 0.36520224809646606, + "num_tokens": 229755841.0, + "step": 254 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.31196447519845827, + "learning_rate": 2.1032437012321812e-05, + "loss": 0.37600016593933105, + "num_tokens": 230531378.0, + "step": 255 + }, + { + "epoch": 1.9104477611940298, + "grad_norm": 0.2994021775242901, + "learning_rate": 2.0965903226094246e-05, + "loss": 0.35384806990623474, + "num_tokens": 231462516.0, + "step": 256 + }, + { + "epoch": 1.917910447761194, + "grad_norm": 0.3160465003726717, + "learning_rate": 2.08992474119715e-05, + "loss": 0.3978261649608612, + "num_tokens": 232355925.0, + "step": 257 + }, + { + "epoch": 1.9253731343283582, + "grad_norm": 0.3261497602811777, + "learning_rate": 2.0832471391280234e-05, + "loss": 0.40133193135261536, + "num_tokens": 233194033.0, + "step": 258 + }, + { + "epoch": 1.9328358208955225, + "grad_norm": 0.2915382309557714, + "learning_rate": 2.0765576988631707e-05, + "loss": 0.36901217699050903, + "num_tokens": 234098698.0, + "step": 259 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.3301749887472271, + "learning_rate": 2.0698566031871877e-05, + "loss": 0.38775068521499634, + "num_tokens": 235139771.0, + "step": 260 + }, + { + "epoch": 1.9477611940298507, + "grad_norm": 0.32435823220698096, + "learning_rate": 2.063144035203146e-05, + "loss": 0.37508994340896606, + "num_tokens": 235974035.0, + "step": 261 + }, + { + "epoch": 1.955223880597015, + "grad_norm": 0.3051639407042942, + "learning_rate": 2.0564201783275908e-05, + "loss": 0.3903445601463318, + "num_tokens": 236882822.0, + "step": 262 + }, + { + "epoch": 1.962686567164179, + "grad_norm": 0.3254047024560983, + "learning_rate": 2.0496852162855303e-05, + "loss": 0.40096017718315125, + "num_tokens": 237883798.0, + "step": 263 + }, + { + "epoch": 1.9701492537313432, + "grad_norm": 0.3010384436296043, + "learning_rate": 2.0429393331054122e-05, + "loss": 0.3954760432243347, + "num_tokens": 238711038.0, + "step": 264 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.2897130131386432, + "learning_rate": 2.0361827131140988e-05, + "loss": 0.3967036008834839, + "num_tokens": 239602771.0, + "step": 265 + }, + { + "epoch": 1.9850746268656716, + "grad_norm": 0.2957878042115852, + "learning_rate": 2.0294155409318273e-05, + "loss": 0.3834611177444458, + "num_tokens": 240344316.0, + "step": 266 + }, + { + "epoch": 1.9925373134328357, + "grad_norm": 0.28585034773563434, + "learning_rate": 2.022638001467168e-05, + "loss": 0.36557599902153015, + "num_tokens": 241222304.0, + "step": 267 + }, + { + "epoch": 2.0, + "grad_norm": 0.31387177455183296, + "learning_rate": 2.0158502799119694e-05, + "loss": 0.3776703178882599, + "num_tokens": 242128094.0, + "step": 268 + }, + { + "epoch": 2.0074626865671643, + "grad_norm": 0.4118428271810675, + "learning_rate": 2.0090525617362995e-05, + "loss": 0.35364389419555664, + "num_tokens": 243032329.0, + "step": 269 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.3402616783384963, + "learning_rate": 2.002245032683378e-05, + "loss": 0.3219972252845764, + "num_tokens": 243800954.0, + "step": 270 + }, + { + "epoch": 2.0223880597014925, + "grad_norm": 0.37707313442099644, + "learning_rate": 1.9954278787644977e-05, + "loss": 0.3484679162502289, + "num_tokens": 244806412.0, + "step": 271 + }, + { + "epoch": 2.029850746268657, + "grad_norm": 0.39396702522268606, + "learning_rate": 1.988601286253949e-05, + "loss": 0.3331984877586365, + "num_tokens": 245802398.0, + "step": 272 + }, + { + "epoch": 2.0373134328358207, + "grad_norm": 0.40195308083874304, + "learning_rate": 1.9817654416839217e-05, + "loss": 0.3107374608516693, + "num_tokens": 246739297.0, + "step": 273 + }, + { + "epoch": 2.044776119402985, + "grad_norm": 0.35623399811084044, + "learning_rate": 1.9749205318394146e-05, + "loss": 0.3448570966720581, + "num_tokens": 247613231.0, + "step": 274 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.36546920760892426, + "learning_rate": 1.9680667437531283e-05, + "loss": 0.332324355840683, + "num_tokens": 248481475.0, + "step": 275 + }, + { + "epoch": 2.0597014925373136, + "grad_norm": 0.34431898026591723, + "learning_rate": 1.961204264700355e-05, + "loss": 0.3348411023616791, + "num_tokens": 249300938.0, + "step": 276 + }, + { + "epoch": 2.0671641791044775, + "grad_norm": 0.35727153061507005, + "learning_rate": 1.954333282193863e-05, + "loss": 0.33406710624694824, + "num_tokens": 250171263.0, + "step": 277 + }, + { + "epoch": 2.074626865671642, + "grad_norm": 0.3250548689859224, + "learning_rate": 1.9474539839787713e-05, + "loss": 0.3140842020511627, + "num_tokens": 251071115.0, + "step": 278 + }, + { + "epoch": 2.082089552238806, + "grad_norm": 0.3059178018916231, + "learning_rate": 1.9405665580274205e-05, + "loss": 0.32764101028442383, + "num_tokens": 251961398.0, + "step": 279 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.3265488394498236, + "learning_rate": 1.9336711925342357e-05, + "loss": 0.31429940462112427, + "num_tokens": 252775080.0, + "step": 280 + }, + { + "epoch": 2.0970149253731343, + "grad_norm": 0.3033003365838648, + "learning_rate": 1.926768075910586e-05, + "loss": 0.3364748954772949, + "num_tokens": 253678902.0, + "step": 281 + }, + { + "epoch": 2.1044776119402986, + "grad_norm": 0.3218060266076608, + "learning_rate": 1.919857396779633e-05, + "loss": 0.34063756465911865, + "num_tokens": 254547582.0, + "step": 282 + }, + { + "epoch": 2.111940298507463, + "grad_norm": 0.28752356415270264, + "learning_rate": 1.9129393439711812e-05, + "loss": 0.3032745122909546, + "num_tokens": 255299741.0, + "step": 283 + }, + { + "epoch": 2.1194029850746268, + "grad_norm": 0.3114027725704962, + "learning_rate": 1.906014106516515e-05, + "loss": 0.323519766330719, + "num_tokens": 256183942.0, + "step": 284 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.35567716347702344, + "learning_rate": 1.899081873643235e-05, + "loss": 0.3606981635093689, + "num_tokens": 257098083.0, + "step": 285 + }, + { + "epoch": 2.1343283582089554, + "grad_norm": 0.31558423890531895, + "learning_rate": 1.8921428347700853e-05, + "loss": 0.33504611253738403, + "num_tokens": 258138577.0, + "step": 286 + }, + { + "epoch": 2.1417910447761193, + "grad_norm": 0.34224186580930754, + "learning_rate": 1.8851971795017822e-05, + "loss": 0.326399028301239, + "num_tokens": 258888036.0, + "step": 287 + }, + { + "epoch": 2.1492537313432836, + "grad_norm": 0.30575598315812624, + "learning_rate": 1.8782450976238294e-05, + "loss": 0.3074103593826294, + "num_tokens": 259766509.0, + "step": 288 + }, + { + "epoch": 2.156716417910448, + "grad_norm": 0.3205831945487892, + "learning_rate": 1.8712867790973317e-05, + "loss": 0.33759474754333496, + "num_tokens": 260610097.0, + "step": 289 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.3023776868912514, + "learning_rate": 1.86432241405381e-05, + "loss": 0.3334404230117798, + "num_tokens": 261447212.0, + "step": 290 + }, + { + "epoch": 2.171641791044776, + "grad_norm": 0.30838933870298346, + "learning_rate": 1.8573521927900004e-05, + "loss": 0.32669875025749207, + "num_tokens": 262481613.0, + "step": 291 + }, + { + "epoch": 2.1791044776119404, + "grad_norm": 0.31402266902142234, + "learning_rate": 1.850376305762655e-05, + "loss": 0.35277265310287476, + "num_tokens": 263536437.0, + "step": 292 + }, + { + "epoch": 2.1865671641791047, + "grad_norm": 0.31931309491882254, + "learning_rate": 1.843394943583342e-05, + "loss": 0.32963383197784424, + "num_tokens": 264379962.0, + "step": 293 + }, + { + "epoch": 2.1940298507462686, + "grad_norm": 0.34845358198148824, + "learning_rate": 1.836408297013232e-05, + "loss": 0.3339906334877014, + "num_tokens": 265196630.0, + "step": 294 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.3046594968746612, + "learning_rate": 1.8294165569578902e-05, + "loss": 0.33100634813308716, + "num_tokens": 266192395.0, + "step": 295 + }, + { + "epoch": 2.208955223880597, + "grad_norm": 0.30699215790098994, + "learning_rate": 1.8224199144620557e-05, + "loss": 0.33232712745666504, + "num_tokens": 267198691.0, + "step": 296 + }, + { + "epoch": 2.216417910447761, + "grad_norm": 0.29968857683346356, + "learning_rate": 1.8154185607044267e-05, + "loss": 0.3363949656486511, + "num_tokens": 268129026.0, + "step": 297 + }, + { + "epoch": 2.2238805970149254, + "grad_norm": 0.2805025168393364, + "learning_rate": 1.8084126869924304e-05, + "loss": 0.32357555627822876, + "num_tokens": 269034104.0, + "step": 298 + }, + { + "epoch": 2.2313432835820897, + "grad_norm": 0.3030010408610569, + "learning_rate": 1.801402484757001e-05, + "loss": 0.31561556458473206, + "num_tokens": 269856471.0, + "step": 299 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.3195759723269335, + "learning_rate": 1.794388145547346e-05, + "loss": 0.34712180495262146, + "num_tokens": 270737041.0, + "step": 300 + }, + { + "epoch": 2.246268656716418, + "grad_norm": 0.3002118422900145, + "learning_rate": 1.7873698610257117e-05, + "loss": 0.35004639625549316, + "num_tokens": 271655450.0, + "step": 301 + }, + { + "epoch": 2.253731343283582, + "grad_norm": 0.28294722266034306, + "learning_rate": 1.7803478229621504e-05, + "loss": 0.3119392395019531, + "num_tokens": 272452734.0, + "step": 302 + }, + { + "epoch": 2.2611940298507465, + "grad_norm": 0.3751958180610849, + "learning_rate": 1.773322223229275e-05, + "loss": 0.3349981904029846, + "num_tokens": 273321732.0, + "step": 303 + }, + { + "epoch": 2.2686567164179103, + "grad_norm": 0.29383426672277096, + "learning_rate": 1.766293253797021e-05, + "loss": 0.3226167857646942, + "num_tokens": 274159747.0, + "step": 304 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.3225857187342117, + "learning_rate": 1.7592611067273947e-05, + "loss": 0.34066349267959595, + "num_tokens": 275031559.0, + "step": 305 + }, + { + "epoch": 2.283582089552239, + "grad_norm": 0.30673283102679866, + "learning_rate": 1.7522259741692343e-05, + "loss": 0.33413374423980713, + "num_tokens": 275930932.0, + "step": 306 + }, + { + "epoch": 2.291044776119403, + "grad_norm": 0.29177063821827953, + "learning_rate": 1.7451880483529507e-05, + "loss": 0.308035671710968, + "num_tokens": 276741084.0, + "step": 307 + }, + { + "epoch": 2.298507462686567, + "grad_norm": 0.28427282903782, + "learning_rate": 1.7381475215852805e-05, + "loss": 0.3250593841075897, + "num_tokens": 277707588.0, + "step": 308 + }, + { + "epoch": 2.3059701492537314, + "grad_norm": 0.2971627244171146, + "learning_rate": 1.7311045862440298e-05, + "loss": 0.32269105315208435, + "num_tokens": 278703194.0, + "step": 309 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.31555926494620046, + "learning_rate": 1.724059434772816e-05, + "loss": 0.32977578043937683, + "num_tokens": 279491539.0, + "step": 310 + }, + { + "epoch": 2.3208955223880596, + "grad_norm": 0.36417590081584483, + "learning_rate": 1.7170122596758127e-05, + "loss": 0.33532094955444336, + "num_tokens": 280324630.0, + "step": 311 + }, + { + "epoch": 2.328358208955224, + "grad_norm": 0.3105391565464644, + "learning_rate": 1.7099632535124854e-05, + "loss": 0.3156779408454895, + "num_tokens": 281308248.0, + "step": 312 + }, + { + "epoch": 2.3358208955223883, + "grad_norm": 0.2749240967299516, + "learning_rate": 1.702912608892335e-05, + "loss": 0.31482142210006714, + "num_tokens": 282221715.0, + "step": 313 + }, + { + "epoch": 2.343283582089552, + "grad_norm": 0.29515283586141233, + "learning_rate": 1.6958605184696297e-05, + "loss": 0.32622820138931274, + "num_tokens": 283077823.0, + "step": 314 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.2968928416618244, + "learning_rate": 1.688807174938145e-05, + "loss": 0.3397972583770752, + "num_tokens": 284064121.0, + "step": 315 + }, + { + "epoch": 2.3582089552238807, + "grad_norm": 0.29827012810037, + "learning_rate": 1.681752771025896e-05, + "loss": 0.3332856297492981, + "num_tokens": 285080424.0, + "step": 316 + }, + { + "epoch": 2.3656716417910446, + "grad_norm": 0.3039705351616898, + "learning_rate": 1.674697499489872e-05, + "loss": 0.33647334575653076, + "num_tokens": 286006199.0, + "step": 317 + }, + { + "epoch": 2.373134328358209, + "grad_norm": 0.30141732787530867, + "learning_rate": 1.6676415531107706e-05, + "loss": 0.3342139720916748, + "num_tokens": 286965514.0, + "step": 318 + }, + { + "epoch": 2.3805970149253732, + "grad_norm": 0.2956530210848347, + "learning_rate": 1.6605851246877272e-05, + "loss": 0.3201013207435608, + "num_tokens": 287842439.0, + "step": 319 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.2940637214016598, + "learning_rate": 1.65352840703305e-05, + "loss": 0.3377227783203125, + "num_tokens": 288763923.0, + "step": 320 + }, + { + "epoch": 2.3955223880597014, + "grad_norm": 0.2832996218159561, + "learning_rate": 1.64647159296695e-05, + "loss": 0.3385891020298004, + "num_tokens": 289625254.0, + "step": 321 + }, + { + "epoch": 2.4029850746268657, + "grad_norm": 0.3190448740603143, + "learning_rate": 1.6394148753122734e-05, + "loss": 0.33053308725357056, + "num_tokens": 290474425.0, + "step": 322 + }, + { + "epoch": 2.41044776119403, + "grad_norm": 0.3096387349106184, + "learning_rate": 1.63235844688923e-05, + "loss": 0.3427371680736542, + "num_tokens": 291335951.0, + "step": 323 + }, + { + "epoch": 2.417910447761194, + "grad_norm": 0.27491757978825115, + "learning_rate": 1.6253025005101283e-05, + "loss": 0.3303934931755066, + "num_tokens": 292257658.0, + "step": 324 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.3066415534546823, + "learning_rate": 1.6182472289741043e-05, + "loss": 0.36399906873703003, + "num_tokens": 293162733.0, + "step": 325 + }, + { + "epoch": 2.4328358208955225, + "grad_norm": 0.29140134988200495, + "learning_rate": 1.611192825061855e-05, + "loss": 0.3504979610443115, + "num_tokens": 294199419.0, + "step": 326 + }, + { + "epoch": 2.4402985074626864, + "grad_norm": 0.2895038992576744, + "learning_rate": 1.604139481530371e-05, + "loss": 0.35671094059944153, + "num_tokens": 295163721.0, + "step": 327 + }, + { + "epoch": 2.4477611940298507, + "grad_norm": 0.2871110161885208, + "learning_rate": 1.5970873911076654e-05, + "loss": 0.3230712115764618, + "num_tokens": 296048485.0, + "step": 328 + }, + { + "epoch": 2.455223880597015, + "grad_norm": 0.29355297538880015, + "learning_rate": 1.590036746487515e-05, + "loss": 0.32808297872543335, + "num_tokens": 296905697.0, + "step": 329 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.2951986083005398, + "learning_rate": 1.5829877403241875e-05, + "loss": 0.3399554491043091, + "num_tokens": 297837804.0, + "step": 330 + }, + { + "epoch": 2.470149253731343, + "grad_norm": 0.29400144153471874, + "learning_rate": 1.5759405652271843e-05, + "loss": 0.33751606941223145, + "num_tokens": 298822600.0, + "step": 331 + }, + { + "epoch": 2.4776119402985075, + "grad_norm": 0.31657313495586964, + "learning_rate": 1.5688954137559705e-05, + "loss": 0.35242465138435364, + "num_tokens": 299764042.0, + "step": 332 + }, + { + "epoch": 2.485074626865672, + "grad_norm": 0.2716779461194812, + "learning_rate": 1.5618524784147197e-05, + "loss": 0.3363187313079834, + "num_tokens": 300754135.0, + "step": 333 + }, + { + "epoch": 2.4925373134328357, + "grad_norm": 0.29632964201216716, + "learning_rate": 1.5548119516470496e-05, + "loss": 0.3306392431259155, + "num_tokens": 301644488.0, + "step": 334 + }, + { + "epoch": 2.5, + "grad_norm": 0.27058966408716395, + "learning_rate": 1.547774025830766e-05, + "loss": 0.31814491748809814, + "num_tokens": 302538046.0, + "step": 335 + }, + { + "epoch": 2.5074626865671643, + "grad_norm": 0.3055781987611692, + "learning_rate": 1.5407388932726056e-05, + "loss": 0.3387256860733032, + "num_tokens": 303333898.0, + "step": 336 + }, + { + "epoch": 2.5149253731343286, + "grad_norm": 0.27960594879506695, + "learning_rate": 1.53370674620298e-05, + "loss": 0.33688774704933167, + "num_tokens": 304300483.0, + "step": 337 + }, + { + "epoch": 2.5223880597014925, + "grad_norm": 0.3168292150331439, + "learning_rate": 1.526677776770725e-05, + "loss": 0.34352821111679077, + "num_tokens": 305176138.0, + "step": 338 + }, + { + "epoch": 2.529850746268657, + "grad_norm": 0.40646030417402895, + "learning_rate": 1.5196521770378498e-05, + "loss": 0.3636009693145752, + "num_tokens": 306092248.0, + "step": 339 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.34451254015626254, + "learning_rate": 1.5126301389742889e-05, + "loss": 0.3361930251121521, + "num_tokens": 306939786.0, + "step": 340 + }, + { + "epoch": 2.544776119402985, + "grad_norm": 0.301191033765102, + "learning_rate": 1.5056118544526552e-05, + "loss": 0.34493589401245117, + "num_tokens": 307676111.0, + "step": 341 + }, + { + "epoch": 2.5522388059701493, + "grad_norm": 0.3179102063972407, + "learning_rate": 1.4985975152429998e-05, + "loss": 0.35899001359939575, + "num_tokens": 308557757.0, + "step": 342 + }, + { + "epoch": 2.5597014925373136, + "grad_norm": 0.3171583352212965, + "learning_rate": 1.4915873130075704e-05, + "loss": 0.3521921634674072, + "num_tokens": 309465811.0, + "step": 343 + }, + { + "epoch": 2.5671641791044775, + "grad_norm": 0.3150146379597525, + "learning_rate": 1.484581439295574e-05, + "loss": 0.3577362895011902, + "num_tokens": 310383391.0, + "step": 344 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.3129412764272567, + "learning_rate": 1.4775800855379447e-05, + "loss": 0.33559077978134155, + "num_tokens": 311194322.0, + "step": 345 + }, + { + "epoch": 2.582089552238806, + "grad_norm": 0.3119289397564452, + "learning_rate": 1.4705834430421109e-05, + "loss": 0.3442152142524719, + "num_tokens": 312296357.0, + "step": 346 + }, + { + "epoch": 2.58955223880597, + "grad_norm": 0.30424883379817386, + "learning_rate": 1.4635917029867686e-05, + "loss": 0.3301926851272583, + "num_tokens": 313212850.0, + "step": 347 + }, + { + "epoch": 2.5970149253731343, + "grad_norm": 0.2888937621740727, + "learning_rate": 1.4566050564166585e-05, + "loss": 0.3173384368419647, + "num_tokens": 314136793.0, + "step": 348 + }, + { + "epoch": 2.6044776119402986, + "grad_norm": 0.3181049412726844, + "learning_rate": 1.4496236942373452e-05, + "loss": 0.33396849036216736, + "num_tokens": 315103742.0, + "step": 349 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.285849757305298, + "learning_rate": 1.4426478072100001e-05, + "loss": 0.3271850645542145, + "num_tokens": 315911989.0, + "step": 350 + }, + { + "epoch": 2.6194029850746268, + "grad_norm": 0.30462072449655236, + "learning_rate": 1.4356775859461898e-05, + "loss": 0.3309672474861145, + "num_tokens": 316818398.0, + "step": 351 + }, + { + "epoch": 2.626865671641791, + "grad_norm": 0.28242189700779124, + "learning_rate": 1.4287132209026686e-05, + "loss": 0.3406432271003723, + "num_tokens": 317815953.0, + "step": 352 + }, + { + "epoch": 2.6343283582089554, + "grad_norm": 0.30367294764460456, + "learning_rate": 1.4217549023761713e-05, + "loss": 0.33886873722076416, + "num_tokens": 318782328.0, + "step": 353 + }, + { + "epoch": 2.6417910447761193, + "grad_norm": 0.2986406693990765, + "learning_rate": 1.4148028204982184e-05, + "loss": 0.3135310113430023, + "num_tokens": 319721759.0, + "step": 354 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.26230985959231096, + "learning_rate": 1.407857165229915e-05, + "loss": 0.3319952189922333, + "num_tokens": 320632767.0, + "step": 355 + }, + { + "epoch": 2.656716417910448, + "grad_norm": 0.29286908776153336, + "learning_rate": 1.4009181263567659e-05, + "loss": 0.33468297123908997, + "num_tokens": 321567293.0, + "step": 356 + }, + { + "epoch": 2.664179104477612, + "grad_norm": 0.2655369943810491, + "learning_rate": 1.3939858934834851e-05, + "loss": 0.31415632367134094, + "num_tokens": 322466432.0, + "step": 357 + }, + { + "epoch": 2.671641791044776, + "grad_norm": 0.29776986063827793, + "learning_rate": 1.3870606560288188e-05, + "loss": 0.32620397210121155, + "num_tokens": 323416159.0, + "step": 358 + }, + { + "epoch": 2.6791044776119404, + "grad_norm": 0.2888554358463497, + "learning_rate": 1.3801426032203668e-05, + "loss": 0.3294253945350647, + "num_tokens": 324280115.0, + "step": 359 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.2743974222493521, + "learning_rate": 1.3732319240894143e-05, + "loss": 0.33846813440322876, + "num_tokens": 325182095.0, + "step": 360 + }, + { + "epoch": 2.6940298507462686, + "grad_norm": 0.28798464786719813, + "learning_rate": 1.3663288074657639e-05, + "loss": 0.32448339462280273, + "num_tokens": 326171068.0, + "step": 361 + }, + { + "epoch": 2.701492537313433, + "grad_norm": 0.24943230534603614, + "learning_rate": 1.3594334419725797e-05, + "loss": 0.3398998975753784, + "num_tokens": 327115635.0, + "step": 362 + }, + { + "epoch": 2.708955223880597, + "grad_norm": 0.2855896503061799, + "learning_rate": 1.3525460160212284e-05, + "loss": 0.3351544141769409, + "num_tokens": 328060133.0, + "step": 363 + }, + { + "epoch": 2.716417910447761, + "grad_norm": 0.2981015005997933, + "learning_rate": 1.3456667178061365e-05, + "loss": 0.3235108256340027, + "num_tokens": 328868585.0, + "step": 364 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.3017533668551756, + "learning_rate": 1.3387957352996446e-05, + "loss": 0.34303897619247437, + "num_tokens": 329676478.0, + "step": 365 + }, + { + "epoch": 2.7313432835820897, + "grad_norm": 0.2793280893422549, + "learning_rate": 1.3319332562468716e-05, + "loss": 0.3332846164703369, + "num_tokens": 330487275.0, + "step": 366 + }, + { + "epoch": 2.7388059701492535, + "grad_norm": 0.272656727703741, + "learning_rate": 1.3250794681605853e-05, + "loss": 0.3316395878791809, + "num_tokens": 331339930.0, + "step": 367 + }, + { + "epoch": 2.746268656716418, + "grad_norm": 0.2742961273683912, + "learning_rate": 1.3182345583160782e-05, + "loss": 0.3241080045700073, + "num_tokens": 332357238.0, + "step": 368 + }, + { + "epoch": 2.753731343283582, + "grad_norm": 0.26762807579168846, + "learning_rate": 1.3113987137460514e-05, + "loss": 0.331865131855011, + "num_tokens": 333294492.0, + "step": 369 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.28820933208703176, + "learning_rate": 1.3045721212355023e-05, + "loss": 0.35760703682899475, + "num_tokens": 334107753.0, + "step": 370 + }, + { + "epoch": 2.7686567164179103, + "grad_norm": 0.26979220761978373, + "learning_rate": 1.2977549673166228e-05, + "loss": 0.3278617858886719, + "num_tokens": 334989082.0, + "step": 371 + }, + { + "epoch": 2.7761194029850746, + "grad_norm": 0.27879196286904034, + "learning_rate": 1.2909474382637006e-05, + "loss": 0.33124369382858276, + "num_tokens": 335901082.0, + "step": 372 + }, + { + "epoch": 2.783582089552239, + "grad_norm": 0.2463893540212004, + "learning_rate": 1.2841497200880305e-05, + "loss": 0.32943689823150635, + "num_tokens": 336958851.0, + "step": 373 + }, + { + "epoch": 2.791044776119403, + "grad_norm": 0.27477456461332017, + "learning_rate": 1.2773619985328323e-05, + "loss": 0.3239135444164276, + "num_tokens": 337786409.0, + "step": 374 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.28497614476087085, + "learning_rate": 1.2705844590681726e-05, + "loss": 0.3271849453449249, + "num_tokens": 338694981.0, + "step": 375 + }, + { + "epoch": 2.8059701492537314, + "grad_norm": 0.2777009652008523, + "learning_rate": 1.2638172868859015e-05, + "loss": 0.31704217195510864, + "num_tokens": 339501927.0, + "step": 376 + }, + { + "epoch": 2.8134328358208958, + "grad_norm": 0.30766613572700274, + "learning_rate": 1.2570606668945877e-05, + "loss": 0.35138726234436035, + "num_tokens": 340423876.0, + "step": 377 + }, + { + "epoch": 2.8208955223880596, + "grad_norm": 0.24806225558937847, + "learning_rate": 1.2503147837144702e-05, + "loss": 0.31420814990997314, + "num_tokens": 341285598.0, + "step": 378 + }, + { + "epoch": 2.828358208955224, + "grad_norm": 0.2847378795760287, + "learning_rate": 1.2435798216724094e-05, + "loss": 0.32901105284690857, + "num_tokens": 342213168.0, + "step": 379 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.27949658273947187, + "learning_rate": 1.2368559647968544e-05, + "loss": 0.34290027618408203, + "num_tokens": 343216441.0, + "step": 380 + }, + { + "epoch": 2.843283582089552, + "grad_norm": 0.27303724081659647, + "learning_rate": 1.2301433968128127e-05, + "loss": 0.3377082645893097, + "num_tokens": 344164273.0, + "step": 381 + }, + { + "epoch": 2.8507462686567164, + "grad_norm": 0.28475093005317836, + "learning_rate": 1.2234423011368292e-05, + "loss": 0.3300044536590576, + "num_tokens": 345034929.0, + "step": 382 + }, + { + "epoch": 2.8582089552238807, + "grad_norm": 0.280519214961473, + "learning_rate": 1.2167528608719768e-05, + "loss": 0.3426816463470459, + "num_tokens": 345822215.0, + "step": 383 + }, + { + "epoch": 2.8656716417910446, + "grad_norm": 0.27103397464423407, + "learning_rate": 1.2100752588028507e-05, + "loss": 0.33561939001083374, + "num_tokens": 346779144.0, + "step": 384 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.26428076882187357, + "learning_rate": 1.2034096773905753e-05, + "loss": 0.3420035243034363, + "num_tokens": 347750581.0, + "step": 385 + }, + { + "epoch": 2.8805970149253732, + "grad_norm": 0.30501014590148545, + "learning_rate": 1.196756298767819e-05, + "loss": 0.33400657773017883, + "num_tokens": 348809613.0, + "step": 386 + }, + { + "epoch": 2.888059701492537, + "grad_norm": 0.24697890321618382, + "learning_rate": 1.1901153047338168e-05, + "loss": 0.3329269289970398, + "num_tokens": 349843341.0, + "step": 387 + }, + { + "epoch": 2.8955223880597014, + "grad_norm": 0.266310278361214, + "learning_rate": 1.1834868767494028e-05, + "loss": 0.3315233588218689, + "num_tokens": 350686011.0, + "step": 388 + }, + { + "epoch": 2.9029850746268657, + "grad_norm": 0.25685951719776035, + "learning_rate": 1.1768711959320512e-05, + "loss": 0.3367440104484558, + "num_tokens": 351603297.0, + "step": 389 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.3604332672305553, + "learning_rate": 1.1702684430509298e-05, + "loss": 0.35349708795547485, + "num_tokens": 352566195.0, + "step": 390 + }, + { + "epoch": 2.917910447761194, + "grad_norm": 0.2935692512218851, + "learning_rate": 1.1636787985219572e-05, + "loss": 0.3288194537162781, + "num_tokens": 353185236.0, + "step": 391 + }, + { + "epoch": 2.925373134328358, + "grad_norm": 0.2815859488295857, + "learning_rate": 1.1571024424028761e-05, + "loss": 0.339729905128479, + "num_tokens": 354050628.0, + "step": 392 + }, + { + "epoch": 2.9328358208955225, + "grad_norm": 0.275808180586563, + "learning_rate": 1.1505395543883313e-05, + "loss": 0.3455864489078522, + "num_tokens": 354968219.0, + "step": 393 + }, + { + "epoch": 2.9402985074626864, + "grad_norm": 0.25420785215211034, + "learning_rate": 1.143990313804961e-05, + "loss": 0.33193981647491455, + "num_tokens": 355907268.0, + "step": 394 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.2700179741152324, + "learning_rate": 1.1374548996064953e-05, + "loss": 0.32135769724845886, + "num_tokens": 356786243.0, + "step": 395 + }, + { + "epoch": 2.955223880597015, + "grad_norm": 0.2881492550060451, + "learning_rate": 1.1309334903688686e-05, + "loss": 0.33170467615127563, + "num_tokens": 357742891.0, + "step": 396 + }, + { + "epoch": 2.9626865671641793, + "grad_norm": 0.267258769627609, + "learning_rate": 1.1244262642853383e-05, + "loss": 0.3263099193572998, + "num_tokens": 358521016.0, + "step": 397 + }, + { + "epoch": 2.970149253731343, + "grad_norm": 0.27461845486227027, + "learning_rate": 1.1179333991616162e-05, + "loss": 0.31942278146743774, + "num_tokens": 359455120.0, + "step": 398 + }, + { + "epoch": 2.9776119402985075, + "grad_norm": 0.28304959654627004, + "learning_rate": 1.1114550724110105e-05, + "loss": 0.3328409790992737, + "num_tokens": 360361804.0, + "step": 399 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.25788972512908753, + "learning_rate": 1.1049914610495772e-05, + "loss": 0.3321342468261719, + "num_tokens": 361424683.0, + "step": 400 + }, + { + "epoch": 2.9925373134328357, + "grad_norm": 0.28331510950003724, + "learning_rate": 1.0985427416912853e-05, + "loss": 0.33656731247901917, + "num_tokens": 362323989.0, + "step": 401 + }, + { + "epoch": 3.0, + "grad_norm": 0.3020936447432793, + "learning_rate": 1.0921090905431871e-05, + "loss": 0.33412468433380127, + "num_tokens": 363125328.0, + "step": 402 + }, + { + "epoch": 3.0074626865671643, + "grad_norm": 0.3799465813612812, + "learning_rate": 1.0856906834006088e-05, + "loss": 0.2873135805130005, + "num_tokens": 363894208.0, + "step": 403 + }, + { + "epoch": 3.014925373134328, + "grad_norm": 0.36967050447420124, + "learning_rate": 1.079287695642342e-05, + "loss": 0.2959785461425781, + "num_tokens": 364737491.0, + "step": 404 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.29461417590711114, + "learning_rate": 1.0729003022258542e-05, + "loss": 0.29170793294906616, + "num_tokens": 365722761.0, + "step": 405 + }, + { + "epoch": 3.029850746268657, + "grad_norm": 0.4116803087373601, + "learning_rate": 1.0665286776825081e-05, + "loss": 0.30883458256721497, + "num_tokens": 366512957.0, + "step": 406 + }, + { + "epoch": 3.0373134328358207, + "grad_norm": 0.46157786514533145, + "learning_rate": 1.0601729961127924e-05, + "loss": 0.30715805292129517, + "num_tokens": 367415626.0, + "step": 407 + }, + { + "epoch": 3.044776119402985, + "grad_norm": 0.4187156114489574, + "learning_rate": 1.0538334311815627e-05, + "loss": 0.31521543860435486, + "num_tokens": 368197609.0, + "step": 408 + }, + { + "epoch": 3.0522388059701493, + "grad_norm": 0.3101860773424548, + "learning_rate": 1.0475101561133e-05, + "loss": 0.2965121269226074, + "num_tokens": 369065047.0, + "step": 409 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.33242255182112, + "learning_rate": 1.0412033436873744e-05, + "loss": 0.2895386815071106, + "num_tokens": 370031828.0, + "step": 410 + }, + { + "epoch": 3.0671641791044775, + "grad_norm": 0.33876504585540845, + "learning_rate": 1.0349131662333255e-05, + "loss": 0.3026469647884369, + "num_tokens": 370964850.0, + "step": 411 + }, + { + "epoch": 3.074626865671642, + "grad_norm": 0.30688475151658845, + "learning_rate": 1.0286397956261533e-05, + "loss": 0.2771751582622528, + "num_tokens": 371789883.0, + "step": 412 + }, + { + "epoch": 3.082089552238806, + "grad_norm": 0.29360163117574556, + "learning_rate": 1.0223834032816198e-05, + "loss": 0.3152085840702057, + "num_tokens": 372663206.0, + "step": 413 + }, + { + "epoch": 3.08955223880597, + "grad_norm": 0.31015233222700656, + "learning_rate": 1.0161441601515695e-05, + "loss": 0.2951708137989044, + "num_tokens": 373488698.0, + "step": 414 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.3610813886577992, + "learning_rate": 1.0099222367192547e-05, + "loss": 0.3165642321109772, + "num_tokens": 374309008.0, + "step": 415 + }, + { + "epoch": 3.1044776119402986, + "grad_norm": 0.3051632607105892, + "learning_rate": 1.0037178029946785e-05, + "loss": 0.2940051853656769, + "num_tokens": 375243569.0, + "step": 416 + }, + { + "epoch": 3.111940298507463, + "grad_norm": 0.28089972063969026, + "learning_rate": 9.975310285099484e-06, + "loss": 0.30177193880081177, + "num_tokens": 376203415.0, + "step": 417 + }, + { + "epoch": 3.1194029850746268, + "grad_norm": 0.25054754752277697, + "learning_rate": 9.913620823146451e-06, + "loss": 0.2875446081161499, + "num_tokens": 377153421.0, + "step": 418 + }, + { + "epoch": 3.126865671641791, + "grad_norm": 0.26784113270221377, + "learning_rate": 9.852111329712039e-06, + "loss": 0.30190229415893555, + "num_tokens": 378087276.0, + "step": 419 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.2708038955849966, + "learning_rate": 9.790783485503063e-06, + "loss": 0.27638930082321167, + "num_tokens": 378977281.0, + "step": 420 + }, + { + "epoch": 3.1417910447761193, + "grad_norm": 0.29895669435540717, + "learning_rate": 9.729638966262907e-06, + "loss": 0.29848071932792664, + "num_tokens": 379899880.0, + "step": 421 + }, + { + "epoch": 3.1492537313432836, + "grad_norm": 0.2554802919348738, + "learning_rate": 9.668679442725697e-06, + "loss": 0.27390313148498535, + "num_tokens": 380749969.0, + "step": 422 + }, + { + "epoch": 3.156716417910448, + "grad_norm": 0.3050926022188745, + "learning_rate": 9.607906580570695e-06, + "loss": 0.2757868468761444, + "num_tokens": 381625559.0, + "step": 423 + }, + { + "epoch": 3.1641791044776117, + "grad_norm": 0.2679289456473412, + "learning_rate": 9.54732204037675e-06, + "loss": 0.284029483795166, + "num_tokens": 382524515.0, + "step": 424 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.3337923481748472, + "learning_rate": 9.486927477576948e-06, + "loss": 0.2807900905609131, + "num_tokens": 383460945.0, + "step": 425 + }, + { + "epoch": 3.1791044776119404, + "grad_norm": 0.26612009858515995, + "learning_rate": 9.426724542413345e-06, + "loss": 0.273318886756897, + "num_tokens": 384264130.0, + "step": 426 + }, + { + "epoch": 3.1865671641791047, + "grad_norm": 0.3524087992151756, + "learning_rate": 9.366714879891915e-06, + "loss": 0.3047345280647278, + "num_tokens": 385268579.0, + "step": 427 + }, + { + "epoch": 3.1940298507462686, + "grad_norm": 0.27293635594516935, + "learning_rate": 9.306900129737579e-06, + "loss": 0.2729998230934143, + "num_tokens": 386028916.0, + "step": 428 + }, + { + "epoch": 3.201492537313433, + "grad_norm": 0.2758767208976419, + "learning_rate": 9.2472819263494e-06, + "loss": 0.2999764680862427, + "num_tokens": 386990098.0, + "step": 429 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.2600988814863155, + "learning_rate": 9.187861898755944e-06, + "loss": 0.28329452872276306, + "num_tokens": 387863679.0, + "step": 430 + }, + { + "epoch": 3.216417910447761, + "grad_norm": 0.2692034268302841, + "learning_rate": 9.128641670570722e-06, + "loss": 0.29894596338272095, + "num_tokens": 388670162.0, + "step": 431 + }, + { + "epoch": 3.2238805970149254, + "grad_norm": 0.26505479450449865, + "learning_rate": 9.069622859947886e-06, + "loss": 0.28377240896224976, + "num_tokens": 389520124.0, + "step": 432 + }, + { + "epoch": 3.2313432835820897, + "grad_norm": 0.27522585261540833, + "learning_rate": 9.010807079537969e-06, + "loss": 0.30390995740890503, + "num_tokens": 390462131.0, + "step": 433 + }, + { + "epoch": 3.2388059701492535, + "grad_norm": 0.2716269898933067, + "learning_rate": 8.952195936443843e-06, + "loss": 0.28739655017852783, + "num_tokens": 391294528.0, + "step": 434 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.24102893105996573, + "learning_rate": 8.893791032176798e-06, + "loss": 0.27352797985076904, + "num_tokens": 392296640.0, + "step": 435 + }, + { + "epoch": 3.253731343283582, + "grad_norm": 0.2648291528044443, + "learning_rate": 8.835593962612773e-06, + "loss": 0.2909316122531891, + "num_tokens": 393156418.0, + "step": 436 + }, + { + "epoch": 3.2611940298507465, + "grad_norm": 0.28306499151263154, + "learning_rate": 8.777606317948772e-06, + "loss": 0.2992030084133148, + "num_tokens": 394033667.0, + "step": 437 + }, + { + "epoch": 3.2686567164179103, + "grad_norm": 0.2716112435631178, + "learning_rate": 8.719829682659399e-06, + "loss": 0.2813768982887268, + "num_tokens": 394903535.0, + "step": 438 + }, + { + "epoch": 3.2761194029850746, + "grad_norm": 0.27512395684649504, + "learning_rate": 8.662265635453547e-06, + "loss": 0.29536497592926025, + "num_tokens": 395846549.0, + "step": 439 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.2989586716757814, + "learning_rate": 8.604915749231298e-06, + "loss": 0.2988872826099396, + "num_tokens": 396737205.0, + "step": 440 + }, + { + "epoch": 3.291044776119403, + "grad_norm": 0.2883428566171811, + "learning_rate": 8.54778159104092e-06, + "loss": 0.2863343358039856, + "num_tokens": 397630057.0, + "step": 441 + }, + { + "epoch": 3.298507462686567, + "grad_norm": 0.28465714524566543, + "learning_rate": 8.490864722036045e-06, + "loss": 0.29591017961502075, + "num_tokens": 398582978.0, + "step": 442 + }, + { + "epoch": 3.3059701492537314, + "grad_norm": 0.27001645219378384, + "learning_rate": 8.434166697433034e-06, + "loss": 0.28916236758232117, + "num_tokens": 399421334.0, + "step": 443 + }, + { + "epoch": 3.3134328358208958, + "grad_norm": 0.2636502279817121, + "learning_rate": 8.377689066468452e-06, + "loss": 0.2919909954071045, + "num_tokens": 400404286.0, + "step": 444 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.26433758679315084, + "learning_rate": 8.321433372356756e-06, + "loss": 0.29081422090530396, + "num_tokens": 401357244.0, + "step": 445 + }, + { + "epoch": 3.328358208955224, + "grad_norm": 0.2615424811114582, + "learning_rate": 8.26540115224813e-06, + "loss": 0.29471999406814575, + "num_tokens": 402272150.0, + "step": 446 + }, + { + "epoch": 3.3358208955223883, + "grad_norm": 0.2774834257770196, + "learning_rate": 8.209593937186475e-06, + "loss": 0.3036431670188904, + "num_tokens": 403288360.0, + "step": 447 + }, + { + "epoch": 3.343283582089552, + "grad_norm": 0.2615528181357639, + "learning_rate": 8.154013252067565e-06, + "loss": 0.28283798694610596, + "num_tokens": 404201834.0, + "step": 448 + }, + { + "epoch": 3.3507462686567164, + "grad_norm": 0.26020037614375063, + "learning_rate": 8.098660615597401e-06, + "loss": 0.2982422113418579, + "num_tokens": 405227526.0, + "step": 449 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.24108662753109747, + "learning_rate": 8.043537540250705e-06, + "loss": 0.2861343026161194, + "num_tokens": 406200774.0, + "step": 450 + }, + { + "epoch": 3.3656716417910446, + "grad_norm": 0.2739099235918602, + "learning_rate": 7.988645532229581e-06, + "loss": 0.2993728816509247, + "num_tokens": 407124735.0, + "step": 451 + }, + { + "epoch": 3.373134328358209, + "grad_norm": 0.247255433646386, + "learning_rate": 7.933986091422379e-06, + "loss": 0.26630109548568726, + "num_tokens": 407967520.0, + "step": 452 + }, + { + "epoch": 3.3805970149253732, + "grad_norm": 0.26491478815853126, + "learning_rate": 7.879560711362696e-06, + "loss": 0.2873428463935852, + "num_tokens": 408873357.0, + "step": 453 + }, + { + "epoch": 3.388059701492537, + "grad_norm": 0.2894069285105355, + "learning_rate": 7.825370879188569e-06, + "loss": 0.28855782747268677, + "num_tokens": 409780883.0, + "step": 454 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.2492281531289753, + "learning_rate": 7.771418075601852e-06, + "loss": 0.28437167406082153, + "num_tokens": 410746305.0, + "step": 455 + }, + { + "epoch": 3.4029850746268657, + "grad_norm": 0.24355852796907684, + "learning_rate": 7.71770377482774e-06, + "loss": 0.27994096279144287, + "num_tokens": 411680660.0, + "step": 456 + }, + { + "epoch": 3.41044776119403, + "grad_norm": 0.2700214986931186, + "learning_rate": 7.664229444574492e-06, + "loss": 0.2921644449234009, + "num_tokens": 412605533.0, + "step": 457 + }, + { + "epoch": 3.417910447761194, + "grad_norm": 0.26147389049318864, + "learning_rate": 7.610996545993334e-06, + "loss": 0.2780182957649231, + "num_tokens": 413578521.0, + "step": 458 + }, + { + "epoch": 3.425373134328358, + "grad_norm": 0.29826386143822425, + "learning_rate": 7.558006533638531e-06, + "loss": 0.2961535155773163, + "num_tokens": 414502174.0, + "step": 459 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.3037556792472721, + "learning_rate": 7.505260855427631e-06, + "loss": 0.2871173024177551, + "num_tokens": 415404496.0, + "step": 460 + }, + { + "epoch": 3.4402985074626864, + "grad_norm": 0.27538452886466275, + "learning_rate": 7.452760952601926e-06, + "loss": 0.29723048210144043, + "num_tokens": 416329218.0, + "step": 461 + }, + { + "epoch": 3.4477611940298507, + "grad_norm": 0.27152400208894184, + "learning_rate": 7.400508259687034e-06, + "loss": 0.28178274631500244, + "num_tokens": 417169036.0, + "step": 462 + }, + { + "epoch": 3.455223880597015, + "grad_norm": 0.26056261717807916, + "learning_rate": 7.3485042044537425e-06, + "loss": 0.28464025259017944, + "num_tokens": 418088260.0, + "step": 463 + }, + { + "epoch": 3.4626865671641793, + "grad_norm": 0.27386142278491205, + "learning_rate": 7.296750207878967e-06, + "loss": 0.29148146510124207, + "num_tokens": 418913562.0, + "step": 464 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.2854422365181894, + "learning_rate": 7.2452476841069365e-06, + "loss": 0.30438661575317383, + "num_tokens": 419816417.0, + "step": 465 + }, + { + "epoch": 3.4776119402985075, + "grad_norm": 0.2555489664737189, + "learning_rate": 7.193998040410553e-06, + "loss": 0.3044406771659851, + "num_tokens": 420724216.0, + "step": 466 + }, + { + "epoch": 3.485074626865672, + "grad_norm": 0.25726240443773163, + "learning_rate": 7.143002677152923e-06, + "loss": 0.28696900606155396, + "num_tokens": 421711967.0, + "step": 467 + }, + { + "epoch": 3.4925373134328357, + "grad_norm": 0.23933852458046884, + "learning_rate": 7.092262987749115e-06, + "loss": 0.28907179832458496, + "num_tokens": 422655373.0, + "step": 468 + }, + { + "epoch": 3.5, + "grad_norm": 0.25521858687923527, + "learning_rate": 7.041780358628076e-06, + "loss": 0.2952384948730469, + "num_tokens": 423645388.0, + "step": 469 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.2611107763482694, + "learning_rate": 6.991556169194752e-06, + "loss": 0.29364389181137085, + "num_tokens": 424569069.0, + "step": 470 + }, + { + "epoch": 3.5149253731343286, + "grad_norm": 0.25648032981059116, + "learning_rate": 6.941591791792378e-06, + "loss": 0.29367825388908386, + "num_tokens": 425492189.0, + "step": 471 + }, + { + "epoch": 3.5223880597014925, + "grad_norm": 0.2622850225964596, + "learning_rate": 6.8918885916650105e-06, + "loss": 0.29189831018447876, + "num_tokens": 426458959.0, + "step": 472 + }, + { + "epoch": 3.529850746268657, + "grad_norm": 0.2570163822819945, + "learning_rate": 6.842447926920199e-06, + "loss": 0.2819617688655853, + "num_tokens": 427323105.0, + "step": 473 + }, + { + "epoch": 3.5373134328358207, + "grad_norm": 0.29549281458059196, + "learning_rate": 6.793271148491887e-06, + "loss": 0.303572416305542, + "num_tokens": 428282716.0, + "step": 474 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.2802879385897247, + "learning_rate": 6.7443596001035025e-06, + "loss": 0.31654465198516846, + "num_tokens": 429252409.0, + "step": 475 + }, + { + "epoch": 3.5522388059701493, + "grad_norm": 0.24794767420280742, + "learning_rate": 6.6957146182312175e-06, + "loss": 0.2985188364982605, + "num_tokens": 430179989.0, + "step": 476 + }, + { + "epoch": 3.5597014925373136, + "grad_norm": 0.26530731793672097, + "learning_rate": 6.647337532067467e-06, + "loss": 0.2864232063293457, + "num_tokens": 431131078.0, + "step": 477 + }, + { + "epoch": 3.5671641791044775, + "grad_norm": 0.26007394216176255, + "learning_rate": 6.599229663484598e-06, + "loss": 0.31048181653022766, + "num_tokens": 432118357.0, + "step": 478 + }, + { + "epoch": 3.574626865671642, + "grad_norm": 0.2561363907813142, + "learning_rate": 6.551392326998776e-06, + "loss": 0.29227665066719055, + "num_tokens": 432981468.0, + "step": 479 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.2560064424004792, + "learning_rate": 6.503826829734035e-06, + "loss": 0.2897188663482666, + "num_tokens": 433892112.0, + "step": 480 + }, + { + "epoch": 3.58955223880597, + "grad_norm": 0.25775269914474314, + "learning_rate": 6.456534471386594e-06, + "loss": 0.2899354100227356, + "num_tokens": 434806511.0, + "step": 481 + }, + { + "epoch": 3.5970149253731343, + "grad_norm": 0.2763280796455374, + "learning_rate": 6.409516544189322e-06, + "loss": 0.294207900762558, + "num_tokens": 435709840.0, + "step": 482 + }, + { + "epoch": 3.6044776119402986, + "grad_norm": 0.26039260807281084, + "learning_rate": 6.362774332876438e-06, + "loss": 0.2990114390850067, + "num_tokens": 436640115.0, + "step": 483 + }, + { + "epoch": 3.611940298507463, + "grad_norm": 0.2527009190257145, + "learning_rate": 6.316309114648409e-06, + "loss": 0.2699679732322693, + "num_tokens": 437494545.0, + "step": 484 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.2736279490033172, + "learning_rate": 6.270122159137033e-06, + "loss": 0.2987067401409149, + "num_tokens": 438288906.0, + "step": 485 + }, + { + "epoch": 3.626865671641791, + "grad_norm": 0.2789173799100359, + "learning_rate": 6.2242147283707714e-06, + "loss": 0.3188440203666687, + "num_tokens": 439102139.0, + "step": 486 + }, + { + "epoch": 3.6343283582089554, + "grad_norm": 0.24253033921125058, + "learning_rate": 6.178588076740253e-06, + "loss": 0.2938775420188904, + "num_tokens": 439996247.0, + "step": 487 + }, + { + "epoch": 3.6417910447761193, + "grad_norm": 0.2638488313889946, + "learning_rate": 6.133243450964005e-06, + "loss": 0.299264132976532, + "num_tokens": 440863036.0, + "step": 488 + }, + { + "epoch": 3.6492537313432836, + "grad_norm": 0.2862685346779505, + "learning_rate": 6.088182090054364e-06, + "loss": 0.29607367515563965, + "num_tokens": 441618331.0, + "step": 489 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.27289917809516917, + "learning_rate": 6.043405225283654e-06, + "loss": 0.2921777367591858, + "num_tokens": 442361717.0, + "step": 490 + }, + { + "epoch": 3.664179104477612, + "grad_norm": 0.2537367234076187, + "learning_rate": 5.998914080150525e-06, + "loss": 0.2836867570877075, + "num_tokens": 443313769.0, + "step": 491 + }, + { + "epoch": 3.671641791044776, + "grad_norm": 0.26170955211126384, + "learning_rate": 5.9547098703465215e-06, + "loss": 0.30563318729400635, + "num_tokens": 444314596.0, + "step": 492 + }, + { + "epoch": 3.6791044776119404, + "grad_norm": 0.2693301068014662, + "learning_rate": 5.910793803722873e-06, + "loss": 0.29311275482177734, + "num_tokens": 445237263.0, + "step": 493 + }, + { + "epoch": 3.6865671641791042, + "grad_norm": 0.27762349435994677, + "learning_rate": 5.867167080257471e-06, + "loss": 0.29791638255119324, + "num_tokens": 446151590.0, + "step": 494 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.2701314496245139, + "learning_rate": 5.823830892022107e-06, + "loss": 0.3165101408958435, + "num_tokens": 447040490.0, + "step": 495 + }, + { + "epoch": 3.701492537313433, + "grad_norm": 0.27274243095008927, + "learning_rate": 5.780786423149879e-06, + "loss": 0.32390397787094116, + "num_tokens": 447930938.0, + "step": 496 + }, + { + "epoch": 3.708955223880597, + "grad_norm": 0.3607925974135692, + "learning_rate": 5.738034849802852e-06, + "loss": 0.2941335439682007, + "num_tokens": 448795073.0, + "step": 497 + }, + { + "epoch": 3.716417910447761, + "grad_norm": 0.27114314101622733, + "learning_rate": 5.695577340139905e-06, + "loss": 0.29179757833480835, + "num_tokens": 449748272.0, + "step": 498 + }, + { + "epoch": 3.7238805970149254, + "grad_norm": 0.2676978851481763, + "learning_rate": 5.653415054284816e-06, + "loss": 0.30068930983543396, + "num_tokens": 450716521.0, + "step": 499 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.26294336472483293, + "learning_rate": 5.611549144294568e-06, + "loss": 0.2907962203025818, + "num_tokens": 451536750.0, + "step": 500 + }, + { + "epoch": 3.7388059701492535, + "grad_norm": 0.2546485590984235, + "learning_rate": 5.569980754127872e-06, + "loss": 0.2873173952102661, + "num_tokens": 452509967.0, + "step": 501 + }, + { + "epoch": 3.746268656716418, + "grad_norm": 0.24938478382421467, + "learning_rate": 5.5287110196138985e-06, + "loss": 0.2843964993953705, + "num_tokens": 453499953.0, + "step": 502 + }, + { + "epoch": 3.753731343283582, + "grad_norm": 0.24634142699097625, + "learning_rate": 5.487741068421242e-06, + "loss": 0.295748770236969, + "num_tokens": 454428619.0, + "step": 503 + }, + { + "epoch": 3.7611940298507465, + "grad_norm": 0.24888257131984212, + "learning_rate": 5.447072020027122e-06, + "loss": 0.2946910858154297, + "num_tokens": 455343533.0, + "step": 504 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.2368545844700678, + "learning_rate": 5.406704985686782e-06, + "loss": 0.27735936641693115, + "num_tokens": 456246016.0, + "step": 505 + }, + { + "epoch": 3.7761194029850746, + "grad_norm": 0.27241818855184635, + "learning_rate": 5.366641068403126e-06, + "loss": 0.3016122579574585, + "num_tokens": 457104506.0, + "step": 506 + }, + { + "epoch": 3.783582089552239, + "grad_norm": 0.2590785342630335, + "learning_rate": 5.326881362896588e-06, + "loss": 0.3151727020740509, + "num_tokens": 458003785.0, + "step": 507 + }, + { + "epoch": 3.791044776119403, + "grad_norm": 0.25242642100086654, + "learning_rate": 5.287426955575205e-06, + "loss": 0.2941104769706726, + "num_tokens": 458840614.0, + "step": 508 + }, + { + "epoch": 3.798507462686567, + "grad_norm": 0.25216547714604487, + "learning_rate": 5.24827892450494e-06, + "loss": 0.28807011246681213, + "num_tokens": 459707587.0, + "step": 509 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.23923122289508578, + "learning_rate": 5.209438339380242e-06, + "loss": 0.2823304533958435, + "num_tokens": 460629686.0, + "step": 510 + }, + { + "epoch": 3.8134328358208958, + "grad_norm": 0.2657855590710968, + "learning_rate": 5.170906261494776e-06, + "loss": 0.2919255197048187, + "num_tokens": 461544147.0, + "step": 511 + }, + { + "epoch": 3.8208955223880596, + "grad_norm": 0.2626472514066274, + "learning_rate": 5.132683743712462e-06, + "loss": 0.29430970549583435, + "num_tokens": 462477850.0, + "step": 512 + }, + { + "epoch": 3.828358208955224, + "grad_norm": 0.2566886419848628, + "learning_rate": 5.094771830438689e-06, + "loss": 0.2987692952156067, + "num_tokens": 463412060.0, + "step": 513 + }, + { + "epoch": 3.835820895522388, + "grad_norm": 0.2484881857541934, + "learning_rate": 5.057171557591777e-06, + "loss": 0.2915360927581787, + "num_tokens": 464308740.0, + "step": 514 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.24381344201474844, + "learning_rate": 5.019883952574686e-06, + "loss": 0.28436267375946045, + "num_tokens": 465265384.0, + "step": 515 + }, + { + "epoch": 3.8507462686567164, + "grad_norm": 0.2481908860876439, + "learning_rate": 4.98291003424691e-06, + "loss": 0.28611573576927185, + "num_tokens": 466226494.0, + "step": 516 + }, + { + "epoch": 3.8582089552238807, + "grad_norm": 0.23871319999253146, + "learning_rate": 4.946250812896678e-06, + "loss": 0.2998065948486328, + "num_tokens": 467259239.0, + "step": 517 + }, + { + "epoch": 3.8656716417910446, + "grad_norm": 0.24799658125418186, + "learning_rate": 4.909907290213321e-06, + "loss": 0.2929803729057312, + "num_tokens": 468142586.0, + "step": 518 + }, + { + "epoch": 3.873134328358209, + "grad_norm": 0.2570907705948353, + "learning_rate": 4.873880459259913e-06, + "loss": 0.2957007884979248, + "num_tokens": 468951581.0, + "step": 519 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.2616713052030643, + "learning_rate": 4.838171304446129e-06, + "loss": 0.3021651804447174, + "num_tokens": 469861165.0, + "step": 520 + }, + { + "epoch": 3.888059701492537, + "grad_norm": 0.3010016938124609, + "learning_rate": 4.80278080150135e-06, + "loss": 0.308903306722641, + "num_tokens": 470804718.0, + "step": 521 + }, + { + "epoch": 3.8955223880597014, + "grad_norm": 0.24926265212747228, + "learning_rate": 4.767709917448009e-06, + "loss": 0.30023178458213806, + "num_tokens": 471749228.0, + "step": 522 + }, + { + "epoch": 3.9029850746268657, + "grad_norm": 0.24761743456611565, + "learning_rate": 4.732959610575154e-06, + "loss": 0.2946227788925171, + "num_tokens": 472698930.0, + "step": 523 + }, + { + "epoch": 3.91044776119403, + "grad_norm": 0.22825850031566985, + "learning_rate": 4.698530830412276e-06, + "loss": 0.2835308611392975, + "num_tokens": 473565553.0, + "step": 524 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.25239434326625193, + "learning_rate": 4.664424517703353e-06, + "loss": 0.3003775477409363, + "num_tokens": 474425434.0, + "step": 525 + }, + { + "epoch": 3.925373134328358, + "grad_norm": 0.24946733272255223, + "learning_rate": 4.630641604381151e-06, + "loss": 0.3032747507095337, + "num_tokens": 475400550.0, + "step": 526 + }, + { + "epoch": 3.9328358208955225, + "grad_norm": 0.23900706286857004, + "learning_rate": 4.597183013541764e-06, + "loss": 0.3009137809276581, + "num_tokens": 476322074.0, + "step": 527 + }, + { + "epoch": 3.9402985074626864, + "grad_norm": 0.23647944717460595, + "learning_rate": 4.564049659419379e-06, + "loss": 0.2712666392326355, + "num_tokens": 477127686.0, + "step": 528 + }, + { + "epoch": 3.9477611940298507, + "grad_norm": 0.24461158477231615, + "learning_rate": 4.531242447361308e-06, + "loss": 0.2808017432689667, + "num_tokens": 477992768.0, + "step": 529 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.25868694964779676, + "learning_rate": 4.498762273803233e-06, + "loss": 0.3064419627189636, + "num_tokens": 478818611.0, + "step": 530 + }, + { + "epoch": 3.9626865671641793, + "grad_norm": 0.2384742960079164, + "learning_rate": 4.4666100262447335e-06, + "loss": 0.28597795963287354, + "num_tokens": 479757992.0, + "step": 531 + }, + { + "epoch": 3.970149253731343, + "grad_norm": 0.23968331135860904, + "learning_rate": 4.434786583225018e-06, + "loss": 0.28608185052871704, + "num_tokens": 480686770.0, + "step": 532 + }, + { + "epoch": 3.9776119402985075, + "grad_norm": 0.23668555789215315, + "learning_rate": 4.403292814298932e-06, + "loss": 0.2850901782512665, + "num_tokens": 481556474.0, + "step": 533 + }, + { + "epoch": 3.9850746268656714, + "grad_norm": 0.25182124727383254, + "learning_rate": 4.372129580013179e-06, + "loss": 0.29344847798347473, + "num_tokens": 482402398.0, + "step": 534 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.26240005117001564, + "learning_rate": 4.341297731882833e-06, + "loss": 0.28991544246673584, + "num_tokens": 483144226.0, + "step": 535 + }, + { + "epoch": 4.0, + "grad_norm": 0.2394553338374243, + "learning_rate": 4.31079811236805e-06, + "loss": 0.28979605436325073, + "num_tokens": 484171179.0, + "step": 536 + }, + { + "epoch": 4.007462686567164, + "grad_norm": 0.33867608284366874, + "learning_rate": 4.280631554851052e-06, + "loss": 0.261859267950058, + "num_tokens": 484964422.0, + "step": 537 + }, + { + "epoch": 4.014925373134329, + "grad_norm": 0.3387690029521035, + "learning_rate": 4.250798883613371e-06, + "loss": 0.258260041475296, + "num_tokens": 485911398.0, + "step": 538 + }, + { + "epoch": 4.022388059701493, + "grad_norm": 0.31295687889359947, + "learning_rate": 4.221300913813297e-06, + "loss": 0.26438719034194946, + "num_tokens": 486765516.0, + "step": 539 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.2533693528515132, + "learning_rate": 4.192138451463637e-06, + "loss": 0.26276901364326477, + "num_tokens": 487755450.0, + "step": 540 + }, + { + "epoch": 4.037313432835821, + "grad_norm": 0.27941924345032165, + "learning_rate": 4.163312293409668e-06, + "loss": 0.2743380069732666, + "num_tokens": 488596501.0, + "step": 541 + }, + { + "epoch": 4.044776119402985, + "grad_norm": 0.3119420012284113, + "learning_rate": 4.134823227307376e-06, + "loss": 0.27551499009132385, + "num_tokens": 489333987.0, + "step": 542 + }, + { + "epoch": 4.052238805970149, + "grad_norm": 0.3376015245186099, + "learning_rate": 4.1066720316019176e-06, + "loss": 0.2677218019962311, + "num_tokens": 490271866.0, + "step": 543 + }, + { + "epoch": 4.059701492537314, + "grad_norm": 0.31476158712266056, + "learning_rate": 4.0788594755063754e-06, + "loss": 0.2655893564224243, + "num_tokens": 491167672.0, + "step": 544 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.2801726675903428, + "learning_rate": 4.051386318980717e-06, + "loss": 0.2636064291000366, + "num_tokens": 492117374.0, + "step": 545 + }, + { + "epoch": 4.074626865671641, + "grad_norm": 0.2642554446164588, + "learning_rate": 4.024253312711041e-06, + "loss": 0.2632978558540344, + "num_tokens": 493064577.0, + "step": 546 + }, + { + "epoch": 4.082089552238806, + "grad_norm": 0.2340921814939966, + "learning_rate": 3.99746119808906e-06, + "loss": 0.2561931908130646, + "num_tokens": 494008196.0, + "step": 547 + }, + { + "epoch": 4.08955223880597, + "grad_norm": 0.24746722464151832, + "learning_rate": 3.971010707191848e-06, + "loss": 0.2665466368198395, + "num_tokens": 495010032.0, + "step": 548 + }, + { + "epoch": 4.097014925373134, + "grad_norm": 0.28750263458503306, + "learning_rate": 3.9449025627618256e-06, + "loss": 0.2657792568206787, + "num_tokens": 495771485.0, + "step": 549 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.25981920943022424, + "learning_rate": 3.919137478187027e-06, + "loss": 0.2730734050273895, + "num_tokens": 496704001.0, + "step": 550 + }, + { + "epoch": 4.111940298507463, + "grad_norm": 0.26506589650257595, + "learning_rate": 3.893716157481598e-06, + "loss": 0.26241227984428406, + "num_tokens": 497580217.0, + "step": 551 + }, + { + "epoch": 4.119402985074627, + "grad_norm": 0.28902536946390145, + "learning_rate": 3.868639295266562e-06, + "loss": 0.27827292680740356, + "num_tokens": 498399947.0, + "step": 552 + }, + { + "epoch": 4.126865671641791, + "grad_norm": 0.2305613889202318, + "learning_rate": 3.8439075767508304e-06, + "loss": 0.25871434807777405, + "num_tokens": 499337510.0, + "step": 553 + }, + { + "epoch": 4.134328358208955, + "grad_norm": 0.2543579464580596, + "learning_rate": 3.819521677712498e-06, + "loss": 0.26276665925979614, + "num_tokens": 500211058.0, + "step": 554 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.2603404639875204, + "learning_rate": 3.7954822644803612e-06, + "loss": 0.27976810932159424, + "num_tokens": 501239171.0, + "step": 555 + }, + { + "epoch": 4.149253731343284, + "grad_norm": 0.24399135581961698, + "learning_rate": 3.7717899939157227e-06, + "loss": 0.2695601284503937, + "num_tokens": 502320140.0, + "step": 556 + }, + { + "epoch": 4.156716417910448, + "grad_norm": 0.2506194739658917, + "learning_rate": 3.748445513394432e-06, + "loss": 0.2601467967033386, + "num_tokens": 503200601.0, + "step": 557 + }, + { + "epoch": 4.164179104477612, + "grad_norm": 0.24780405901365382, + "learning_rate": 3.7254494607892062e-06, + "loss": 0.2658926248550415, + "num_tokens": 504111915.0, + "step": 558 + }, + { + "epoch": 4.1716417910447765, + "grad_norm": 0.25277522040944933, + "learning_rate": 3.7028024644521974e-06, + "loss": 0.26618829369544983, + "num_tokens": 951575.0, + "step": 559 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.260926277591076, + "learning_rate": 3.6805051431978215e-06, + "loss": 0.2764492630958557, + "num_tokens": 1870368.0, + "step": 560 + }, + { + "epoch": 4.186567164179104, + "grad_norm": 0.24151138917904563, + "learning_rate": 3.6585581062858515e-06, + "loss": 0.26785239577293396, + "num_tokens": 2827046.0, + "step": 561 + }, + { + "epoch": 4.1940298507462686, + "grad_norm": 0.24384225850500896, + "learning_rate": 3.636961953404763e-06, + "loss": 0.26912403106689453, + "num_tokens": 3739973.0, + "step": 562 + }, + { + "epoch": 4.201492537313433, + "grad_norm": 0.2916626614705674, + "learning_rate": 3.615717274655364e-06, + "loss": 0.26528483629226685, + "num_tokens": 4518704.0, + "step": 563 + }, + { + "epoch": 4.208955223880597, + "grad_norm": 0.24960926879350168, + "learning_rate": 3.5948246505346537e-06, + "loss": 0.27783459424972534, + "num_tokens": 5501253.0, + "step": 564 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.25681267819662723, + "learning_rate": 3.5742846519199715e-06, + "loss": 0.27307459712028503, + "num_tokens": 6402302.0, + "step": 565 + }, + { + "epoch": 4.223880597014926, + "grad_norm": 0.2412629050166804, + "learning_rate": 3.5540978400533933e-06, + "loss": 0.264928936958313, + "num_tokens": 7296048.0, + "step": 566 + }, + { + "epoch": 4.231343283582089, + "grad_norm": 0.26007426064530514, + "learning_rate": 3.5342647665263963e-06, + "loss": 0.27285411953926086, + "num_tokens": 8246217.0, + "step": 567 + }, + { + "epoch": 4.2388059701492535, + "grad_norm": 0.2505447033271199, + "learning_rate": 3.514785973264789e-06, + "loss": 0.2539595663547516, + "num_tokens": 9030493.0, + "step": 568 + }, + { + "epoch": 4.246268656716418, + "grad_norm": 0.24939677987959621, + "learning_rate": 3.495661992513905e-06, + "loss": 0.273257315158844, + "num_tokens": 9936844.0, + "step": 569 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.25283551407816135, + "learning_rate": 3.476893346824055e-06, + "loss": 0.2572386562824249, + "num_tokens": 10836976.0, + "step": 570 + }, + { + "epoch": 4.2611940298507465, + "grad_norm": 0.25014049995931353, + "learning_rate": 3.4584805490362493e-06, + "loss": 0.27239200472831726, + "num_tokens": 11812223.0, + "step": 571 + }, + { + "epoch": 4.268656716417911, + "grad_norm": 0.2565763851261565, + "learning_rate": 3.4404241022681873e-06, + "loss": 0.26448339223861694, + "num_tokens": 12615614.0, + "step": 572 + }, + { + "epoch": 4.276119402985074, + "grad_norm": 0.25289117775054565, + "learning_rate": 3.42272449990051e-06, + "loss": 0.29063016176223755, + "num_tokens": 13567548.0, + "step": 573 + }, + { + "epoch": 4.2835820895522385, + "grad_norm": 0.25823345228475075, + "learning_rate": 3.40538222556332e-06, + "loss": 0.27311235666275024, + "num_tokens": 14395131.0, + "step": 574 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.23315641988846117, + "learning_rate": 3.388397753122957e-06, + "loss": 0.25236693024635315, + "num_tokens": 15335598.0, + "step": 575 + }, + { + "epoch": 4.298507462686567, + "grad_norm": 0.2841401512615274, + "learning_rate": 3.3717715466690624e-06, + "loss": 0.2869318723678589, + "num_tokens": 16179341.0, + "step": 576 + }, + { + "epoch": 4.3059701492537314, + "grad_norm": 0.25632145802021455, + "learning_rate": 3.3555040605018935e-06, + "loss": 0.26220396161079407, + "num_tokens": 16988671.0, + "step": 577 + }, + { + "epoch": 4.313432835820896, + "grad_norm": 0.26924823560517036, + "learning_rate": 3.339595739119909e-06, + "loss": 0.28524714708328247, + "num_tokens": 17818903.0, + "step": 578 + }, + { + "epoch": 4.32089552238806, + "grad_norm": 0.24597376079056055, + "learning_rate": 3.3240470172076226e-06, + "loss": 0.25928568840026855, + "num_tokens": 18686514.0, + "step": 579 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.2296054299641554, + "learning_rate": 3.3088583196237253e-06, + "loss": 0.2673494219779968, + "num_tokens": 19710461.0, + "step": 580 + }, + { + "epoch": 4.335820895522388, + "grad_norm": 0.303772974273409, + "learning_rate": 3.294030061389481e-06, + "loss": 0.29324933886528015, + "num_tokens": 20505162.0, + "step": 581 + }, + { + "epoch": 4.343283582089552, + "grad_norm": 0.24075458576098716, + "learning_rate": 3.2795626476773833e-06, + "loss": 0.2494013011455536, + "num_tokens": 21440460.0, + "step": 582 + }, + { + "epoch": 4.350746268656716, + "grad_norm": 0.26414061441007297, + "learning_rate": 3.2654564738000822e-06, + "loss": 0.28142672777175903, + "num_tokens": 22250398.0, + "step": 583 + }, + { + "epoch": 4.358208955223881, + "grad_norm": 0.22985556392550052, + "learning_rate": 3.2517119251995873e-06, + "loss": 0.2574723958969116, + "num_tokens": 23184740.0, + "step": 584 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.2340946834808678, + "learning_rate": 3.2383293774367286e-06, + "loss": 0.262751042842865, + "num_tokens": 24111398.0, + "step": 585 + }, + { + "epoch": 4.373134328358209, + "grad_norm": 0.2622850655813035, + "learning_rate": 3.225309196180906e-06, + "loss": 0.26962852478027344, + "num_tokens": 24935442.0, + "step": 586 + }, + { + "epoch": 4.380597014925373, + "grad_norm": 0.30896829245037205, + "learning_rate": 3.212651737200086e-06, + "loss": 0.2718137502670288, + "num_tokens": 25850666.0, + "step": 587 + }, + { + "epoch": 4.388059701492537, + "grad_norm": 0.24656349669035904, + "learning_rate": 3.200357346351084e-06, + "loss": 0.2535630166530609, + "num_tokens": 26632303.0, + "step": 588 + }, + { + "epoch": 4.395522388059701, + "grad_norm": 0.2478490440504693, + "learning_rate": 3.188426359570121e-06, + "loss": 0.2648570239543915, + "num_tokens": 27523524.0, + "step": 589 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.2467537368918543, + "learning_rate": 3.176859102863631e-06, + "loss": 0.268078088760376, + "num_tokens": 28364038.0, + "step": 590 + }, + { + "epoch": 4.41044776119403, + "grad_norm": 0.24022496953126724, + "learning_rate": 3.16565589229937e-06, + "loss": 0.2637268900871277, + "num_tokens": 29254874.0, + "step": 591 + }, + { + "epoch": 4.417910447761194, + "grad_norm": 0.25520142610516455, + "learning_rate": 3.1548170339977626e-06, + "loss": 0.27608251571655273, + "num_tokens": 30099118.0, + "step": 592 + }, + { + "epoch": 4.425373134328359, + "grad_norm": 0.25418541542220713, + "learning_rate": 3.144342824123548e-06, + "loss": 0.27403631806373596, + "num_tokens": 30937080.0, + "step": 593 + }, + { + "epoch": 4.432835820895522, + "grad_norm": 0.32216925679050706, + "learning_rate": 3.134233548877684e-06, + "loss": 0.2749292850494385, + "num_tokens": 31868459.0, + "step": 594 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.23633842342693723, + "learning_rate": 3.1244894844895307e-06, + "loss": 0.26009055972099304, + "num_tokens": 32844776.0, + "step": 595 + }, + { + "epoch": 4.447761194029851, + "grad_norm": 0.23167846669851885, + "learning_rate": 3.115110897209297e-06, + "loss": 0.25624188780784607, + "num_tokens": 33800215.0, + "step": 596 + }, + { + "epoch": 4.455223880597015, + "grad_norm": 0.31853695227310724, + "learning_rate": 3.1060980433007674e-06, + "loss": 0.26650676131248474, + "num_tokens": 34652575.0, + "step": 597 + }, + { + "epoch": 4.462686567164179, + "grad_norm": 0.2474619333740578, + "learning_rate": 3.0974511690342995e-06, + "loss": 0.26506173610687256, + "num_tokens": 35526076.0, + "step": 598 + }, + { + "epoch": 4.470149253731344, + "grad_norm": 0.2379051889253177, + "learning_rate": 3.089170510680101e-06, + "loss": 0.2590046525001526, + "num_tokens": 36465383.0, + "step": 599 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.24103925822435351, + "learning_rate": 3.0812562945017625e-06, + "loss": 0.26156845688819885, + "num_tokens": 37402609.0, + "step": 600 + }, + { + "epoch": 4.485074626865671, + "grad_norm": 0.24478825299983173, + "learning_rate": 3.0737087367500848e-06, + "loss": 0.26436761021614075, + "num_tokens": 38372549.0, + "step": 601 + }, + { + "epoch": 4.492537313432836, + "grad_norm": 0.25588847680085197, + "learning_rate": 3.066528043657163e-06, + "loss": 0.2649264335632324, + "num_tokens": 39258770.0, + "step": 602 + }, + { + "epoch": 4.5, + "grad_norm": 0.2462865755078873, + "learning_rate": 3.0597144114307577e-06, + "loss": 0.2759783864021301, + "num_tokens": 40167992.0, + "step": 603 + }, + { + "epoch": 4.507462686567164, + "grad_norm": 0.24099167658546947, + "learning_rate": 3.0532680262489272e-06, + "loss": 0.2647096812725067, + "num_tokens": 41103593.0, + "step": 604 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.26228388440102607, + "learning_rate": 3.047189064254947e-06, + "loss": 0.2846449017524719, + "num_tokens": 41964920.0, + "step": 605 + }, + { + "epoch": 4.522388059701493, + "grad_norm": 0.24399175109648016, + "learning_rate": 3.0414776915524926e-06, + "loss": 0.2578504979610443, + "num_tokens": 42832698.0, + "step": 606 + }, + { + "epoch": 4.529850746268656, + "grad_norm": 0.24557003170358038, + "learning_rate": 3.0361340642010974e-06, + "loss": 0.2687520980834961, + "num_tokens": 43751841.0, + "step": 607 + }, + { + "epoch": 4.537313432835821, + "grad_norm": 0.2457169444504286, + "learning_rate": 3.0311583282119004e-06, + "loss": 0.2654935121536255, + "num_tokens": 44670570.0, + "step": 608 + }, + { + "epoch": 4.544776119402985, + "grad_norm": 0.23344187463481425, + "learning_rate": 3.026550619543641e-06, + "loss": 0.2680796980857849, + "num_tokens": 45565349.0, + "step": 609 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.2616978203803085, + "learning_rate": 3.0223110640989607e-06, + "loss": 0.2733978033065796, + "num_tokens": 46334877.0, + "step": 610 + }, + { + "epoch": 4.559701492537314, + "grad_norm": 0.24402710769793126, + "learning_rate": 3.0184397777209436e-06, + "loss": 0.26678377389907837, + "num_tokens": 47197933.0, + "step": 611 + }, + { + "epoch": 4.567164179104478, + "grad_norm": 0.23596997365787184, + "learning_rate": 3.0149368661899707e-06, + "loss": 0.2666507959365845, + "num_tokens": 48185966.0, + "step": 612 + }, + { + "epoch": 4.574626865671641, + "grad_norm": 0.26072871992164276, + "learning_rate": 3.0118024252208146e-06, + "loss": 0.2727803587913513, + "num_tokens": 49053041.0, + "step": 613 + }, + { + "epoch": 4.582089552238806, + "grad_norm": 0.2429680853323204, + "learning_rate": 3.0090365404600324e-06, + "loss": 0.27436989545822144, + "num_tokens": 49972669.0, + "step": 614 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.2492201703405157, + "learning_rate": 3.0066392874836254e-06, + "loss": 0.2650463581085205, + "num_tokens": 50759258.0, + "step": 615 + }, + { + "epoch": 4.597014925373134, + "grad_norm": 0.23159753484897908, + "learning_rate": 3.004610731794965e-06, + "loss": 0.2537558376789093, + "num_tokens": 51687796.0, + "step": 616 + }, + { + "epoch": 4.604477611940299, + "grad_norm": 0.23805277832433672, + "learning_rate": 3.002950928823016e-06, + "loss": 0.26197919249534607, + "num_tokens": 52660231.0, + "step": 617 + }, + { + "epoch": 4.611940298507463, + "grad_norm": 0.24026810337813148, + "learning_rate": 3.001659923920811e-06, + "loss": 0.2531256675720215, + "num_tokens": 53529194.0, + "step": 618 + }, + { + "epoch": 4.619402985074627, + "grad_norm": 0.258077064890661, + "learning_rate": 3.0007377523642196e-06, + "loss": 0.26511213183403015, + "num_tokens": 54455687.0, + "step": 619 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.24117669708783973, + "learning_rate": 3.0001844393509754e-06, + "loss": 0.2814059257507324, + "num_tokens": 55475962.0, + "step": 620 + }, + { + "epoch": 4.6268656716417915, + "step": 620, + "total_flos": 829282868854784.0, + "train_loss": 0.02679153286641644, + "train_runtime": 1845.6941, + "train_samples_per_second": 10.749, + "train_steps_per_second": 0.336 + } + ], + "logging_steps": 1, + "max_steps": 620, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 62, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 829282868854784.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..6091534 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:453ef576ce6078cc7a95b46c79bd60b61209672d5f12f8120656da9c1a677eba +size 7633