commit f5c3f6a26c3f90d063243193e8fb105c75e632cd Author: ModelHub XC Date: Sat Jun 20 17:47:31 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: edbeeching/Qwen3-4B-Instruct-2507-SFT-tr5 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..bf90504 --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +--- +base_model: Qwen/Qwen3-4B-Instruct-2507 +library_name: transformers +model_name: Qwen3-4B-Instruct-2507-SFT-tr5 +tags: +- generated_from_trainer +- sft +- trackio:https://huggingface.co/spaces/hf-imo-colab/trackio-distillation-sft +- trackio +- trl +- trl-internal +licence: license +--- + +# Model Card for Qwen3-4B-Instruct-2507-SFT-tr5 + +This model is a fine-tuned version of [Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="edbeeching/Qwen3-4B-Instruct-2507-SFT-tr5", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/huggingface/imo-distillation/runs/menw08rt) + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.27.0.dev0 +- Transformers: 5.3.0.dev0 +- Pytorch: 2.10.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..6a736f5 --- /dev/null +++ b/all_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 4.6268656716417915, + "total_flos": 829937030004736.0, + "train_loss": 0.4202386662844689, + "train_runtime": 18585.0074, + "train_samples": 4281, + "train_samples_per_second": 1.068, + "train_steps_per_second": 0.033 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..70adff8 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,61 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e406718 --- /dev/null +++ b/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 262144, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 5000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.3.0.dev0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..2df8b40 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "do_sample": true, + "eos_token_id": 151645, + "pad_token_id": 151643, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "5.3.0.dev0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..1cc0da6 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f48cc1c0bb7c443abca2a6c7afef30c5b7e16cb0af9657bb269183c49ca76a76 +size 8044982080 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..63e1b5b --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 1010000, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..6a736f5 --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 4.6268656716417915, + "total_flos": 829937030004736.0, + "train_loss": 0.4202386662844689, + "train_runtime": 18585.0074, + "train_samples": 4281, + "train_samples_per_second": 1.068, + "train_steps_per_second": 0.033 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..bd78d9f --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,5003 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.6268656716417915, + "eval_steps": 500, + "global_step": 620, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007462686567164179, + "grad_norm": 11.35859680736035, + "learning_rate": 0.0, + "loss": 1.047095537185669, + "num_tokens": 940173.0, + "step": 1 + }, + { + "epoch": 0.014925373134328358, + "grad_norm": 11.310520487616877, + "learning_rate": 5.263157894736843e-07, + "loss": 1.0946075916290283, + "num_tokens": 1940908.0, + "step": 2 + }, + { + "epoch": 0.022388059701492536, + "grad_norm": 11.106569322922516, + "learning_rate": 1.0526315789473685e-06, + "loss": 1.0278105735778809, + "num_tokens": 2857302.0, + "step": 3 + }, + { + "epoch": 0.029850746268656716, + "grad_norm": 10.881054443812134, + "learning_rate": 1.5789473684210526e-06, + "loss": 1.0398736000061035, + "num_tokens": 3696299.0, + "step": 4 + }, + { + "epoch": 0.03731343283582089, + "grad_norm": 10.448295115598174, + "learning_rate": 2.105263157894737e-06, + "loss": 1.0615425109863281, + "num_tokens": 4528104.0, + "step": 5 + }, + { + "epoch": 0.04477611940298507, + "grad_norm": 10.151241780828355, + "learning_rate": 2.631578947368421e-06, + "loss": 1.0268486738204956, + "num_tokens": 5554518.0, + "step": 6 + }, + { + "epoch": 0.05223880597014925, + "grad_norm": 8.119312484055971, + "learning_rate": 3.157894736842105e-06, + "loss": 0.9329569935798645, + "num_tokens": 6422948.0, + "step": 7 + }, + { + "epoch": 0.05970149253731343, + "grad_norm": 7.409758964343402, + "learning_rate": 3.6842105263157896e-06, + "loss": 0.8917287588119507, + "num_tokens": 7201431.0, + "step": 8 + }, + { + "epoch": 0.06716417910447761, + "grad_norm": 5.971479536888058, + "learning_rate": 4.210526315789474e-06, + "loss": 0.8006043434143066, + "num_tokens": 8128474.0, + "step": 9 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 3.4445244902185927, + "learning_rate": 4.736842105263158e-06, + "loss": 0.7708431482315063, + "num_tokens": 9073762.0, + "step": 10 + }, + { + "epoch": 0.08208955223880597, + "grad_norm": 2.227913040407572, + "learning_rate": 5.263157894736842e-06, + "loss": 0.689713716506958, + "num_tokens": 9950348.0, + "step": 11 + }, + { + "epoch": 0.08955223880597014, + "grad_norm": 1.8665254369252244, + "learning_rate": 5.789473684210527e-06, + "loss": 0.7132350206375122, + "num_tokens": 10884740.0, + "step": 12 + }, + { + "epoch": 0.09701492537313433, + "grad_norm": 2.952404437976229, + "learning_rate": 6.31578947368421e-06, + "loss": 0.713362455368042, + "num_tokens": 11697616.0, + "step": 13 + }, + { + "epoch": 0.1044776119402985, + "grad_norm": 2.826605099421276, + "learning_rate": 6.842105263157896e-06, + "loss": 0.6958507895469666, + "num_tokens": 12632232.0, + "step": 14 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 2.4454572403082926, + "learning_rate": 7.368421052631579e-06, + "loss": 0.6733378171920776, + "num_tokens": 13568493.0, + "step": 15 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 2.0537063830263924, + "learning_rate": 7.894736842105265e-06, + "loss": 0.6741904020309448, + "num_tokens": 14533820.0, + "step": 16 + }, + { + "epoch": 0.12686567164179105, + "grad_norm": 1.4727507656008452, + "learning_rate": 8.421052631578948e-06, + "loss": 0.6536232829093933, + "num_tokens": 15435498.0, + "step": 17 + }, + { + "epoch": 0.13432835820895522, + "grad_norm": 1.054376608380898, + "learning_rate": 8.947368421052632e-06, + "loss": 0.6000441312789917, + "num_tokens": 16351791.0, + "step": 18 + }, + { + "epoch": 0.1417910447761194, + "grad_norm": 0.9835940111044099, + "learning_rate": 9.473684210526315e-06, + "loss": 0.6027337312698364, + "num_tokens": 17276920.0, + "step": 19 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.916308840098788, + "learning_rate": 1e-05, + "loss": 0.6199864149093628, + "num_tokens": 18270172.0, + "step": 20 + }, + { + "epoch": 0.15671641791044777, + "grad_norm": 0.6212633844448718, + "learning_rate": 9.999938520216343e-06, + "loss": 0.5760895609855652, + "num_tokens": 19308005.0, + "step": 21 + }, + { + "epoch": 0.16417910447761194, + "grad_norm": 0.5315615385439493, + "learning_rate": 9.999754082545261e-06, + "loss": 0.5423388481140137, + "num_tokens": 20162217.0, + "step": 22 + }, + { + "epoch": 0.17164179104477612, + "grad_norm": 0.5852277738108399, + "learning_rate": 9.999446692026396e-06, + "loss": 0.5618520975112915, + "num_tokens": 20980497.0, + "step": 23 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 0.5256536336611786, + "learning_rate": 9.999016357058996e-06, + "loss": 0.5482994914054871, + "num_tokens": 21857362.0, + "step": 24 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.436253543862231, + "learning_rate": 9.99846308940168e-06, + "loss": 0.5038638710975647, + "num_tokens": 22792620.0, + "step": 25 + }, + { + "epoch": 0.19402985074626866, + "grad_norm": 0.47872306271108794, + "learning_rate": 9.997786904172126e-06, + "loss": 0.5729074478149414, + "num_tokens": 23723110.0, + "step": 26 + }, + { + "epoch": 0.20149253731343283, + "grad_norm": 0.3887165593913177, + "learning_rate": 9.996987819846656e-06, + "loss": 0.5251473188400269, + "num_tokens": 24725024.0, + "step": 27 + }, + { + "epoch": 0.208955223880597, + "grad_norm": 0.4864210479565411, + "learning_rate": 9.996065858259729e-06, + "loss": 0.560759425163269, + "num_tokens": 25729987.0, + "step": 28 + }, + { + "epoch": 0.21641791044776118, + "grad_norm": 0.4545327828204722, + "learning_rate": 9.995021044603343e-06, + "loss": 0.5304505825042725, + "num_tokens": 26557013.0, + "step": 29 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.369912070212526, + "learning_rate": 9.993853407426353e-06, + "loss": 0.5103640556335449, + "num_tokens": 27503464.0, + "step": 30 + }, + { + "epoch": 0.23134328358208955, + "grad_norm": 0.32843421942348455, + "learning_rate": 9.99256297863368e-06, + "loss": 0.5005761384963989, + "num_tokens": 28533732.0, + "step": 31 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 0.36571377121484666, + "learning_rate": 9.991149793485453e-06, + "loss": 0.5339782238006592, + "num_tokens": 29340667.0, + "step": 32 + }, + { + "epoch": 0.2462686567164179, + "grad_norm": 0.3706600251055638, + "learning_rate": 9.989613890596034e-06, + "loss": 0.5353128910064697, + "num_tokens": 30210961.0, + "step": 33 + }, + { + "epoch": 0.2537313432835821, + "grad_norm": 0.3689913973205178, + "learning_rate": 9.987955311932968e-06, + "loss": 0.5166599750518799, + "num_tokens": 31101886.0, + "step": 34 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.33967789101967927, + "learning_rate": 9.986174102815837e-06, + "loss": 0.5018597841262817, + "num_tokens": 31897310.0, + "step": 35 + }, + { + "epoch": 0.26865671641791045, + "grad_norm": 0.34077171626781105, + "learning_rate": 9.984270311915019e-06, + "loss": 0.48667871952056885, + "num_tokens": 32540943.0, + "step": 36 + }, + { + "epoch": 0.27611940298507465, + "grad_norm": 0.3621091474207233, + "learning_rate": 9.982243991250359e-06, + "loss": 0.5088210105895996, + "num_tokens": 33542067.0, + "step": 37 + }, + { + "epoch": 0.2835820895522388, + "grad_norm": 0.3534080682731624, + "learning_rate": 9.980095196189748e-06, + "loss": 0.4913540482521057, + "num_tokens": 34504224.0, + "step": 38 + }, + { + "epoch": 0.291044776119403, + "grad_norm": 0.34385148887540573, + "learning_rate": 9.977823985447613e-06, + "loss": 0.5291423797607422, + "num_tokens": 35410799.0, + "step": 39 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.3614616882970318, + "learning_rate": 9.975430421083307e-06, + "loss": 0.5238292217254639, + "num_tokens": 36306291.0, + "step": 40 + }, + { + "epoch": 0.30597014925373134, + "grad_norm": 0.34380854428467267, + "learning_rate": 9.972914568499412e-06, + "loss": 0.49555328488349915, + "num_tokens": 37195796.0, + "step": 41 + }, + { + "epoch": 0.31343283582089554, + "grad_norm": 0.32872739996760125, + "learning_rate": 9.970276496439967e-06, + "loss": 0.48128455877304077, + "num_tokens": 38111088.0, + "step": 42 + }, + { + "epoch": 0.3208955223880597, + "grad_norm": 0.32224419409640415, + "learning_rate": 9.967516276988569e-06, + "loss": 0.47381213307380676, + "num_tokens": 38854783.0, + "step": 43 + }, + { + "epoch": 0.3283582089552239, + "grad_norm": 0.313605152437139, + "learning_rate": 9.964633985566412e-06, + "loss": 0.4922352433204651, + "num_tokens": 39832057.0, + "step": 44 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.3221801938329887, + "learning_rate": 9.961629700930236e-06, + "loss": 0.5065716505050659, + "num_tokens": 40758959.0, + "step": 45 + }, + { + "epoch": 0.34328358208955223, + "grad_norm": 0.34336243037288433, + "learning_rate": 9.958503505170158e-06, + "loss": 0.4985169470310211, + "num_tokens": 41744543.0, + "step": 46 + }, + { + "epoch": 0.35074626865671643, + "grad_norm": 0.323405267106758, + "learning_rate": 9.95525548370744e-06, + "loss": 0.4811803996562958, + "num_tokens": 42685398.0, + "step": 47 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 0.3472754733495145, + "learning_rate": 9.951885725292152e-06, + "loss": 0.4971832036972046, + "num_tokens": 43509328.0, + "step": 48 + }, + { + "epoch": 0.3656716417910448, + "grad_norm": 0.30314939517994505, + "learning_rate": 9.948394322000747e-06, + "loss": 0.4676430821418762, + "num_tokens": 44360961.0, + "step": 49 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.3115400700181878, + "learning_rate": 9.944781369233544e-06, + "loss": 0.4450893700122833, + "num_tokens": 45215408.0, + "step": 50 + }, + { + "epoch": 0.3805970149253731, + "grad_norm": 0.3274967224701377, + "learning_rate": 9.941046965712124e-06, + "loss": 0.4661027491092682, + "num_tokens": 46008801.0, + "step": 51 + }, + { + "epoch": 0.3880597014925373, + "grad_norm": 0.3185260501598265, + "learning_rate": 9.937191213476627e-06, + "loss": 0.45998284220695496, + "num_tokens": 46857304.0, + "step": 52 + }, + { + "epoch": 0.39552238805970147, + "grad_norm": 0.3187630499897143, + "learning_rate": 9.933214217882973e-06, + "loss": 0.49932676553726196, + "num_tokens": 47835515.0, + "step": 53 + }, + { + "epoch": 0.40298507462686567, + "grad_norm": 0.3126440220395918, + "learning_rate": 9.929116087599973e-06, + "loss": 0.49588972330093384, + "num_tokens": 48834826.0, + "step": 54 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.31909099806625735, + "learning_rate": 9.924896934606365e-06, + "loss": 0.49547284841537476, + "num_tokens": 49858718.0, + "step": 55 + }, + { + "epoch": 0.417910447761194, + "grad_norm": 0.2999327415505548, + "learning_rate": 9.920556874187757e-06, + "loss": 0.45831602811813354, + "num_tokens": 50784650.0, + "step": 56 + }, + { + "epoch": 0.4253731343283582, + "grad_norm": 0.33478138187870804, + "learning_rate": 9.91609602493347e-06, + "loss": 0.44470953941345215, + "num_tokens": 51788903.0, + "step": 57 + }, + { + "epoch": 0.43283582089552236, + "grad_norm": 0.3098385124963181, + "learning_rate": 9.911514508733307e-06, + "loss": 0.48413345217704773, + "num_tokens": 52740886.0, + "step": 58 + }, + { + "epoch": 0.44029850746268656, + "grad_norm": 0.31570000266376347, + "learning_rate": 9.906812450774207e-06, + "loss": 0.5016104578971863, + "num_tokens": 53671576.0, + "step": 59 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.3184241179650494, + "learning_rate": 9.901989979536841e-06, + "loss": 0.4333784580230713, + "num_tokens": 54565325.0, + "step": 60 + }, + { + "epoch": 0.4552238805970149, + "grad_norm": 0.3257766657124954, + "learning_rate": 9.897047226792093e-06, + "loss": 0.47651222348213196, + "num_tokens": 55458901.0, + "step": 61 + }, + { + "epoch": 0.4626865671641791, + "grad_norm": 0.2817242291155619, + "learning_rate": 9.891984327597462e-06, + "loss": 0.4714818000793457, + "num_tokens": 56519373.0, + "step": 62 + }, + { + "epoch": 0.4701492537313433, + "grad_norm": 0.32585513855646564, + "learning_rate": 9.886801420293365e-06, + "loss": 0.4708700180053711, + "num_tokens": 57420562.0, + "step": 63 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 0.32958409535328365, + "learning_rate": 9.88149864649937e-06, + "loss": 0.49606209993362427, + "num_tokens": 58259052.0, + "step": 64 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.31230811419608556, + "learning_rate": 9.876076151110313e-06, + "loss": 0.4840630888938904, + "num_tokens": 59121922.0, + "step": 65 + }, + { + "epoch": 0.4925373134328358, + "grad_norm": 0.31050271225919246, + "learning_rate": 9.870534082292349e-06, + "loss": 0.4600119888782501, + "num_tokens": 60031785.0, + "step": 66 + }, + { + "epoch": 0.5, + "grad_norm": 0.2885380845506061, + "learning_rate": 9.864872591478895e-06, + "loss": 0.44136810302734375, + "num_tokens": 60972704.0, + "step": 67 + }, + { + "epoch": 0.5074626865671642, + "grad_norm": 0.28887203572406756, + "learning_rate": 9.859091833366498e-06, + "loss": 0.4619043469429016, + "num_tokens": 61912202.0, + "step": 68 + }, + { + "epoch": 0.5149253731343284, + "grad_norm": 0.297913211640831, + "learning_rate": 9.853191965910606e-06, + "loss": 0.48681432008743286, + "num_tokens": 62799081.0, + "step": 69 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.2978081791490928, + "learning_rate": 9.847173150321252e-06, + "loss": 0.4710129499435425, + "num_tokens": 63821360.0, + "step": 70 + }, + { + "epoch": 0.5298507462686567, + "grad_norm": 0.33901428896502994, + "learning_rate": 9.84103555105865e-06, + "loss": 0.46070268750190735, + "num_tokens": 64698236.0, + "step": 71 + }, + { + "epoch": 0.5373134328358209, + "grad_norm": 0.2863724536535567, + "learning_rate": 9.8347793358287e-06, + "loss": 0.43551623821258545, + "num_tokens": 65531533.0, + "step": 72 + }, + { + "epoch": 0.5447761194029851, + "grad_norm": 0.30884498358581325, + "learning_rate": 9.828404675578405e-06, + "loss": 0.43174412846565247, + "num_tokens": 66409682.0, + "step": 73 + }, + { + "epoch": 0.5522388059701493, + "grad_norm": 0.39653106497260543, + "learning_rate": 9.821911744491203e-06, + "loss": 0.47224926948547363, + "num_tokens": 67201739.0, + "step": 74 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.34427781009373076, + "learning_rate": 9.815300719982204e-06, + "loss": 0.46234217286109924, + "num_tokens": 68054610.0, + "step": 75 + }, + { + "epoch": 0.5671641791044776, + "grad_norm": 0.28593313207513976, + "learning_rate": 9.808571782693345e-06, + "loss": 0.4445508122444153, + "num_tokens": 68905436.0, + "step": 76 + }, + { + "epoch": 0.5746268656716418, + "grad_norm": 0.27754253103287374, + "learning_rate": 9.80172511648845e-06, + "loss": 0.4535985291004181, + "num_tokens": 69815159.0, + "step": 77 + }, + { + "epoch": 0.582089552238806, + "grad_norm": 0.2751626726169941, + "learning_rate": 9.794760908448215e-06, + "loss": 0.4778493642807007, + "num_tokens": 70800960.0, + "step": 78 + }, + { + "epoch": 0.5895522388059702, + "grad_norm": 0.2878195146653705, + "learning_rate": 9.787679348865082e-06, + "loss": 0.43559134006500244, + "num_tokens": 71706284.0, + "step": 79 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.3046702186252135, + "learning_rate": 9.780480631238052e-06, + "loss": 0.45745372772216797, + "num_tokens": 72585611.0, + "step": 80 + }, + { + "epoch": 0.6044776119402985, + "grad_norm": 0.2580161347993156, + "learning_rate": 9.773164952267394e-06, + "loss": 0.44172853231430054, + "num_tokens": 73603712.0, + "step": 81 + }, + { + "epoch": 0.6119402985074627, + "grad_norm": 0.31823458045045494, + "learning_rate": 9.765732511849269e-06, + "loss": 0.4543741047382355, + "num_tokens": 74510353.0, + "step": 82 + }, + { + "epoch": 0.6194029850746269, + "grad_norm": 0.3262276808903542, + "learning_rate": 9.758183513070266e-06, + "loss": 0.48102468252182007, + "num_tokens": 75426311.0, + "step": 83 + }, + { + "epoch": 0.6268656716417911, + "grad_norm": 0.298246592306743, + "learning_rate": 9.750518162201858e-06, + "loss": 0.45155635476112366, + "num_tokens": 76290512.0, + "step": 84 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.30840978846450423, + "learning_rate": 9.74273666869476e-06, + "loss": 0.4398882985115051, + "num_tokens": 77207410.0, + "step": 85 + }, + { + "epoch": 0.6417910447761194, + "grad_norm": 0.2986447882814022, + "learning_rate": 9.734839245173213e-06, + "loss": 0.43722379207611084, + "num_tokens": 78061170.0, + "step": 86 + }, + { + "epoch": 0.6492537313432836, + "grad_norm": 0.3213308600234638, + "learning_rate": 9.726826107429168e-06, + "loss": 0.44796180725097656, + "num_tokens": 78868118.0, + "step": 87 + }, + { + "epoch": 0.6567164179104478, + "grad_norm": 0.3249532753373927, + "learning_rate": 9.71869747441639e-06, + "loss": 0.4503297805786133, + "num_tokens": 79869363.0, + "step": 88 + }, + { + "epoch": 0.664179104477612, + "grad_norm": 0.5892356895414527, + "learning_rate": 9.71045356824448e-06, + "loss": 0.4414302110671997, + "num_tokens": 80709876.0, + "step": 89 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.32884534307528746, + "learning_rate": 9.7020946141728e-06, + "loss": 0.42054399847984314, + "num_tokens": 81535856.0, + "step": 90 + }, + { + "epoch": 0.6791044776119403, + "grad_norm": 0.2754517512669749, + "learning_rate": 9.693620840604326e-06, + "loss": 0.4349040985107422, + "num_tokens": 82583455.0, + "step": 91 + }, + { + "epoch": 0.6865671641791045, + "grad_norm": 0.3190387165435769, + "learning_rate": 9.685032479079394e-06, + "loss": 0.44351187348365784, + "num_tokens": 83425036.0, + "step": 92 + }, + { + "epoch": 0.6940298507462687, + "grad_norm": 0.29203678336341016, + "learning_rate": 9.676329764269385e-06, + "loss": 0.4587559103965759, + "num_tokens": 84446952.0, + "step": 93 + }, + { + "epoch": 0.7014925373134329, + "grad_norm": 0.2977218953461726, + "learning_rate": 9.667512933970315e-06, + "loss": 0.429887980222702, + "num_tokens": 85254048.0, + "step": 94 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.319328445980617, + "learning_rate": 9.65858222909632e-06, + "loss": 0.4590649902820587, + "num_tokens": 86163467.0, + "step": 95 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 0.5444784762173913, + "learning_rate": 9.649537893673096e-06, + "loss": 0.4472053647041321, + "num_tokens": 86980140.0, + "step": 96 + }, + { + "epoch": 0.7238805970149254, + "grad_norm": 0.33070572527793457, + "learning_rate": 9.640380174831209e-06, + "loss": 0.44589415192604065, + "num_tokens": 87928454.0, + "step": 97 + }, + { + "epoch": 0.7313432835820896, + "grad_norm": 0.31480720093895037, + "learning_rate": 9.631109322799362e-06, + "loss": 0.45890533924102783, + "num_tokens": 88687125.0, + "step": 98 + }, + { + "epoch": 0.7388059701492538, + "grad_norm": 0.3045515849614143, + "learning_rate": 9.621725590897544e-06, + "loss": 0.4472447633743286, + "num_tokens": 89545040.0, + "step": 99 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.31053505819411625, + "learning_rate": 9.61222923553011e-06, + "loss": 0.44827064871788025, + "num_tokens": 90294885.0, + "step": 100 + }, + { + "epoch": 0.753731343283582, + "grad_norm": 0.3029175634429252, + "learning_rate": 9.60262051617879e-06, + "loss": 0.4412766695022583, + "num_tokens": 91184198.0, + "step": 101 + }, + { + "epoch": 0.7611940298507462, + "grad_norm": 0.31643279761949383, + "learning_rate": 9.592899695395569e-06, + "loss": 0.4483514428138733, + "num_tokens": 91984545.0, + "step": 102 + }, + { + "epoch": 0.7686567164179104, + "grad_norm": 0.29772953486777926, + "learning_rate": 9.583067038795547e-06, + "loss": 0.48575955629348755, + "num_tokens": 92895986.0, + "step": 103 + }, + { + "epoch": 0.7761194029850746, + "grad_norm": 0.3103900650504769, + "learning_rate": 9.57312281504965e-06, + "loss": 0.4450864791870117, + "num_tokens": 93788383.0, + "step": 104 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.2842262724404981, + "learning_rate": 9.563067295877319e-06, + "loss": 0.4178208112716675, + "num_tokens": 94636525.0, + "step": 105 + }, + { + "epoch": 0.7910447761194029, + "grad_norm": 0.318233292303752, + "learning_rate": 9.552900756039057e-06, + "loss": 0.48816001415252686, + "num_tokens": 95397416.0, + "step": 106 + }, + { + "epoch": 0.7985074626865671, + "grad_norm": 0.3031459599411157, + "learning_rate": 9.54262347332894e-06, + "loss": 0.4687079191207886, + "num_tokens": 96224288.0, + "step": 107 + }, + { + "epoch": 0.8059701492537313, + "grad_norm": 0.3044834471531261, + "learning_rate": 9.532235728567025e-06, + "loss": 0.4333556890487671, + "num_tokens": 97053744.0, + "step": 108 + }, + { + "epoch": 0.8134328358208955, + "grad_norm": 0.382174488436462, + "learning_rate": 9.521737805591662e-06, + "loss": 0.45386844873428345, + "num_tokens": 97941243.0, + "step": 109 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.29853935870773984, + "learning_rate": 9.511129991251755e-06, + "loss": 0.4180367588996887, + "num_tokens": 98814023.0, + "step": 110 + }, + { + "epoch": 0.8283582089552238, + "grad_norm": 0.3152812743712433, + "learning_rate": 9.500412575398923e-06, + "loss": 0.45900076627731323, + "num_tokens": 99770911.0, + "step": 111 + }, + { + "epoch": 0.835820895522388, + "grad_norm": 0.2798327916645599, + "learning_rate": 9.489585850879565e-06, + "loss": 0.4589983820915222, + "num_tokens": 100802886.0, + "step": 112 + }, + { + "epoch": 0.8432835820895522, + "grad_norm": 0.3302819245429099, + "learning_rate": 9.478650113526875e-06, + "loss": 0.44858676195144653, + "num_tokens": 101744970.0, + "step": 113 + }, + { + "epoch": 0.8507462686567164, + "grad_norm": 0.29962088349132515, + "learning_rate": 9.467605662152746e-06, + "loss": 0.4746031165122986, + "num_tokens": 102730722.0, + "step": 114 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.2939144591705004, + "learning_rate": 9.456452798539617e-06, + "loss": 0.4174093008041382, + "num_tokens": 103574949.0, + "step": 115 + }, + { + "epoch": 0.8656716417910447, + "grad_norm": 0.3825239836099086, + "learning_rate": 9.445191827432216e-06, + "loss": 0.439868301153183, + "num_tokens": 104504791.0, + "step": 116 + }, + { + "epoch": 0.8731343283582089, + "grad_norm": 0.30386076772048964, + "learning_rate": 9.433823056529241e-06, + "loss": 0.47291260957717896, + "num_tokens": 105479834.0, + "step": 117 + }, + { + "epoch": 0.8805970149253731, + "grad_norm": 0.2762720558491326, + "learning_rate": 9.42234679647495e-06, + "loss": 0.4426780045032501, + "num_tokens": 106438084.0, + "step": 118 + }, + { + "epoch": 0.8880597014925373, + "grad_norm": 0.3057508592926945, + "learning_rate": 9.410763360850666e-06, + "loss": 0.4623616933822632, + "num_tokens": 107262750.0, + "step": 119 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.3127855621446368, + "learning_rate": 9.399073066166218e-06, + "loss": 0.4572855234146118, + "num_tokens": 108143548.0, + "step": 120 + }, + { + "epoch": 0.9029850746268657, + "grad_norm": 0.32166755849704814, + "learning_rate": 9.387276231851292e-06, + "loss": 0.4610549211502075, + "num_tokens": 109031239.0, + "step": 121 + }, + { + "epoch": 0.9104477611940298, + "grad_norm": 0.308391680528446, + "learning_rate": 9.375373180246698e-06, + "loss": 0.4695647358894348, + "num_tokens": 109986382.0, + "step": 122 + }, + { + "epoch": 0.917910447761194, + "grad_norm": 0.2975657588114746, + "learning_rate": 9.363364236595561e-06, + "loss": 0.47796621918678284, + "num_tokens": 110966120.0, + "step": 123 + }, + { + "epoch": 0.9253731343283582, + "grad_norm": 0.31052979583373397, + "learning_rate": 9.351249729034441e-06, + "loss": 0.46253445744514465, + "num_tokens": 111841748.0, + "step": 124 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.30804176635348807, + "learning_rate": 9.339029988584364e-06, + "loss": 0.45033249258995056, + "num_tokens": 112797621.0, + "step": 125 + }, + { + "epoch": 0.9402985074626866, + "grad_norm": 0.2896323126815727, + "learning_rate": 9.326705349141772e-06, + "loss": 0.46928197145462036, + "num_tokens": 113854322.0, + "step": 126 + }, + { + "epoch": 0.9477611940298507, + "grad_norm": 0.2863377703738466, + "learning_rate": 9.31427614746941e-06, + "loss": 0.44036608934402466, + "num_tokens": 114797592.0, + "step": 127 + }, + { + "epoch": 0.9552238805970149, + "grad_norm": 0.3136460841921916, + "learning_rate": 9.301742723187106e-06, + "loss": 0.4462299644947052, + "num_tokens": 115756574.0, + "step": 128 + }, + { + "epoch": 0.9626865671641791, + "grad_norm": 0.30712216569223755, + "learning_rate": 9.289105418762512e-06, + "loss": 0.46634775400161743, + "num_tokens": 116620827.0, + "step": 129 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.30150157073298506, + "learning_rate": 9.276364579501743e-06, + "loss": 0.4525374174118042, + "num_tokens": 117496028.0, + "step": 130 + }, + { + "epoch": 0.9776119402985075, + "grad_norm": 0.2863498319159055, + "learning_rate": 9.263520553539919e-06, + "loss": 0.43308988213539124, + "num_tokens": 118326101.0, + "step": 131 + }, + { + "epoch": 0.9850746268656716, + "grad_norm": 0.31739713823558746, + "learning_rate": 9.250573691831688e-06, + "loss": 0.4591742753982544, + "num_tokens": 119217901.0, + "step": 132 + }, + { + "epoch": 0.9925373134328358, + "grad_norm": 0.3107389978804748, + "learning_rate": 9.2375243481416e-06, + "loss": 0.4491395056247711, + "num_tokens": 120120192.0, + "step": 133 + }, + { + "epoch": 1.0, + "grad_norm": 0.29934735002842794, + "learning_rate": 9.224372879034471e-06, + "loss": 0.44749873876571655, + "num_tokens": 121051485.0, + "step": 134 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.33488387869414854, + "learning_rate": 9.211119643865626e-06, + "loss": 0.4307776689529419, + "num_tokens": 121991896.0, + "step": 135 + }, + { + "epoch": 1.0149253731343284, + "grad_norm": 0.32499655410029626, + "learning_rate": 9.197765004771074e-06, + "loss": 0.4204443097114563, + "num_tokens": 122819690.0, + "step": 136 + }, + { + "epoch": 1.0223880597014925, + "grad_norm": 0.34181089478733623, + "learning_rate": 9.184309326657627e-06, + "loss": 0.41079288721084595, + "num_tokens": 123657032.0, + "step": 137 + }, + { + "epoch": 1.0298507462686568, + "grad_norm": 0.5825488788426431, + "learning_rate": 9.17075297719292e-06, + "loss": 0.4082901179790497, + "num_tokens": 124550556.0, + "step": 138 + }, + { + "epoch": 1.037313432835821, + "grad_norm": 1.1799244713672623, + "learning_rate": 9.157096326795369e-06, + "loss": 0.42325854301452637, + "num_tokens": 125328617.0, + "step": 139 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.3981431547057968, + "learning_rate": 9.143339748624044e-06, + "loss": 0.40712812542915344, + "num_tokens": 126306594.0, + "step": 140 + }, + { + "epoch": 1.0522388059701493, + "grad_norm": 0.32884099051410826, + "learning_rate": 9.129483618568478e-06, + "loss": 0.4147931933403015, + "num_tokens": 127215038.0, + "step": 141 + }, + { + "epoch": 1.0597014925373134, + "grad_norm": 0.3071551975535917, + "learning_rate": 9.115528315238396e-06, + "loss": 0.4247783422470093, + "num_tokens": 128054129.0, + "step": 142 + }, + { + "epoch": 1.0671641791044777, + "grad_norm": 0.3132240777032372, + "learning_rate": 9.101474219953367e-06, + "loss": 0.4133056104183197, + "num_tokens": 128952014.0, + "step": 143 + }, + { + "epoch": 1.0746268656716418, + "grad_norm": 0.31895939410654406, + "learning_rate": 9.087321716732384e-06, + "loss": 0.4213321805000305, + "num_tokens": 129774041.0, + "step": 144 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.32304487832880724, + "learning_rate": 9.073071192283374e-06, + "loss": 0.4195047616958618, + "num_tokens": 130656187.0, + "step": 145 + }, + { + "epoch": 1.0895522388059702, + "grad_norm": 0.31668877560620456, + "learning_rate": 9.058723035992632e-06, + "loss": 0.4216320514678955, + "num_tokens": 131546421.0, + "step": 146 + }, + { + "epoch": 1.0970149253731343, + "grad_norm": 0.30109857359574926, + "learning_rate": 9.044277639914177e-06, + "loss": 0.4255885183811188, + "num_tokens": 132482644.0, + "step": 147 + }, + { + "epoch": 1.1044776119402986, + "grad_norm": 0.28611352244816046, + "learning_rate": 9.029735398759044e-06, + "loss": 0.4004859924316406, + "num_tokens": 133363098.0, + "step": 148 + }, + { + "epoch": 1.1119402985074627, + "grad_norm": 0.3246541214309705, + "learning_rate": 9.015096709884493e-06, + "loss": 0.41801226139068604, + "num_tokens": 134281169.0, + "step": 149 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.39523810160114464, + "learning_rate": 9.00036197328316e-06, + "loss": 0.39403271675109863, + "num_tokens": 135132326.0, + "step": 150 + }, + { + "epoch": 1.126865671641791, + "grad_norm": 0.3372219635650443, + "learning_rate": 8.985531591572117e-06, + "loss": 0.40995997190475464, + "num_tokens": 136009199.0, + "step": 151 + }, + { + "epoch": 1.1343283582089552, + "grad_norm": 0.2880187226242739, + "learning_rate": 8.97060596998188e-06, + "loss": 0.44250696897506714, + "num_tokens": 136974761.0, + "step": 152 + }, + { + "epoch": 1.1417910447761195, + "grad_norm": 0.2840439662929065, + "learning_rate": 8.955585516345333e-06, + "loss": 0.41125112771987915, + "num_tokens": 137953131.0, + "step": 153 + }, + { + "epoch": 1.1492537313432836, + "grad_norm": 0.30854018310336556, + "learning_rate": 8.940470641086583e-06, + "loss": 0.41466018557548523, + "num_tokens": 138890202.0, + "step": 154 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.2861522107018775, + "learning_rate": 8.925261757209744e-06, + "loss": 0.4421645998954773, + "num_tokens": 139921851.0, + "step": 155 + }, + { + "epoch": 1.164179104477612, + "grad_norm": 0.30184466401361404, + "learning_rate": 8.909959280287657e-06, + "loss": 0.41726770997047424, + "num_tokens": 140840212.0, + "step": 156 + }, + { + "epoch": 1.171641791044776, + "grad_norm": 0.29786414496705443, + "learning_rate": 8.894563628450534e-06, + "loss": 0.4137997627258301, + "num_tokens": 141681181.0, + "step": 157 + }, + { + "epoch": 1.1791044776119404, + "grad_norm": 0.27612956474353256, + "learning_rate": 8.879075222374522e-06, + "loss": 0.3967845141887665, + "num_tokens": 142603331.0, + "step": 158 + }, + { + "epoch": 1.1865671641791045, + "grad_norm": 0.2936198747641151, + "learning_rate": 8.863494485270228e-06, + "loss": 0.3882240355014801, + "num_tokens": 143438386.0, + "step": 159 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.28750782577222145, + "learning_rate": 8.847821842871137e-06, + "loss": 0.42263633012771606, + "num_tokens": 144352522.0, + "step": 160 + }, + { + "epoch": 1.2014925373134329, + "grad_norm": 0.32255178451364774, + "learning_rate": 8.832057723421989e-06, + "loss": 0.42398497462272644, + "num_tokens": 145160558.0, + "step": 161 + }, + { + "epoch": 1.208955223880597, + "grad_norm": 0.32016607068719616, + "learning_rate": 8.816202557667076e-06, + "loss": 0.40889400243759155, + "num_tokens": 145970221.0, + "step": 162 + }, + { + "epoch": 1.2164179104477613, + "grad_norm": 0.30212941397274007, + "learning_rate": 8.800256778838468e-06, + "loss": 0.3960338234901428, + "num_tokens": 146893310.0, + "step": 163 + }, + { + "epoch": 1.2238805970149254, + "grad_norm": 0.31197303744834676, + "learning_rate": 8.78422082264418e-06, + "loss": 0.44305476546287537, + "num_tokens": 147701963.0, + "step": 164 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.2823293130053843, + "learning_rate": 8.768095127256263e-06, + "loss": 0.3833114206790924, + "num_tokens": 148634179.0, + "step": 165 + }, + { + "epoch": 1.2388059701492538, + "grad_norm": 0.2811151003410808, + "learning_rate": 8.751880133298834e-06, + "loss": 0.4171923100948334, + "num_tokens": 149594443.0, + "step": 166 + }, + { + "epoch": 1.2462686567164178, + "grad_norm": 0.31565679619489956, + "learning_rate": 8.735576283836039e-06, + "loss": 0.43264657258987427, + "num_tokens": 150495465.0, + "step": 167 + }, + { + "epoch": 1.2537313432835822, + "grad_norm": 0.3023001398731657, + "learning_rate": 8.719184024359935e-06, + "loss": 0.4185860753059387, + "num_tokens": 151402535.0, + "step": 168 + }, + { + "epoch": 1.2611940298507462, + "grad_norm": 0.3114367097991156, + "learning_rate": 8.702703802778332e-06, + "loss": 0.444894403219223, + "num_tokens": 152354215.0, + "step": 169 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.3130958107073367, + "learning_rate": 8.686136069402542e-06, + "loss": 0.3862420916557312, + "num_tokens": 153135819.0, + "step": 170 + }, + { + "epoch": 1.2761194029850746, + "grad_norm": 0.32026467648986173, + "learning_rate": 8.669481276935085e-06, + "loss": 0.43771523237228394, + "num_tokens": 154060950.0, + "step": 171 + }, + { + "epoch": 1.2835820895522387, + "grad_norm": 0.33753040760769915, + "learning_rate": 8.652739880457309e-06, + "loss": 0.4314393401145935, + "num_tokens": 154999582.0, + "step": 172 + }, + { + "epoch": 1.291044776119403, + "grad_norm": 0.31404977555481944, + "learning_rate": 8.635912337416963e-06, + "loss": 0.4238457679748535, + "num_tokens": 155889540.0, + "step": 173 + }, + { + "epoch": 1.2985074626865671, + "grad_norm": 0.2917828706410469, + "learning_rate": 8.618999107615694e-06, + "loss": 0.4157620072364807, + "num_tokens": 156887223.0, + "step": 174 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.2929002597150211, + "learning_rate": 8.602000653196484e-06, + "loss": 0.4093779921531677, + "num_tokens": 157776705.0, + "step": 175 + }, + { + "epoch": 1.3134328358208955, + "grad_norm": 0.2981368517552101, + "learning_rate": 8.584917438631022e-06, + "loss": 0.4151228070259094, + "num_tokens": 158724790.0, + "step": 176 + }, + { + "epoch": 1.3208955223880596, + "grad_norm": 0.307459834676784, + "learning_rate": 8.567749930707012e-06, + "loss": 0.42905163764953613, + "num_tokens": 159719326.0, + "step": 177 + }, + { + "epoch": 1.328358208955224, + "grad_norm": 0.3174851983597954, + "learning_rate": 8.55049859851542e-06, + "loss": 0.44639986753463745, + "num_tokens": 160650411.0, + "step": 178 + }, + { + "epoch": 1.335820895522388, + "grad_norm": 0.37310729673210785, + "learning_rate": 8.533163913437657e-06, + "loss": 0.4070381820201874, + "num_tokens": 161685151.0, + "step": 179 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.34243880652688075, + "learning_rate": 8.515746349132693e-06, + "loss": 0.40524742007255554, + "num_tokens": 162668291.0, + "step": 180 + }, + { + "epoch": 1.3507462686567164, + "grad_norm": 0.3314697629279733, + "learning_rate": 8.498246381524123e-06, + "loss": 0.39374542236328125, + "num_tokens": 163602019.0, + "step": 181 + }, + { + "epoch": 1.3582089552238805, + "grad_norm": 0.39714424307879675, + "learning_rate": 8.480664488787157e-06, + "loss": 0.41536325216293335, + "num_tokens": 164374987.0, + "step": 182 + }, + { + "epoch": 1.3656716417910448, + "grad_norm": 0.30470654817019394, + "learning_rate": 8.463001151335556e-06, + "loss": 0.420206755399704, + "num_tokens": 165277351.0, + "step": 183 + }, + { + "epoch": 1.373134328358209, + "grad_norm": 0.30147269826178413, + "learning_rate": 8.445256851808504e-06, + "loss": 0.40577423572540283, + "num_tokens": 166179864.0, + "step": 184 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.3160553991473881, + "learning_rate": 8.427432075057422e-06, + "loss": 0.3979928195476532, + "num_tokens": 167127067.0, + "step": 185 + }, + { + "epoch": 1.3880597014925373, + "grad_norm": 0.31665903933128287, + "learning_rate": 8.409527308132717e-06, + "loss": 0.4436604976654053, + "num_tokens": 168100947.0, + "step": 186 + }, + { + "epoch": 1.3955223880597014, + "grad_norm": 0.296181555140025, + "learning_rate": 8.391543040270477e-06, + "loss": 0.42373591661453247, + "num_tokens": 168977100.0, + "step": 187 + }, + { + "epoch": 1.4029850746268657, + "grad_norm": 0.340781706854354, + "learning_rate": 8.373479762879104e-06, + "loss": 0.4242423474788666, + "num_tokens": 169809036.0, + "step": 188 + }, + { + "epoch": 1.4104477611940298, + "grad_norm": 0.2912347476979519, + "learning_rate": 8.355337969525876e-06, + "loss": 0.3881043791770935, + "num_tokens": 170799001.0, + "step": 189 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.3167891630018227, + "learning_rate": 8.337118155923474e-06, + "loss": 0.417064368724823, + "num_tokens": 171563636.0, + "step": 190 + }, + { + "epoch": 1.4253731343283582, + "grad_norm": 0.32116936347486175, + "learning_rate": 8.318820819916433e-06, + "loss": 0.40856266021728516, + "num_tokens": 172297711.0, + "step": 191 + }, + { + "epoch": 1.4328358208955223, + "grad_norm": 0.3019887016574649, + "learning_rate": 8.300446461467533e-06, + "loss": 0.4446168541908264, + "num_tokens": 173246434.0, + "step": 192 + }, + { + "epoch": 1.4402985074626866, + "grad_norm": 0.3138769818399579, + "learning_rate": 8.281995582644145e-06, + "loss": 0.4181920289993286, + "num_tokens": 174149904.0, + "step": 193 + }, + { + "epoch": 1.4477611940298507, + "grad_norm": 0.313975344503838, + "learning_rate": 8.263468687604508e-06, + "loss": 0.4371890425682068, + "num_tokens": 174963687.0, + "step": 194 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.29628794439446526, + "learning_rate": 8.244866282583957e-06, + "loss": 0.43816518783569336, + "num_tokens": 175988598.0, + "step": 195 + }, + { + "epoch": 1.462686567164179, + "grad_norm": 0.2963583065242463, + "learning_rate": 8.226188875881082e-06, + "loss": 0.41185736656188965, + "num_tokens": 176960311.0, + "step": 196 + }, + { + "epoch": 1.4701492537313432, + "grad_norm": 0.2991189293307387, + "learning_rate": 8.20743697784385e-06, + "loss": 0.46473461389541626, + "num_tokens": 177889691.0, + "step": 197 + }, + { + "epoch": 1.4776119402985075, + "grad_norm": 0.26573849496019714, + "learning_rate": 8.188611100855656e-06, + "loss": 0.3865639567375183, + "num_tokens": 178835508.0, + "step": 198 + }, + { + "epoch": 1.4850746268656716, + "grad_norm": 0.28471866573069565, + "learning_rate": 8.169711759321318e-06, + "loss": 0.4254840612411499, + "num_tokens": 179780829.0, + "step": 199 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.27591064975620333, + "learning_rate": 8.150739469653026e-06, + "loss": 0.3821393847465515, + "num_tokens": 180675259.0, + "step": 200 + }, + { + "epoch": 1.5, + "grad_norm": 0.2912891463065521, + "learning_rate": 8.131694750256234e-06, + "loss": 0.4260258972644806, + "num_tokens": 181593083.0, + "step": 201 + }, + { + "epoch": 1.5074626865671643, + "grad_norm": 0.3470505245514532, + "learning_rate": 8.112578121515485e-06, + "loss": 0.42295166850090027, + "num_tokens": 182453649.0, + "step": 202 + }, + { + "epoch": 1.5149253731343284, + "grad_norm": 0.333624297966994, + "learning_rate": 8.0933901057802e-06, + "loss": 0.4165676534175873, + "num_tokens": 183252908.0, + "step": 203 + }, + { + "epoch": 1.5223880597014925, + "grad_norm": 0.2999450247966616, + "learning_rate": 8.074131227350408e-06, + "loss": 0.42348137497901917, + "num_tokens": 184218061.0, + "step": 204 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.33075885588759496, + "learning_rate": 8.05480201246241e-06, + "loss": 0.4413604140281677, + "num_tokens": 185123701.0, + "step": 205 + }, + { + "epoch": 1.537313432835821, + "grad_norm": 0.3236918821990334, + "learning_rate": 8.035402989274402e-06, + "loss": 0.4267103970050812, + "num_tokens": 186020054.0, + "step": 206 + }, + { + "epoch": 1.544776119402985, + "grad_norm": 0.28545115313146596, + "learning_rate": 8.015934687852053e-06, + "loss": 0.4010322690010071, + "num_tokens": 186957926.0, + "step": 207 + }, + { + "epoch": 1.5522388059701493, + "grad_norm": 0.33525388932605726, + "learning_rate": 7.996397640154012e-06, + "loss": 0.43479830026626587, + "num_tokens": 187967937.0, + "step": 208 + }, + { + "epoch": 1.5597014925373134, + "grad_norm": 0.2852110581692416, + "learning_rate": 7.976792380017374e-06, + "loss": 0.3835904002189636, + "num_tokens": 188699883.0, + "step": 209 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.38746256380732114, + "learning_rate": 7.957119443143093e-06, + "loss": 0.43473392724990845, + "num_tokens": 189533459.0, + "step": 210 + }, + { + "epoch": 1.5746268656716418, + "grad_norm": 0.30040372660742176, + "learning_rate": 7.937379367081356e-06, + "loss": 0.4094908535480499, + "num_tokens": 190331401.0, + "step": 211 + }, + { + "epoch": 1.582089552238806, + "grad_norm": 0.35097170028371405, + "learning_rate": 7.917572691216868e-06, + "loss": 0.44787487387657166, + "num_tokens": 191163315.0, + "step": 212 + }, + { + "epoch": 1.5895522388059702, + "grad_norm": 0.29035162522974023, + "learning_rate": 7.897699956754142e-06, + "loss": 0.41564756631851196, + "num_tokens": 192105809.0, + "step": 213 + }, + { + "epoch": 1.5970149253731343, + "grad_norm": 0.3234055460991543, + "learning_rate": 7.877761706702698e-06, + "loss": 0.42737478017807007, + "num_tokens": 193098168.0, + "step": 214 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.3181366599415042, + "learning_rate": 7.85775848586222e-06, + "loss": 0.4263436794281006, + "num_tokens": 193975959.0, + "step": 215 + }, + { + "epoch": 1.6119402985074627, + "grad_norm": 0.3047597849777916, + "learning_rate": 7.837690840807688e-06, + "loss": 0.4356343150138855, + "num_tokens": 194828963.0, + "step": 216 + }, + { + "epoch": 1.6194029850746268, + "grad_norm": 0.2953366209904587, + "learning_rate": 7.817559319874417e-06, + "loss": 0.39498403668403625, + "num_tokens": 195757337.0, + "step": 217 + }, + { + "epoch": 1.626865671641791, + "grad_norm": 0.2936401683412748, + "learning_rate": 7.797364473143105e-06, + "loss": 0.4154474139213562, + "num_tokens": 196731181.0, + "step": 218 + }, + { + "epoch": 1.6343283582089554, + "grad_norm": 0.2898185408597091, + "learning_rate": 7.77710685242477e-06, + "loss": 0.42473846673965454, + "num_tokens": 197621017.0, + "step": 219 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.29114088952907274, + "learning_rate": 7.7567870112457e-06, + "loss": 0.4433613419532776, + "num_tokens": 198631859.0, + "step": 220 + }, + { + "epoch": 1.6492537313432836, + "grad_norm": 0.31287064287880717, + "learning_rate": 7.736405504832314e-06, + "loss": 0.4322376549243927, + "num_tokens": 199557498.0, + "step": 221 + }, + { + "epoch": 1.6567164179104479, + "grad_norm": 0.3031132335175992, + "learning_rate": 7.715962890095988e-06, + "loss": 0.41872939467430115, + "num_tokens": 200455519.0, + "step": 222 + }, + { + "epoch": 1.664179104477612, + "grad_norm": 0.5127084447985639, + "learning_rate": 7.695459725617851e-06, + "loss": 0.4426816999912262, + "num_tokens": 201364168.0, + "step": 223 + }, + { + "epoch": 1.671641791044776, + "grad_norm": 0.36355358662257686, + "learning_rate": 7.674896571633507e-06, + "loss": 0.3920941650867462, + "num_tokens": 202272665.0, + "step": 224 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.2918543179655489, + "learning_rate": 7.654273990017742e-06, + "loss": 0.3865686058998108, + "num_tokens": 203236852.0, + "step": 225 + }, + { + "epoch": 1.6865671641791045, + "grad_norm": 0.29443958475831755, + "learning_rate": 7.633592544269152e-06, + "loss": 0.41160887479782104, + "num_tokens": 204144409.0, + "step": 226 + }, + { + "epoch": 1.6940298507462686, + "grad_norm": 0.29368087510062574, + "learning_rate": 7.61285279949477e-06, + "loss": 0.41996899247169495, + "num_tokens": 205087641.0, + "step": 227 + }, + { + "epoch": 1.7014925373134329, + "grad_norm": 0.2981876720268518, + "learning_rate": 7.592055322394602e-06, + "loss": 0.4322773814201355, + "num_tokens": 205964269.0, + "step": 228 + }, + { + "epoch": 1.7089552238805972, + "grad_norm": 0.3032205060654827, + "learning_rate": 7.5712006812461595e-06, + "loss": 0.4357481002807617, + "num_tokens": 206853325.0, + "step": 229 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.30382769873452287, + "learning_rate": 7.5502894458889154e-06, + "loss": 0.42187392711639404, + "num_tokens": 207780456.0, + "step": 230 + }, + { + "epoch": 1.7238805970149254, + "grad_norm": 0.28458753280851, + "learning_rate": 7.529322187708752e-06, + "loss": 0.4417547583580017, + "num_tokens": 208692271.0, + "step": 231 + }, + { + "epoch": 1.7313432835820897, + "grad_norm": 0.28678480761878283, + "learning_rate": 7.5082994796223355e-06, + "loss": 0.4000692367553711, + "num_tokens": 209542301.0, + "step": 232 + }, + { + "epoch": 1.7388059701492538, + "grad_norm": 0.3105804034516556, + "learning_rate": 7.487221896061458e-06, + "loss": 0.43237993121147156, + "num_tokens": 210462903.0, + "step": 233 + }, + { + "epoch": 1.7462686567164178, + "grad_norm": 0.3069476203994755, + "learning_rate": 7.466090012957361e-06, + "loss": 0.4426308274269104, + "num_tokens": 211451379.0, + "step": 234 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.29187302592713965, + "learning_rate": 7.444904407724973e-06, + "loss": 0.4144989252090454, + "num_tokens": 212341336.0, + "step": 235 + }, + { + "epoch": 1.7611940298507462, + "grad_norm": 0.2715020106858522, + "learning_rate": 7.423665659247154e-06, + "loss": 0.4140280485153198, + "num_tokens": 213184565.0, + "step": 236 + }, + { + "epoch": 1.7686567164179103, + "grad_norm": 0.3042751492929567, + "learning_rate": 7.402374347858862e-06, + "loss": 0.4220738708972931, + "num_tokens": 214162910.0, + "step": 237 + }, + { + "epoch": 1.7761194029850746, + "grad_norm": 0.283596579410495, + "learning_rate": 7.381031055331306e-06, + "loss": 0.43350133299827576, + "num_tokens": 215182240.0, + "step": 238 + }, + { + "epoch": 1.783582089552239, + "grad_norm": 0.29114085647177373, + "learning_rate": 7.3596363648560445e-06, + "loss": 0.4327085316181183, + "num_tokens": 216074554.0, + "step": 239 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.28379283338161987, + "learning_rate": 7.338190861029052e-06, + "loss": 0.4293884038925171, + "num_tokens": 216989156.0, + "step": 240 + }, + { + "epoch": 1.7985074626865671, + "grad_norm": 0.31407525298001004, + "learning_rate": 7.316695129834744e-06, + "loss": 0.4033690392971039, + "num_tokens": 217859754.0, + "step": 241 + }, + { + "epoch": 1.8059701492537314, + "grad_norm": 0.3013707320804031, + "learning_rate": 7.2951497586299665e-06, + "loss": 0.415780246257782, + "num_tokens": 218674048.0, + "step": 242 + }, + { + "epoch": 1.8134328358208955, + "grad_norm": 0.3130414485143585, + "learning_rate": 7.273555336127948e-06, + "loss": 0.4289485216140747, + "num_tokens": 219544627.0, + "step": 243 + }, + { + "epoch": 1.8208955223880596, + "grad_norm": 0.271886252549519, + "learning_rate": 7.251912452382206e-06, + "loss": 0.4117184579372406, + "num_tokens": 220510777.0, + "step": 244 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.3095984364408915, + "learning_rate": 7.2302216987704395e-06, + "loss": 0.40528762340545654, + "num_tokens": 221358648.0, + "step": 245 + }, + { + "epoch": 1.835820895522388, + "grad_norm": 0.28537942146166506, + "learning_rate": 7.208483667978351e-06, + "loss": 0.37842410802841187, + "num_tokens": 222227328.0, + "step": 246 + }, + { + "epoch": 1.8432835820895521, + "grad_norm": 0.3285002711937223, + "learning_rate": 7.186698953983466e-06, + "loss": 0.4463423192501068, + "num_tokens": 223216379.0, + "step": 247 + }, + { + "epoch": 1.8507462686567164, + "grad_norm": 0.29900827070350944, + "learning_rate": 7.164868152038899e-06, + "loss": 0.42675986886024475, + "num_tokens": 224109870.0, + "step": 248 + }, + { + "epoch": 1.8582089552238807, + "grad_norm": 0.27490080435841, + "learning_rate": 7.1429918586570815e-06, + "loss": 0.4331856667995453, + "num_tokens": 225101205.0, + "step": 249 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.2935787072389711, + "learning_rate": 7.121070671593477e-06, + "loss": 0.4262286424636841, + "num_tokens": 226119986.0, + "step": 250 + }, + { + "epoch": 1.873134328358209, + "grad_norm": 0.3045861994484339, + "learning_rate": 7.099105189830235e-06, + "loss": 0.4218306541442871, + "num_tokens": 226995732.0, + "step": 251 + }, + { + "epoch": 1.8805970149253732, + "grad_norm": 0.27595409032706397, + "learning_rate": 7.077096013559831e-06, + "loss": 0.4189199209213257, + "num_tokens": 227872634.0, + "step": 252 + }, + { + "epoch": 1.8880597014925373, + "grad_norm": 0.289326233334052, + "learning_rate": 7.055043744168658e-06, + "loss": 0.44568511843681335, + "num_tokens": 228843256.0, + "step": 253 + }, + { + "epoch": 1.8955223880597014, + "grad_norm": 0.3108178596802667, + "learning_rate": 7.032948984220611e-06, + "loss": 0.39977630972862244, + "num_tokens": 229749232.0, + "step": 254 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.3029945133044889, + "learning_rate": 7.0108123374406046e-06, + "loss": 0.41192835569381714, + "num_tokens": 230524739.0, + "step": 255 + }, + { + "epoch": 1.9104477611940298, + "grad_norm": 0.25289759257512634, + "learning_rate": 6.988634408698083e-06, + "loss": 0.38565781712532043, + "num_tokens": 231455850.0, + "step": 256 + }, + { + "epoch": 1.917910447761194, + "grad_norm": 0.298108417839461, + "learning_rate": 6.966415803990501e-06, + "loss": 0.4397220015525818, + "num_tokens": 232349234.0, + "step": 257 + }, + { + "epoch": 1.9253731343283582, + "grad_norm": 0.30576254773905986, + "learning_rate": 6.944157130426745e-06, + "loss": 0.43654486536979675, + "num_tokens": 233187315.0, + "step": 258 + }, + { + "epoch": 1.9328358208955225, + "grad_norm": 0.28668295683966216, + "learning_rate": 6.9218589962105695e-06, + "loss": 0.40597644448280334, + "num_tokens": 234091956.0, + "step": 259 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.2807573548073224, + "learning_rate": 6.899522010623959e-06, + "loss": 0.42698317766189575, + "num_tokens": 235133005.0, + "step": 260 + }, + { + "epoch": 1.9477611940298507, + "grad_norm": 0.2676937710994811, + "learning_rate": 6.877146784010486e-06, + "loss": 0.4118936061859131, + "num_tokens": 235967243.0, + "step": 261 + }, + { + "epoch": 1.955223880597015, + "grad_norm": 0.29199333652094117, + "learning_rate": 6.854733927758636e-06, + "loss": 0.42816537618637085, + "num_tokens": 236876001.0, + "step": 262 + }, + { + "epoch": 1.962686567164179, + "grad_norm": 0.3572922506463511, + "learning_rate": 6.832284054285101e-06, + "loss": 0.43847325444221497, + "num_tokens": 237876952.0, + "step": 263 + }, + { + "epoch": 1.9701492537313432, + "grad_norm": 0.2960985809182997, + "learning_rate": 6.809797777018041e-06, + "loss": 0.43155139684677124, + "num_tokens": 238704164.0, + "step": 264 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.3169980642916318, + "learning_rate": 6.78727571038033e-06, + "loss": 0.4308193027973175, + "num_tokens": 239595870.0, + "step": 265 + }, + { + "epoch": 1.9850746268656716, + "grad_norm": 0.3191747061655072, + "learning_rate": 6.764718469772759e-06, + "loss": 0.4188956022262573, + "num_tokens": 240337386.0, + "step": 266 + }, + { + "epoch": 1.9925373134328357, + "grad_norm": 0.28286588606011187, + "learning_rate": 6.7421266715572275e-06, + "loss": 0.40036123991012573, + "num_tokens": 241215348.0, + "step": 267 + }, + { + "epoch": 2.0, + "grad_norm": 0.2981753233991589, + "learning_rate": 6.719500933039898e-06, + "loss": 0.41549932956695557, + "num_tokens": 242121111.0, + "step": 268 + }, + { + "epoch": 2.0074626865671643, + "grad_norm": 0.33640737374184443, + "learning_rate": 6.696841872454332e-06, + "loss": 0.4132290482521057, + "num_tokens": 243025320.0, + "step": 269 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.2822051764181089, + "learning_rate": 6.674150108944593e-06, + "loss": 0.37781068682670593, + "num_tokens": 243793916.0, + "step": 270 + }, + { + "epoch": 2.0223880597014925, + "grad_norm": 0.38987929902231017, + "learning_rate": 6.651426262548326e-06, + "loss": 0.40918004512786865, + "num_tokens": 244799351.0, + "step": 271 + }, + { + "epoch": 2.029850746268657, + "grad_norm": 0.348061447310908, + "learning_rate": 6.62867095417983e-06, + "loss": 0.3939589858055115, + "num_tokens": 245795313.0, + "step": 272 + }, + { + "epoch": 2.0373134328358207, + "grad_norm": 0.3046732710135438, + "learning_rate": 6.605884805613073e-06, + "loss": 0.36584192514419556, + "num_tokens": 246732184.0, + "step": 273 + }, + { + "epoch": 2.044776119402985, + "grad_norm": 0.3664198494618375, + "learning_rate": 6.583068439464716e-06, + "loss": 0.4081302881240845, + "num_tokens": 247606091.0, + "step": 274 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.3112614984470978, + "learning_rate": 6.560222479177095e-06, + "loss": 0.3947848081588745, + "num_tokens": 248474307.0, + "step": 275 + }, + { + "epoch": 2.0597014925373136, + "grad_norm": 0.3268123714386943, + "learning_rate": 6.537347549001184e-06, + "loss": 0.39627498388290405, + "num_tokens": 249293743.0, + "step": 276 + }, + { + "epoch": 2.0671641791044775, + "grad_norm": 0.30038025917744793, + "learning_rate": 6.514444273979544e-06, + "loss": 0.3961779773235321, + "num_tokens": 250164041.0, + "step": 277 + }, + { + "epoch": 2.074626865671642, + "grad_norm": 0.30941665860783496, + "learning_rate": 6.491513279929238e-06, + "loss": 0.3704898953437805, + "num_tokens": 251063865.0, + "step": 278 + }, + { + "epoch": 2.082089552238806, + "grad_norm": 0.2822311579038674, + "learning_rate": 6.468555193424736e-06, + "loss": 0.3888505697250366, + "num_tokens": 251954121.0, + "step": 279 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.2838966427637005, + "learning_rate": 6.445570641780786e-06, + "loss": 0.3732953667640686, + "num_tokens": 252767775.0, + "step": 280 + }, + { + "epoch": 2.0970149253731343, + "grad_norm": 0.30198287700287857, + "learning_rate": 6.422560253035287e-06, + "loss": 0.3989664614200592, + "num_tokens": 253671573.0, + "step": 281 + }, + { + "epoch": 2.1044776119402986, + "grad_norm": 0.3143195160978541, + "learning_rate": 6.399524655932111e-06, + "loss": 0.4071004390716553, + "num_tokens": 254540226.0, + "step": 282 + }, + { + "epoch": 2.111940298507463, + "grad_norm": 0.29633039155095714, + "learning_rate": 6.376464479903938e-06, + "loss": 0.3590371012687683, + "num_tokens": 255292355.0, + "step": 283 + }, + { + "epoch": 2.1194029850746268, + "grad_norm": 0.2746728490799242, + "learning_rate": 6.353380355055051e-06, + "loss": 0.38884416222572327, + "num_tokens": 256176530.0, + "step": 284 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.2951568696719758, + "learning_rate": 6.330272912144116e-06, + "loss": 0.42871013283729553, + "num_tokens": 257090645.0, + "step": 285 + }, + { + "epoch": 2.1343283582089554, + "grad_norm": 0.2902093873074645, + "learning_rate": 6.307142782566952e-06, + "loss": 0.3986203670501709, + "num_tokens": 258131119.0, + "step": 286 + }, + { + "epoch": 2.1417910447761193, + "grad_norm": 0.3900114303550773, + "learning_rate": 6.283990598339274e-06, + "loss": 0.390123188495636, + "num_tokens": 258880552.0, + "step": 287 + }, + { + "epoch": 2.1492537313432836, + "grad_norm": 0.2806374479908933, + "learning_rate": 6.2608169920794314e-06, + "loss": 0.36130136251449585, + "num_tokens": 259758999.0, + "step": 288 + }, + { + "epoch": 2.156716417910448, + "grad_norm": 0.2942927245657638, + "learning_rate": 6.237622596991106e-06, + "loss": 0.40030941367149353, + "num_tokens": 260602559.0, + "step": 289 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.3214957885578966, + "learning_rate": 6.214408046846034e-06, + "loss": 0.39499810338020325, + "num_tokens": 261439646.0, + "step": 290 + }, + { + "epoch": 2.171641791044776, + "grad_norm": 0.27240683635483437, + "learning_rate": 6.191173975966669e-06, + "loss": 0.3880019783973694, + "num_tokens": 262474020.0, + "step": 291 + }, + { + "epoch": 2.1791044776119404, + "grad_norm": 0.34023027676143563, + "learning_rate": 6.167921019208851e-06, + "loss": 0.42268985509872437, + "num_tokens": 263528820.0, + "step": 292 + }, + { + "epoch": 2.1865671641791047, + "grad_norm": 0.287848829860692, + "learning_rate": 6.144649811944474e-06, + "loss": 0.3913387656211853, + "num_tokens": 264372315.0, + "step": 293 + }, + { + "epoch": 2.1940298507462686, + "grad_norm": 0.29220713499868917, + "learning_rate": 6.121360990044107e-06, + "loss": 0.40157270431518555, + "num_tokens": 265188957.0, + "step": 294 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.286455151799939, + "learning_rate": 6.098055189859634e-06, + "loss": 0.3945062756538391, + "num_tokens": 266184697.0, + "step": 295 + }, + { + "epoch": 2.208955223880597, + "grad_norm": 0.289286738435993, + "learning_rate": 6.074733048206852e-06, + "loss": 0.3945891559123993, + "num_tokens": 267190971.0, + "step": 296 + }, + { + "epoch": 2.216417910447761, + "grad_norm": 0.27448176767847715, + "learning_rate": 6.051395202348089e-06, + "loss": 0.3953642249107361, + "num_tokens": 268121281.0, + "step": 297 + }, + { + "epoch": 2.2238805970149254, + "grad_norm": 0.297149102735408, + "learning_rate": 6.028042289974768e-06, + "loss": 0.3815913796424866, + "num_tokens": 269026334.0, + "step": 298 + }, + { + "epoch": 2.2313432835820897, + "grad_norm": 0.29135459719595014, + "learning_rate": 6.004674949190004e-06, + "loss": 0.3744094967842102, + "num_tokens": 269848673.0, + "step": 299 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.3163386130777747, + "learning_rate": 5.981293818491153e-06, + "loss": 0.411973237991333, + "num_tokens": 270729219.0, + "step": 300 + }, + { + "epoch": 2.246268656716418, + "grad_norm": 0.2996160649578529, + "learning_rate": 5.957899536752373e-06, + "loss": 0.4180707335472107, + "num_tokens": 271647605.0, + "step": 301 + }, + { + "epoch": 2.253731343283582, + "grad_norm": 0.2744717376139136, + "learning_rate": 5.934492743207168e-06, + "loss": 0.36764925718307495, + "num_tokens": 272444857.0, + "step": 302 + }, + { + "epoch": 2.2611940298507465, + "grad_norm": 0.3051287913390687, + "learning_rate": 5.911074077430917e-06, + "loss": 0.3950934410095215, + "num_tokens": 273313831.0, + "step": 303 + }, + { + "epoch": 2.2686567164179103, + "grad_norm": 0.2740805047822694, + "learning_rate": 5.887644179323403e-06, + "loss": 0.38602137565612793, + "num_tokens": 274151817.0, + "step": 304 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.2811027592780593, + "learning_rate": 5.864203689091316e-06, + "loss": 0.40490180253982544, + "num_tokens": 275023603.0, + "step": 305 + }, + { + "epoch": 2.283582089552239, + "grad_norm": 0.37103511230501807, + "learning_rate": 5.840753247230781e-06, + "loss": 0.39756178855895996, + "num_tokens": 275922951.0, + "step": 306 + }, + { + "epoch": 2.291044776119403, + "grad_norm": 0.260165834106451, + "learning_rate": 5.817293494509836e-06, + "loss": 0.3657914996147156, + "num_tokens": 276733073.0, + "step": 307 + }, + { + "epoch": 2.298507462686567, + "grad_norm": 0.2676322746611296, + "learning_rate": 5.793825071950936e-06, + "loss": 0.3826783299446106, + "num_tokens": 277699551.0, + "step": 308 + }, + { + "epoch": 2.3059701492537314, + "grad_norm": 0.3171630796152734, + "learning_rate": 5.770348620813433e-06, + "loss": 0.38245660066604614, + "num_tokens": 278695133.0, + "step": 309 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.2749216503608562, + "learning_rate": 5.746864782576054e-06, + "loss": 0.38771188259124756, + "num_tokens": 279483451.0, + "step": 310 + }, + { + "epoch": 2.3208955223880596, + "grad_norm": 0.34619757766961257, + "learning_rate": 5.723374198919376e-06, + "loss": 0.40358829498291016, + "num_tokens": 280316518.0, + "step": 311 + }, + { + "epoch": 2.328358208955224, + "grad_norm": 0.2628421365077709, + "learning_rate": 5.699877511708285e-06, + "loss": 0.37161552906036377, + "num_tokens": 281300113.0, + "step": 312 + }, + { + "epoch": 2.3358208955223883, + "grad_norm": 0.2865924626367908, + "learning_rate": 5.67637536297445e-06, + "loss": 0.3707822561264038, + "num_tokens": 282213553.0, + "step": 313 + }, + { + "epoch": 2.343283582089552, + "grad_norm": 0.2782360921000711, + "learning_rate": 5.652868394898766e-06, + "loss": 0.38021302223205566, + "num_tokens": 283069634.0, + "step": 314 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.274968159536365, + "learning_rate": 5.6293572497938165e-06, + "loss": 0.4070481061935425, + "num_tokens": 284055909.0, + "step": 315 + }, + { + "epoch": 2.3582089552238807, + "grad_norm": 0.25137582516547385, + "learning_rate": 5.605842570086321e-06, + "loss": 0.38819169998168945, + "num_tokens": 285072190.0, + "step": 316 + }, + { + "epoch": 2.3656716417910446, + "grad_norm": 0.27416935469654424, + "learning_rate": 5.582324998299573e-06, + "loss": 0.3976019620895386, + "num_tokens": 285997942.0, + "step": 317 + }, + { + "epoch": 2.373134328358209, + "grad_norm": 0.28976153755834105, + "learning_rate": 5.558805177035902e-06, + "loss": 0.39910900592803955, + "num_tokens": 286957228.0, + "step": 318 + }, + { + "epoch": 2.3805970149253732, + "grad_norm": 0.3526174425898886, + "learning_rate": 5.53528374895909e-06, + "loss": 0.37735995650291443, + "num_tokens": 287834123.0, + "step": 319 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.2753135236966283, + "learning_rate": 5.511761356776834e-06, + "loss": 0.3974205553531647, + "num_tokens": 288755581.0, + "step": 320 + }, + { + "epoch": 2.3955223880597014, + "grad_norm": 0.2836500955971764, + "learning_rate": 5.488238643223167e-06, + "loss": 0.4040617346763611, + "num_tokens": 289616887.0, + "step": 321 + }, + { + "epoch": 2.4029850746268657, + "grad_norm": 0.3001483066578534, + "learning_rate": 5.464716251040911e-06, + "loss": 0.39118584990501404, + "num_tokens": 290466034.0, + "step": 322 + }, + { + "epoch": 2.41044776119403, + "grad_norm": 0.29609458212755346, + "learning_rate": 5.4411948229641e-06, + "loss": 0.4012300372123718, + "num_tokens": 291327531.0, + "step": 323 + }, + { + "epoch": 2.417910447761194, + "grad_norm": 0.282307409973888, + "learning_rate": 5.417675001700428e-06, + "loss": 0.39297211170196533, + "num_tokens": 292249211.0, + "step": 324 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.31947796875203593, + "learning_rate": 5.394157429913681e-06, + "loss": 0.43389707803726196, + "num_tokens": 293154262.0, + "step": 325 + }, + { + "epoch": 2.4328358208955225, + "grad_norm": 0.2806921837500959, + "learning_rate": 5.370642750206184e-06, + "loss": 0.4193563461303711, + "num_tokens": 294190925.0, + "step": 326 + }, + { + "epoch": 2.4402985074626864, + "grad_norm": 0.28217215862589007, + "learning_rate": 5.347131605101237e-06, + "loss": 0.42073380947113037, + "num_tokens": 295155201.0, + "step": 327 + }, + { + "epoch": 2.4477611940298507, + "grad_norm": 0.2595127351145338, + "learning_rate": 5.323624637025552e-06, + "loss": 0.38413190841674805, + "num_tokens": 296039941.0, + "step": 328 + }, + { + "epoch": 2.455223880597015, + "grad_norm": 0.27537880701127315, + "learning_rate": 5.300122488291717e-06, + "loss": 0.3896210193634033, + "num_tokens": 296897125.0, + "step": 329 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.2806456708250513, + "learning_rate": 5.276625801080626e-06, + "loss": 0.40547412633895874, + "num_tokens": 297829206.0, + "step": 330 + }, + { + "epoch": 2.470149253731343, + "grad_norm": 0.3233513262930407, + "learning_rate": 5.253135217423948e-06, + "loss": 0.3998452425003052, + "num_tokens": 298813976.0, + "step": 331 + }, + { + "epoch": 2.4776119402985075, + "grad_norm": 0.2870679405386201, + "learning_rate": 5.229651379186569e-06, + "loss": 0.41445013880729675, + "num_tokens": 299755392.0, + "step": 332 + }, + { + "epoch": 2.485074626865672, + "grad_norm": 0.2623639243435129, + "learning_rate": 5.206174928049066e-06, + "loss": 0.3996489644050598, + "num_tokens": 300745461.0, + "step": 333 + }, + { + "epoch": 2.4925373134328357, + "grad_norm": 0.2657883700801823, + "learning_rate": 5.182706505490166e-06, + "loss": 0.3919597864151001, + "num_tokens": 301635785.0, + "step": 334 + }, + { + "epoch": 2.5, + "grad_norm": 0.2730887704012263, + "learning_rate": 5.15924675276922e-06, + "loss": 0.37381941080093384, + "num_tokens": 302529314.0, + "step": 335 + }, + { + "epoch": 2.5074626865671643, + "grad_norm": 0.27926647905507407, + "learning_rate": 5.135796310908685e-06, + "loss": 0.4020169675350189, + "num_tokens": 303325140.0, + "step": 336 + }, + { + "epoch": 2.5149253731343286, + "grad_norm": 0.2573449307599577, + "learning_rate": 5.1123558206766e-06, + "loss": 0.3959069848060608, + "num_tokens": 304291697.0, + "step": 337 + }, + { + "epoch": 2.5223880597014925, + "grad_norm": 0.2713627052957801, + "learning_rate": 5.088925922569084e-06, + "loss": 0.4036637246608734, + "num_tokens": 305167326.0, + "step": 338 + }, + { + "epoch": 2.529850746268657, + "grad_norm": 0.29137688284390684, + "learning_rate": 5.065507256792833e-06, + "loss": 0.40749210119247437, + "num_tokens": 306083413.0, + "step": 339 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.27645786153124524, + "learning_rate": 5.04210046324763e-06, + "loss": 0.3960036039352417, + "num_tokens": 306930925.0, + "step": 340 + }, + { + "epoch": 2.544776119402985, + "grad_norm": 0.2959257579408876, + "learning_rate": 5.018706181508851e-06, + "loss": 0.40943804383277893, + "num_tokens": 307667223.0, + "step": 341 + }, + { + "epoch": 2.5522388059701493, + "grad_norm": 0.2941768147406628, + "learning_rate": 4.995325050809999e-06, + "loss": 0.42352843284606934, + "num_tokens": 308548843.0, + "step": 342 + }, + { + "epoch": 2.5597014925373136, + "grad_norm": 0.3093404075043933, + "learning_rate": 4.971957710025235e-06, + "loss": 0.4167254567146301, + "num_tokens": 309456869.0, + "step": 343 + }, + { + "epoch": 2.5671641791044775, + "grad_norm": 0.285830294036988, + "learning_rate": 4.948604797651914e-06, + "loss": 0.41970574855804443, + "num_tokens": 310374426.0, + "step": 344 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.2822303940696211, + "learning_rate": 4.925266951793149e-06, + "loss": 0.39743444323539734, + "num_tokens": 311185331.0, + "step": 345 + }, + { + "epoch": 2.582089552238806, + "grad_norm": 0.2722209732746419, + "learning_rate": 4.90194481014037e-06, + "loss": 0.4093334674835205, + "num_tokens": 312287344.0, + "step": 346 + }, + { + "epoch": 2.58955223880597, + "grad_norm": 0.3685744506907742, + "learning_rate": 4.878639009955896e-06, + "loss": 0.3837957978248596, + "num_tokens": 313203808.0, + "step": 347 + }, + { + "epoch": 2.5970149253731343, + "grad_norm": 0.26210814461472964, + "learning_rate": 4.855350188055528e-06, + "loss": 0.374228835105896, + "num_tokens": 314127724.0, + "step": 348 + }, + { + "epoch": 2.6044776119402986, + "grad_norm": 0.26577422679986124, + "learning_rate": 4.83207898079115e-06, + "loss": 0.3950842022895813, + "num_tokens": 315094649.0, + "step": 349 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.2694330124125045, + "learning_rate": 4.808826024033334e-06, + "loss": 0.3894980251789093, + "num_tokens": 315902867.0, + "step": 350 + }, + { + "epoch": 2.6194029850746268, + "grad_norm": 0.30012143917049156, + "learning_rate": 4.785591953153966e-06, + "loss": 0.3923467695713043, + "num_tokens": 316809248.0, + "step": 351 + }, + { + "epoch": 2.626865671641791, + "grad_norm": 0.27202743774586025, + "learning_rate": 4.762377403008895e-06, + "loss": 0.40671366453170776, + "num_tokens": 317806785.0, + "step": 352 + }, + { + "epoch": 2.6343283582089554, + "grad_norm": 0.2663498979647159, + "learning_rate": 4.739183007920572e-06, + "loss": 0.40148887038230896, + "num_tokens": 318773135.0, + "step": 353 + }, + { + "epoch": 2.6417910447761193, + "grad_norm": 0.26964724456667694, + "learning_rate": 4.716009401660728e-06, + "loss": 0.36810237169265747, + "num_tokens": 319712540.0, + "step": 354 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.2745583940218022, + "learning_rate": 4.69285721743305e-06, + "loss": 0.3969258666038513, + "num_tokens": 320623524.0, + "step": 355 + }, + { + "epoch": 2.656716417910448, + "grad_norm": 0.2691069702675602, + "learning_rate": 4.669727087855886e-06, + "loss": 0.39531204104423523, + "num_tokens": 321558026.0, + "step": 356 + }, + { + "epoch": 2.664179104477612, + "grad_norm": 0.2790488198361277, + "learning_rate": 4.646619644944951e-06, + "loss": 0.3691323399543762, + "num_tokens": 322457137.0, + "step": 357 + }, + { + "epoch": 2.671641791044776, + "grad_norm": 0.25676092193729705, + "learning_rate": 4.623535520096063e-06, + "loss": 0.3830498456954956, + "num_tokens": 323406835.0, + "step": 358 + }, + { + "epoch": 2.6791044776119404, + "grad_norm": 0.27765790893840286, + "learning_rate": 4.6004753440678894e-06, + "loss": 0.38582926988601685, + "num_tokens": 324270762.0, + "step": 359 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.2578194748970744, + "learning_rate": 4.577439746964715e-06, + "loss": 0.39646175503730774, + "num_tokens": 325172716.0, + "step": 360 + }, + { + "epoch": 2.6940298507462686, + "grad_norm": 0.26611474982215905, + "learning_rate": 4.554429358219214e-06, + "loss": 0.38044852018356323, + "num_tokens": 326161663.0, + "step": 361 + }, + { + "epoch": 2.701492537313433, + "grad_norm": 0.2670566328628317, + "learning_rate": 4.531444806575266e-06, + "loss": 0.40564393997192383, + "num_tokens": 327106201.0, + "step": 362 + }, + { + "epoch": 2.708955223880597, + "grad_norm": 0.274772662861299, + "learning_rate": 4.508486720070761e-06, + "loss": 0.39564812183380127, + "num_tokens": 328050673.0, + "step": 363 + }, + { + "epoch": 2.716417910447761, + "grad_norm": 0.3094439511198801, + "learning_rate": 4.485555726020455e-06, + "loss": 0.3800423741340637, + "num_tokens": 328859100.0, + "step": 364 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.2875993414193674, + "learning_rate": 4.462652450998816e-06, + "loss": 0.4001840353012085, + "num_tokens": 329666962.0, + "step": 365 + }, + { + "epoch": 2.7313432835820897, + "grad_norm": 0.27308262203119327, + "learning_rate": 4.439777520822905e-06, + "loss": 0.39083579182624817, + "num_tokens": 330477732.0, + "step": 366 + }, + { + "epoch": 2.7388059701492535, + "grad_norm": 0.2708315720399402, + "learning_rate": 4.416931560535284e-06, + "loss": 0.39352381229400635, + "num_tokens": 331330359.0, + "step": 367 + }, + { + "epoch": 2.746268656716418, + "grad_norm": 0.2678850422820554, + "learning_rate": 4.394115194386928e-06, + "loss": 0.38045477867126465, + "num_tokens": 332347647.0, + "step": 368 + }, + { + "epoch": 2.753731343283582, + "grad_norm": 0.2753212357157175, + "learning_rate": 4.371329045820172e-06, + "loss": 0.3969570994377136, + "num_tokens": 333284873.0, + "step": 369 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.28683339254512785, + "learning_rate": 4.3485737374516745e-06, + "loss": 0.4235033392906189, + "num_tokens": 334098107.0, + "step": 370 + }, + { + "epoch": 2.7686567164179103, + "grad_norm": 0.2698726522878529, + "learning_rate": 4.3258498910554095e-06, + "loss": 0.38629546761512756, + "num_tokens": 334979408.0, + "step": 371 + }, + { + "epoch": 2.7761194029850746, + "grad_norm": 0.2615554761622241, + "learning_rate": 4.303158127545669e-06, + "loss": 0.3924221694469452, + "num_tokens": 335891381.0, + "step": 372 + }, + { + "epoch": 2.783582089552239, + "grad_norm": 0.26064429917011145, + "learning_rate": 4.280499066960102e-06, + "loss": 0.3906182050704956, + "num_tokens": 336949128.0, + "step": 373 + }, + { + "epoch": 2.791044776119403, + "grad_norm": 0.27127505364411514, + "learning_rate": 4.257873328442774e-06, + "loss": 0.3783274292945862, + "num_tokens": 337776659.0, + "step": 374 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.27410164043945023, + "learning_rate": 4.2352815302272425e-06, + "loss": 0.3829938471317291, + "num_tokens": 338685204.0, + "step": 375 + }, + { + "epoch": 2.8059701492537314, + "grad_norm": 0.2706332327188829, + "learning_rate": 4.212724289619672e-06, + "loss": 0.37140512466430664, + "num_tokens": 339492119.0, + "step": 376 + }, + { + "epoch": 2.8134328358208958, + "grad_norm": 0.29552966231342986, + "learning_rate": 4.190202222981959e-06, + "loss": 0.41518405079841614, + "num_tokens": 340414044.0, + "step": 377 + }, + { + "epoch": 2.8208955223880596, + "grad_norm": 0.4384124363415056, + "learning_rate": 4.1677159457149005e-06, + "loss": 0.3670823574066162, + "num_tokens": 341275739.0, + "step": 378 + }, + { + "epoch": 2.828358208955224, + "grad_norm": 0.2818008385366561, + "learning_rate": 4.145266072241365e-06, + "loss": 0.38579511642456055, + "num_tokens": 342203284.0, + "step": 379 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.26814078006971265, + "learning_rate": 4.122853215989515e-06, + "loss": 0.4062846899032593, + "num_tokens": 343206534.0, + "step": 380 + }, + { + "epoch": 2.843283582089552, + "grad_norm": 0.27452179515826414, + "learning_rate": 4.1004779893760424e-06, + "loss": 0.397432416677475, + "num_tokens": 344154341.0, + "step": 381 + }, + { + "epoch": 2.8507462686567164, + "grad_norm": 0.27288188181425943, + "learning_rate": 4.078141003789431e-06, + "loss": 0.391731858253479, + "num_tokens": 345024971.0, + "step": 382 + }, + { + "epoch": 2.8582089552238807, + "grad_norm": 0.2967872715212152, + "learning_rate": 4.055842869573256e-06, + "loss": 0.400160551071167, + "num_tokens": 345812228.0, + "step": 383 + }, + { + "epoch": 2.8656716417910446, + "grad_norm": 0.27985989099065167, + "learning_rate": 4.0335841960095025e-06, + "loss": 0.3944920599460602, + "num_tokens": 346769134.0, + "step": 384 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.2548795141867926, + "learning_rate": 4.011365591301918e-06, + "loss": 0.404415488243103, + "num_tokens": 347740543.0, + "step": 385 + }, + { + "epoch": 2.8805970149253732, + "grad_norm": 0.2353554630176529, + "learning_rate": 3.989187662559397e-06, + "loss": 0.3925011157989502, + "num_tokens": 348799551.0, + "step": 386 + }, + { + "epoch": 2.888059701492537, + "grad_norm": 0.4371240438139863, + "learning_rate": 3.967051015779389e-06, + "loss": 0.394489049911499, + "num_tokens": 349833256.0, + "step": 387 + }, + { + "epoch": 2.8955223880597014, + "grad_norm": 0.492017294414543, + "learning_rate": 3.944956255831342e-06, + "loss": 0.3901214003562927, + "num_tokens": 350675901.0, + "step": 388 + }, + { + "epoch": 2.9029850746268657, + "grad_norm": 0.28604462735158265, + "learning_rate": 3.922903986440171e-06, + "loss": 0.3956416845321655, + "num_tokens": 351593161.0, + "step": 389 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.3019009320890686, + "learning_rate": 3.900894810169766e-06, + "loss": 0.4037666618824005, + "num_tokens": 352556035.0, + "step": 390 + }, + { + "epoch": 2.917910447761194, + "grad_norm": 0.2929989612906795, + "learning_rate": 3.878929328406524e-06, + "loss": 0.38326603174209595, + "num_tokens": 353175046.0, + "step": 391 + }, + { + "epoch": 2.925373134328358, + "grad_norm": 0.2811533155158446, + "learning_rate": 3.857008141342921e-06, + "loss": 0.3970789909362793, + "num_tokens": 354040412.0, + "step": 392 + }, + { + "epoch": 2.9328358208955225, + "grad_norm": 0.2642763742866724, + "learning_rate": 3.8351318479611045e-06, + "loss": 0.40754109621047974, + "num_tokens": 354957977.0, + "step": 393 + }, + { + "epoch": 2.9402985074626864, + "grad_norm": 0.2553969638436942, + "learning_rate": 3.8133010460165364e-06, + "loss": 0.3917849361896515, + "num_tokens": 355897000.0, + "step": 394 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.3227768986284808, + "learning_rate": 3.791516332021651e-06, + "loss": 0.38059675693511963, + "num_tokens": 356775946.0, + "step": 395 + }, + { + "epoch": 2.955223880597015, + "grad_norm": 0.26373506539724473, + "learning_rate": 3.769778301229562e-06, + "loss": 0.392505407333374, + "num_tokens": 357732570.0, + "step": 396 + }, + { + "epoch": 2.9626865671641793, + "grad_norm": 0.27141559638214446, + "learning_rate": 3.748087547617795e-06, + "loss": 0.38036075234413147, + "num_tokens": 358510667.0, + "step": 397 + }, + { + "epoch": 2.970149253731343, + "grad_norm": 0.24786828522735252, + "learning_rate": 3.7264446638720542e-06, + "loss": 0.37426790595054626, + "num_tokens": 359444745.0, + "step": 398 + }, + { + "epoch": 2.9776119402985075, + "grad_norm": 0.25219066519802286, + "learning_rate": 3.704850241370035e-06, + "loss": 0.3932304382324219, + "num_tokens": 360351403.0, + "step": 399 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.2314040595153558, + "learning_rate": 3.6833048701652574e-06, + "loss": 0.3921104669570923, + "num_tokens": 361414260.0, + "step": 400 + }, + { + "epoch": 2.9925373134328357, + "grad_norm": 0.24531758323496658, + "learning_rate": 3.661809138970951e-06, + "loss": 0.39479339122772217, + "num_tokens": 362313539.0, + "step": 401 + }, + { + "epoch": 3.0, + "grad_norm": 0.269225436814872, + "learning_rate": 3.6403636351439577e-06, + "loss": 0.39549848437309265, + "num_tokens": 363114852.0, + "step": 402 + }, + { + "epoch": 3.0074626865671643, + "grad_norm": 0.28662511975668975, + "learning_rate": 3.618968944668696e-06, + "loss": 0.35942816734313965, + "num_tokens": 363883703.0, + "step": 403 + }, + { + "epoch": 3.014925373134328, + "grad_norm": 0.2897343949926782, + "learning_rate": 3.5976256521411402e-06, + "loss": 0.37709563970565796, + "num_tokens": 364726957.0, + "step": 404 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.25819303755354905, + "learning_rate": 3.576334340752847e-06, + "loss": 0.3720802664756775, + "num_tokens": 365712205.0, + "step": 405 + }, + { + "epoch": 3.029850746268657, + "grad_norm": 0.28009429409591957, + "learning_rate": 3.5550955922750275e-06, + "loss": 0.3992989659309387, + "num_tokens": 366502371.0, + "step": 406 + }, + { + "epoch": 3.0373134328358207, + "grad_norm": 0.2764674226920931, + "learning_rate": 3.533909987042642e-06, + "loss": 0.39246252179145813, + "num_tokens": 367405016.0, + "step": 407 + }, + { + "epoch": 3.044776119402985, + "grad_norm": 0.30985373317019865, + "learning_rate": 3.512778103938542e-06, + "loss": 0.4023834466934204, + "num_tokens": 368186973.0, + "step": 408 + }, + { + "epoch": 3.0522388059701493, + "grad_norm": 0.28547534212425507, + "learning_rate": 3.491700520377667e-06, + "loss": 0.38294538855552673, + "num_tokens": 369054384.0, + "step": 409 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.2749822220227637, + "learning_rate": 3.470677812291248e-06, + "loss": 0.3690488636493683, + "num_tokens": 370021137.0, + "step": 410 + }, + { + "epoch": 3.0671641791044775, + "grad_norm": 0.2617585883370724, + "learning_rate": 3.4497105541110847e-06, + "loss": 0.39320921897888184, + "num_tokens": 370954131.0, + "step": 411 + }, + { + "epoch": 3.074626865671642, + "grad_norm": 0.276121676089303, + "learning_rate": 3.4287993187538445e-06, + "loss": 0.3605678975582123, + "num_tokens": 371779138.0, + "step": 412 + }, + { + "epoch": 3.082089552238806, + "grad_norm": 0.3190227559580631, + "learning_rate": 3.407944677605399e-06, + "loss": 0.408037006855011, + "num_tokens": 372652437.0, + "step": 413 + }, + { + "epoch": 3.08955223880597, + "grad_norm": 0.3764832484269211, + "learning_rate": 3.387147200505232e-06, + "loss": 0.38565126061439514, + "num_tokens": 373477902.0, + "step": 414 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.28107007973769577, + "learning_rate": 3.366407455730849e-06, + "loss": 0.414955735206604, + "num_tokens": 374298186.0, + "step": 415 + }, + { + "epoch": 3.1044776119402986, + "grad_norm": 0.2538068604333711, + "learning_rate": 3.345726009982262e-06, + "loss": 0.3739873766899109, + "num_tokens": 375232722.0, + "step": 416 + }, + { + "epoch": 3.111940298507463, + "grad_norm": 0.25345140165817104, + "learning_rate": 3.3251034283664945e-06, + "loss": 0.39425763487815857, + "num_tokens": 376192544.0, + "step": 417 + }, + { + "epoch": 3.1194029850746268, + "grad_norm": 0.26126693334804235, + "learning_rate": 3.304540274382151e-06, + "loss": 0.3673323094844818, + "num_tokens": 377142524.0, + "step": 418 + }, + { + "epoch": 3.126865671641791, + "grad_norm": 0.2718425837582604, + "learning_rate": 3.284037109904013e-06, + "loss": 0.38800495862960815, + "num_tokens": 378076354.0, + "step": 419 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.24762599606026042, + "learning_rate": 3.263594495167688e-06, + "loss": 0.3551333248615265, + "num_tokens": 378966330.0, + "step": 420 + }, + { + "epoch": 3.1417910447761193, + "grad_norm": 0.3979931015660995, + "learning_rate": 3.2432129887543026e-06, + "loss": 0.3955429196357727, + "num_tokens": 379888904.0, + "step": 421 + }, + { + "epoch": 3.1492537313432836, + "grad_norm": 0.27409522127657593, + "learning_rate": 3.2228931475752323e-06, + "loss": 0.35347574949264526, + "num_tokens": 380738966.0, + "step": 422 + }, + { + "epoch": 3.156716417910448, + "grad_norm": 0.26157991571638095, + "learning_rate": 3.2026355268568987e-06, + "loss": 0.35351991653442383, + "num_tokens": 381614529.0, + "step": 423 + }, + { + "epoch": 3.1641791044776117, + "grad_norm": 0.253961852327095, + "learning_rate": 3.1824406801255836e-06, + "loss": 0.36370548605918884, + "num_tokens": 382513458.0, + "step": 424 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.24868042189319053, + "learning_rate": 3.162309159192316e-06, + "loss": 0.3607192635536194, + "num_tokens": 383449861.0, + "step": 425 + }, + { + "epoch": 3.1791044776119404, + "grad_norm": 0.26485700184898936, + "learning_rate": 3.1422415141377815e-06, + "loss": 0.3481111228466034, + "num_tokens": 384253017.0, + "step": 426 + }, + { + "epoch": 3.1865671641791047, + "grad_norm": 0.28281284316278155, + "learning_rate": 3.122238293297305e-06, + "loss": 0.3816152811050415, + "num_tokens": 385257443.0, + "step": 427 + }, + { + "epoch": 3.1940298507462686, + "grad_norm": 0.2628707804556158, + "learning_rate": 3.10230004324586e-06, + "loss": 0.349966824054718, + "num_tokens": 386017753.0, + "step": 428 + }, + { + "epoch": 3.201492537313433, + "grad_norm": 0.2606711695382564, + "learning_rate": 3.0824273087831335e-06, + "loss": 0.38945478200912476, + "num_tokens": 386978912.0, + "step": 429 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.2747230623623624, + "learning_rate": 3.062620632918648e-06, + "loss": 0.3638556897640228, + "num_tokens": 387852467.0, + "step": 430 + }, + { + "epoch": 3.216417910447761, + "grad_norm": 0.2803007110615389, + "learning_rate": 3.0428805568569076e-06, + "loss": 0.38482367992401123, + "num_tokens": 388658923.0, + "step": 431 + }, + { + "epoch": 3.2238805970149254, + "grad_norm": 0.2645967994643593, + "learning_rate": 3.023207619982629e-06, + "loss": 0.36384740471839905, + "num_tokens": 389508858.0, + "step": 432 + }, + { + "epoch": 3.2313432835820897, + "grad_norm": 0.27202749711662244, + "learning_rate": 3.0036023598459895e-06, + "loss": 0.39492571353912354, + "num_tokens": 390450838.0, + "step": 433 + }, + { + "epoch": 3.2388059701492535, + "grad_norm": 0.2858842639475798, + "learning_rate": 2.9840653121479478e-06, + "loss": 0.3738439679145813, + "num_tokens": 391283207.0, + "step": 434 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.24793258551891303, + "learning_rate": 2.9645970107255997e-06, + "loss": 0.35694074630737305, + "num_tokens": 392285292.0, + "step": 435 + }, + { + "epoch": 3.253731343283582, + "grad_norm": 0.2819278079547717, + "learning_rate": 2.9451979875375913e-06, + "loss": 0.3710547387599945, + "num_tokens": 393145041.0, + "step": 436 + }, + { + "epoch": 3.2611940298507465, + "grad_norm": 0.2631494530375275, + "learning_rate": 2.925868772649591e-06, + "loss": 0.3825373351573944, + "num_tokens": 394022264.0, + "step": 437 + }, + { + "epoch": 3.2686567164179103, + "grad_norm": 0.2555768738323888, + "learning_rate": 2.9066098942197995e-06, + "loss": 0.36353516578674316, + "num_tokens": 394892104.0, + "step": 438 + }, + { + "epoch": 3.2761194029850746, + "grad_norm": 0.252531665467931, + "learning_rate": 2.887421878484516e-06, + "loss": 0.38284653425216675, + "num_tokens": 395835092.0, + "step": 439 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.2845282411268666, + "learning_rate": 2.8683052497437665e-06, + "loss": 0.3927590548992157, + "num_tokens": 396725722.0, + "step": 440 + }, + { + "epoch": 3.291044776119403, + "grad_norm": 0.26812964504110554, + "learning_rate": 2.8492605303469732e-06, + "loss": 0.37616321444511414, + "num_tokens": 397618546.0, + "step": 441 + }, + { + "epoch": 3.298507462686567, + "grad_norm": 0.25144632819615587, + "learning_rate": 2.8302882406786817e-06, + "loss": 0.382343053817749, + "num_tokens": 398571441.0, + "step": 442 + }, + { + "epoch": 3.3059701492537314, + "grad_norm": 0.29981470486255846, + "learning_rate": 2.811388899144345e-06, + "loss": 0.3775964379310608, + "num_tokens": 399409770.0, + "step": 443 + }, + { + "epoch": 3.3134328358208958, + "grad_norm": 0.37890241833609745, + "learning_rate": 2.7925630221561506e-06, + "loss": 0.37770912051200867, + "num_tokens": 400392695.0, + "step": 444 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.2755196059695862, + "learning_rate": 2.7738111241189185e-06, + "loss": 0.3694460690021515, + "num_tokens": 401345623.0, + "step": 445 + }, + { + "epoch": 3.328358208955224, + "grad_norm": 0.2680514510877795, + "learning_rate": 2.755133717416043e-06, + "loss": 0.3776453137397766, + "num_tokens": 402260500.0, + "step": 446 + }, + { + "epoch": 3.3358208955223883, + "grad_norm": 0.24171155783588386, + "learning_rate": 2.7365313123954916e-06, + "loss": 0.3985833525657654, + "num_tokens": 403276687.0, + "step": 447 + }, + { + "epoch": 3.343283582089552, + "grad_norm": 0.2569574542175994, + "learning_rate": 2.718004417355855e-06, + "loss": 0.3654242157936096, + "num_tokens": 404190134.0, + "step": 448 + }, + { + "epoch": 3.3507462686567164, + "grad_norm": 0.2493193792220214, + "learning_rate": 2.699553538532467e-06, + "loss": 0.3807545006275177, + "num_tokens": 405215802.0, + "step": 449 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.35218345161133224, + "learning_rate": 2.6811791800835684e-06, + "loss": 0.37028026580810547, + "num_tokens": 406189028.0, + "step": 450 + }, + { + "epoch": 3.3656716417910446, + "grad_norm": 0.2630947673101325, + "learning_rate": 2.662881844076527e-06, + "loss": 0.3866269886493683, + "num_tokens": 407112961.0, + "step": 451 + }, + { + "epoch": 3.373134328358209, + "grad_norm": 0.24502727833486168, + "learning_rate": 2.6446620304741267e-06, + "loss": 0.3389516770839691, + "num_tokens": 407955720.0, + "step": 452 + }, + { + "epoch": 3.3805970149253732, + "grad_norm": 0.29642792873153473, + "learning_rate": 2.6265202371208985e-06, + "loss": 0.3727038502693176, + "num_tokens": 408861534.0, + "step": 453 + }, + { + "epoch": 3.388059701492537, + "grad_norm": 0.2729055281837691, + "learning_rate": 2.6084569597295227e-06, + "loss": 0.37226539850234985, + "num_tokens": 409769033.0, + "step": 454 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.2591963730270879, + "learning_rate": 2.590472691867284e-06, + "loss": 0.3665540814399719, + "num_tokens": 410734429.0, + "step": 455 + }, + { + "epoch": 3.4029850746268657, + "grad_norm": 0.24603379464247438, + "learning_rate": 2.57256792494258e-06, + "loss": 0.3557394742965698, + "num_tokens": 411668760.0, + "step": 456 + }, + { + "epoch": 3.41044776119403, + "grad_norm": 0.26710941082613454, + "learning_rate": 2.5547431481914973e-06, + "loss": 0.3810808062553406, + "num_tokens": 412593612.0, + "step": 457 + }, + { + "epoch": 3.417910447761194, + "grad_norm": 0.24517969588523647, + "learning_rate": 2.536998848664445e-06, + "loss": 0.36506032943725586, + "num_tokens": 413566574.0, + "step": 458 + }, + { + "epoch": 3.425373134328358, + "grad_norm": 0.26080308677293107, + "learning_rate": 2.5193355112128436e-06, + "loss": 0.375240683555603, + "num_tokens": 414490201.0, + "step": 459 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.2507828857248926, + "learning_rate": 2.501753618475877e-06, + "loss": 0.3682469129562378, + "num_tokens": 415392501.0, + "step": 460 + }, + { + "epoch": 3.4402985074626864, + "grad_norm": 0.2692051104680508, + "learning_rate": 2.4842536508673087e-06, + "loss": 0.37688201665878296, + "num_tokens": 416317197.0, + "step": 461 + }, + { + "epoch": 3.4477611940298507, + "grad_norm": 0.2549135365443059, + "learning_rate": 2.466836086562345e-06, + "loss": 0.36603114008903503, + "num_tokens": 417156988.0, + "step": 462 + }, + { + "epoch": 3.455223880597015, + "grad_norm": 0.2453940193702825, + "learning_rate": 2.4495014014845807e-06, + "loss": 0.3681268095970154, + "num_tokens": 418076187.0, + "step": 463 + }, + { + "epoch": 3.4626865671641793, + "grad_norm": 0.2725566155237126, + "learning_rate": 2.432250069292989e-06, + "loss": 0.37921467423439026, + "num_tokens": 418901462.0, + "step": 464 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.25907951334448015, + "learning_rate": 2.415082561368979e-06, + "loss": 0.39200738072395325, + "num_tokens": 419804291.0, + "step": 465 + }, + { + "epoch": 3.4776119402985075, + "grad_norm": 0.26406315997541896, + "learning_rate": 2.397999346803518e-06, + "loss": 0.39208582043647766, + "num_tokens": 420712064.0, + "step": 466 + }, + { + "epoch": 3.485074626865672, + "grad_norm": 0.23773901962622077, + "learning_rate": 2.3810008923843077e-06, + "loss": 0.37207821011543274, + "num_tokens": 421699792.0, + "step": 467 + }, + { + "epoch": 3.4925373134328357, + "grad_norm": 0.2479152678036227, + "learning_rate": 2.3640876625830385e-06, + "loss": 0.37208831310272217, + "num_tokens": 422643169.0, + "step": 468 + }, + { + "epoch": 3.5, + "grad_norm": 0.2563637058550244, + "learning_rate": 2.347260119542692e-06, + "loss": 0.378294974565506, + "num_tokens": 423633161.0, + "step": 469 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.2606090648417702, + "learning_rate": 2.3305187230649177e-06, + "loss": 0.3819723129272461, + "num_tokens": 424556814.0, + "step": 470 + }, + { + "epoch": 3.5149253731343286, + "grad_norm": 0.251352839735243, + "learning_rate": 2.3138639305974596e-06, + "loss": 0.37940090894699097, + "num_tokens": 425479906.0, + "step": 471 + }, + { + "epoch": 3.5223880597014925, + "grad_norm": 0.24680617583096287, + "learning_rate": 2.2972961972216703e-06, + "loss": 0.3712913393974304, + "num_tokens": 426446651.0, + "step": 472 + }, + { + "epoch": 3.529850746268657, + "grad_norm": 0.25068376553010957, + "learning_rate": 2.2808159756400667e-06, + "loss": 0.36781617999076843, + "num_tokens": 427310770.0, + "step": 473 + }, + { + "epoch": 3.5373134328358207, + "grad_norm": 0.2575081517211329, + "learning_rate": 2.264423716163962e-06, + "loss": 0.38692015409469604, + "num_tokens": 428270355.0, + "step": 474 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.26353888813152593, + "learning_rate": 2.2481198667011675e-06, + "loss": 0.4076312184333801, + "num_tokens": 429240026.0, + "step": 475 + }, + { + "epoch": 3.5522388059701493, + "grad_norm": 0.24599771689956054, + "learning_rate": 2.231904872743739e-06, + "loss": 0.3803725838661194, + "num_tokens": 430167582.0, + "step": 476 + }, + { + "epoch": 3.5597014925373136, + "grad_norm": 0.24639104277677626, + "learning_rate": 2.2157791773558222e-06, + "loss": 0.3705400228500366, + "num_tokens": 431118645.0, + "step": 477 + }, + { + "epoch": 3.5671641791044775, + "grad_norm": 0.25169470907126656, + "learning_rate": 2.199743221161533e-06, + "loss": 0.40112996101379395, + "num_tokens": 432105903.0, + "step": 478 + }, + { + "epoch": 3.574626865671642, + "grad_norm": 0.2494015959735466, + "learning_rate": 2.1837974423329254e-06, + "loss": 0.37427645921707153, + "num_tokens": 432968989.0, + "step": 479 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.2510312087393284, + "learning_rate": 2.1679422765780115e-06, + "loss": 0.3761802613735199, + "num_tokens": 433879607.0, + "step": 480 + }, + { + "epoch": 3.58955223880597, + "grad_norm": 0.2472662036793444, + "learning_rate": 2.152178157128865e-06, + "loss": 0.37739771604537964, + "num_tokens": 434793981.0, + "step": 481 + }, + { + "epoch": 3.5970149253731343, + "grad_norm": 0.2526868205714577, + "learning_rate": 2.136505514729774e-06, + "loss": 0.3701442778110504, + "num_tokens": 435697283.0, + "step": 482 + }, + { + "epoch": 3.6044776119402986, + "grad_norm": 0.2516013371512894, + "learning_rate": 2.1209247776254795e-06, + "loss": 0.3924868106842041, + "num_tokens": 436627533.0, + "step": 483 + }, + { + "epoch": 3.611940298507463, + "grad_norm": 0.24502502103917492, + "learning_rate": 2.1054363715494695e-06, + "loss": 0.34178441762924194, + "num_tokens": 437481939.0, + "step": 484 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.26412516983013123, + "learning_rate": 2.0900407197123444e-06, + "loss": 0.3800678253173828, + "num_tokens": 438276274.0, + "step": 485 + }, + { + "epoch": 3.626865671641791, + "grad_norm": 0.2650046240456923, + "learning_rate": 2.0747382427902574e-06, + "loss": 0.4031677544116974, + "num_tokens": 439089480.0, + "step": 486 + }, + { + "epoch": 3.6343283582089554, + "grad_norm": 0.2587379164469128, + "learning_rate": 2.059529358913418e-06, + "loss": 0.37271153926849365, + "num_tokens": 439983559.0, + "step": 487 + }, + { + "epoch": 3.6417910447761193, + "grad_norm": 0.2540913543502109, + "learning_rate": 2.0444144836546684e-06, + "loss": 0.3822531998157501, + "num_tokens": 440850324.0, + "step": 488 + }, + { + "epoch": 3.6492537313432836, + "grad_norm": 0.27783327558446214, + "learning_rate": 2.0293940300181216e-06, + "loss": 0.3831808269023895, + "num_tokens": 441605590.0, + "step": 489 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.2796967269697153, + "learning_rate": 2.0144684084278847e-06, + "loss": 0.3709692060947418, + "num_tokens": 442348946.0, + "step": 490 + }, + { + "epoch": 3.664179104477612, + "grad_norm": 0.2465314987769435, + "learning_rate": 1.999638026716842e-06, + "loss": 0.35937702655792236, + "num_tokens": 443300971.0, + "step": 491 + }, + { + "epoch": 3.671641791044776, + "grad_norm": 0.24683809772269052, + "learning_rate": 1.9849032901155075e-06, + "loss": 0.39329999685287476, + "num_tokens": 444301774.0, + "step": 492 + }, + { + "epoch": 3.6791044776119404, + "grad_norm": 0.23859031932692393, + "learning_rate": 1.970264601240958e-06, + "loss": 0.3722185492515564, + "num_tokens": 445224414.0, + "step": 493 + }, + { + "epoch": 3.6865671641791042, + "grad_norm": 0.2707873095537451, + "learning_rate": 1.955722360085824e-06, + "loss": 0.38121020793914795, + "num_tokens": 446138719.0, + "step": 494 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.27134162465622047, + "learning_rate": 1.941276964007369e-06, + "loss": 0.41704389452934265, + "num_tokens": 447027595.0, + "step": 495 + }, + { + "epoch": 3.701492537313433, + "grad_norm": 0.27102779980384334, + "learning_rate": 1.9269288077166264e-06, + "loss": 0.41601014137268066, + "num_tokens": 447918016.0, + "step": 496 + }, + { + "epoch": 3.708955223880597, + "grad_norm": 0.2795343486481597, + "learning_rate": 1.9126782832676175e-06, + "loss": 0.37963247299194336, + "num_tokens": 448782123.0, + "step": 497 + }, + { + "epoch": 3.716417910447761, + "grad_norm": 0.24533152172023073, + "learning_rate": 1.898525780046635e-06, + "loss": 0.37255096435546875, + "num_tokens": 449735295.0, + "step": 498 + }, + { + "epoch": 3.7238805970149254, + "grad_norm": 0.25068064600084505, + "learning_rate": 1.8844716847616053e-06, + "loss": 0.3953704237937927, + "num_tokens": 450703519.0, + "step": 499 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.27826952511668485, + "learning_rate": 1.870516381431523e-06, + "loss": 0.37893447279930115, + "num_tokens": 451523722.0, + "step": 500 + }, + { + "epoch": 3.7388059701492535, + "grad_norm": 0.2470705912133386, + "learning_rate": 1.8566602513759573e-06, + "loss": 0.36960500478744507, + "num_tokens": 452496914.0, + "step": 501 + }, + { + "epoch": 3.746268656716418, + "grad_norm": 0.2380353729045607, + "learning_rate": 1.8429036732046328e-06, + "loss": 0.3598456084728241, + "num_tokens": 453486873.0, + "step": 502 + }, + { + "epoch": 3.753731343283582, + "grad_norm": 0.24753875738528466, + "learning_rate": 1.8292470228070808e-06, + "loss": 0.3775923550128937, + "num_tokens": 454415514.0, + "step": 503 + }, + { + "epoch": 3.7611940298507465, + "grad_norm": 0.24852622318044526, + "learning_rate": 1.815690673342374e-06, + "loss": 0.377275288105011, + "num_tokens": 455330400.0, + "step": 504 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.24830439594342327, + "learning_rate": 1.8022349952289275e-06, + "loss": 0.3592768907546997, + "num_tokens": 456232858.0, + "step": 505 + }, + { + "epoch": 3.7761194029850746, + "grad_norm": 0.2661718758635726, + "learning_rate": 1.7888803561343755e-06, + "loss": 0.3917810320854187, + "num_tokens": 457091321.0, + "step": 506 + }, + { + "epoch": 3.783582089552239, + "grad_norm": 0.2652414658319871, + "learning_rate": 1.7756271209655296e-06, + "loss": 0.41377222537994385, + "num_tokens": 457990573.0, + "step": 507 + }, + { + "epoch": 3.791044776119403, + "grad_norm": 0.260047413567863, + "learning_rate": 1.7624756518584015e-06, + "loss": 0.3786197304725647, + "num_tokens": 458827375.0, + "step": 508 + }, + { + "epoch": 3.798507462686567, + "grad_norm": 0.24921975710502509, + "learning_rate": 1.7494263081683134e-06, + "loss": 0.36924827098846436, + "num_tokens": 459694321.0, + "step": 509 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.24376539520051552, + "learning_rate": 1.736479446460081e-06, + "loss": 0.3597017526626587, + "num_tokens": 460616396.0, + "step": 510 + }, + { + "epoch": 3.8134328358208958, + "grad_norm": 0.24365917664342365, + "learning_rate": 1.723635420498259e-06, + "loss": 0.36935943365097046, + "num_tokens": 461530829.0, + "step": 511 + }, + { + "epoch": 3.8208955223880596, + "grad_norm": 0.23932370443954964, + "learning_rate": 1.7108945812374874e-06, + "loss": 0.387093722820282, + "num_tokens": 462464505.0, + "step": 512 + }, + { + "epoch": 3.828358208955224, + "grad_norm": 0.257078997056124, + "learning_rate": 1.6982572768128964e-06, + "loss": 0.38530057668685913, + "num_tokens": 463398691.0, + "step": 513 + }, + { + "epoch": 3.835820895522388, + "grad_norm": 0.24718882255890465, + "learning_rate": 1.6857238525305924e-06, + "loss": 0.3774847388267517, + "num_tokens": 464295344.0, + "step": 514 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.23460337795500458, + "learning_rate": 1.6732946508582288e-06, + "loss": 0.3643302619457245, + "num_tokens": 465251963.0, + "step": 515 + }, + { + "epoch": 3.8507462686567164, + "grad_norm": 0.23769889055431628, + "learning_rate": 1.6609700114156368e-06, + "loss": 0.3710617423057556, + "num_tokens": 466213047.0, + "step": 516 + }, + { + "epoch": 3.8582089552238807, + "grad_norm": 0.23396843344896867, + "learning_rate": 1.6487502709655591e-06, + "loss": 0.382940411567688, + "num_tokens": 467245768.0, + "step": 517 + }, + { + "epoch": 3.8656716417910446, + "grad_norm": 0.23909686285484746, + "learning_rate": 1.6366357634044406e-06, + "loss": 0.3723403215408325, + "num_tokens": 468129089.0, + "step": 518 + }, + { + "epoch": 3.873134328358209, + "grad_norm": 0.2688981703218909, + "learning_rate": 1.6246268197533046e-06, + "loss": 0.3829047381877899, + "num_tokens": 468938058.0, + "step": 519 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.25519957310879615, + "learning_rate": 1.6127237681487096e-06, + "loss": 0.39446866512298584, + "num_tokens": 469847619.0, + "step": 520 + }, + { + "epoch": 3.888059701492537, + "grad_norm": 0.2486366229197261, + "learning_rate": 1.6009269338337832e-06, + "loss": 0.3983200788497925, + "num_tokens": 470791148.0, + "step": 521 + }, + { + "epoch": 3.8955223880597014, + "grad_norm": 0.24830658428540756, + "learning_rate": 1.5892366391493363e-06, + "loss": 0.38877153396606445, + "num_tokens": 471735636.0, + "step": 522 + }, + { + "epoch": 3.9029850746268657, + "grad_norm": 0.24923977507707654, + "learning_rate": 1.5776532035250513e-06, + "loss": 0.37799936532974243, + "num_tokens": 472685312.0, + "step": 523 + }, + { + "epoch": 3.91044776119403, + "grad_norm": 0.23192614084158372, + "learning_rate": 1.5661769434707585e-06, + "loss": 0.36345481872558594, + "num_tokens": 473551908.0, + "step": 524 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.2552136498693883, + "learning_rate": 1.5548081725677843e-06, + "loss": 0.38905611634254456, + "num_tokens": 474411763.0, + "step": 525 + }, + { + "epoch": 3.925373134328358, + "grad_norm": 0.24222236291728852, + "learning_rate": 1.543547201460384e-06, + "loss": 0.39437806606292725, + "num_tokens": 475386853.0, + "step": 526 + }, + { + "epoch": 3.9328358208955225, + "grad_norm": 0.24678201558159044, + "learning_rate": 1.5323943378472547e-06, + "loss": 0.38338255882263184, + "num_tokens": 476308351.0, + "step": 527 + }, + { + "epoch": 3.9402985074626864, + "grad_norm": 0.24156858852006619, + "learning_rate": 1.5213498864731266e-06, + "loss": 0.3475341796875, + "num_tokens": 477113932.0, + "step": 528 + }, + { + "epoch": 3.9477611940298507, + "grad_norm": 0.2450649252841632, + "learning_rate": 1.510414149120436e-06, + "loss": 0.3621699810028076, + "num_tokens": 477978986.0, + "step": 529 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.2615934671849586, + "learning_rate": 1.4995874246010778e-06, + "loss": 0.39790230989456177, + "num_tokens": 478804801.0, + "step": 530 + }, + { + "epoch": 3.9626865671641793, + "grad_norm": 0.23841246285008993, + "learning_rate": 1.4888700087482447e-06, + "loss": 0.36489465832710266, + "num_tokens": 479744154.0, + "step": 531 + }, + { + "epoch": 3.970149253731343, + "grad_norm": 0.23884234571084306, + "learning_rate": 1.4782621944083395e-06, + "loss": 0.3676777482032776, + "num_tokens": 480672910.0, + "step": 532 + }, + { + "epoch": 3.9776119402985075, + "grad_norm": 0.24521642019046497, + "learning_rate": 1.4677642714329772e-06, + "loss": 0.36571812629699707, + "num_tokens": 481542586.0, + "step": 533 + }, + { + "epoch": 3.9850746268656714, + "grad_norm": 0.2490357512875355, + "learning_rate": 1.45737652667106e-06, + "loss": 0.3776237964630127, + "num_tokens": 482388483.0, + "step": 534 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.26895614724288625, + "learning_rate": 1.4470992439609447e-06, + "loss": 0.36370331048965454, + "num_tokens": 483130281.0, + "step": 535 + }, + { + "epoch": 4.0, + "grad_norm": 0.23598448132329167, + "learning_rate": 1.4369327041226832e-06, + "loss": 0.3770376443862915, + "num_tokens": 484157211.0, + "step": 536 + }, + { + "epoch": 4.007462686567164, + "grad_norm": 0.2696832935027054, + "learning_rate": 1.4268771849503507e-06, + "loss": 0.3495013117790222, + "num_tokens": 484950425.0, + "step": 537 + }, + { + "epoch": 4.014925373134329, + "grad_norm": 0.2523061504872546, + "learning_rate": 1.416932961204457e-06, + "loss": 0.35033246874809265, + "num_tokens": 485897373.0, + "step": 538 + }, + { + "epoch": 4.022388059701493, + "grad_norm": 0.24871017609979634, + "learning_rate": 1.4071003046044324e-06, + "loss": 0.3654225468635559, + "num_tokens": 486751466.0, + "step": 539 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.23941923046022578, + "learning_rate": 1.3973794838212124e-06, + "loss": 0.36163097620010376, + "num_tokens": 487741373.0, + "step": 540 + }, + { + "epoch": 4.037313432835821, + "grad_norm": 0.2662894021736037, + "learning_rate": 1.3877707644698895e-06, + "loss": 0.3875274062156677, + "num_tokens": 488582397.0, + "step": 541 + }, + { + "epoch": 4.044776119402985, + "grad_norm": 0.2747015512526315, + "learning_rate": 1.3782744091024586e-06, + "loss": 0.3777075409889221, + "num_tokens": 489319854.0, + "step": 542 + }, + { + "epoch": 4.052238805970149, + "grad_norm": 0.2525490812172139, + "learning_rate": 1.3688906772006393e-06, + "loss": 0.36404550075531006, + "num_tokens": 490257709.0, + "step": 543 + }, + { + "epoch": 4.059701492537314, + "grad_norm": 0.24847635532681875, + "learning_rate": 1.359619825168792e-06, + "loss": 0.36995524168014526, + "num_tokens": 491153491.0, + "step": 544 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.23960840738937653, + "learning_rate": 1.3504621063269058e-06, + "loss": 0.36562579870224, + "num_tokens": 492103168.0, + "step": 545 + }, + { + "epoch": 4.074626865671641, + "grad_norm": 0.25453231573026114, + "learning_rate": 1.3414177709036802e-06, + "loss": 0.36385661363601685, + "num_tokens": 493050344.0, + "step": 546 + }, + { + "epoch": 4.082089552238806, + "grad_norm": 0.2409618977265192, + "learning_rate": 1.3324870660296869e-06, + "loss": 0.34029990434646606, + "num_tokens": 493993937.0, + "step": 547 + }, + { + "epoch": 4.08955223880597, + "grad_norm": 0.23732030399559195, + "learning_rate": 1.3236702357306157e-06, + "loss": 0.37044817209243774, + "num_tokens": 494995752.0, + "step": 548 + }, + { + "epoch": 4.097014925373134, + "grad_norm": 0.27145957936067777, + "learning_rate": 1.3149675209206086e-06, + "loss": 0.36308181285858154, + "num_tokens": 495757177.0, + "step": 549 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.2833034141091316, + "learning_rate": 1.3063791593956758e-06, + "loss": 0.37331539392471313, + "num_tokens": 496689668.0, + "step": 550 + }, + { + "epoch": 4.111940298507463, + "grad_norm": 0.240823460931463, + "learning_rate": 1.2979053858271995e-06, + "loss": 0.36020007729530334, + "num_tokens": 497565858.0, + "step": 551 + }, + { + "epoch": 4.119402985074627, + "grad_norm": 0.2594604561644764, + "learning_rate": 1.2895464317555206e-06, + "loss": 0.3884323239326477, + "num_tokens": 498385563.0, + "step": 552 + }, + { + "epoch": 4.126865671641791, + "grad_norm": 0.23073517132438157, + "learning_rate": 1.2813025255836104e-06, + "loss": 0.349163293838501, + "num_tokens": 499323100.0, + "step": 553 + }, + { + "epoch": 4.134328358208955, + "grad_norm": 0.2603679958630556, + "learning_rate": 1.2731738925708328e-06, + "loss": 0.36741840839385986, + "num_tokens": 500196622.0, + "step": 554 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.24326119145979633, + "learning_rate": 1.2651607548267873e-06, + "loss": 0.3810882568359375, + "num_tokens": 501224710.0, + "step": 555 + }, + { + "epoch": 4.149253731343284, + "grad_norm": 0.22934377798425087, + "learning_rate": 1.257263331305241e-06, + "loss": 0.37762486934661865, + "num_tokens": 502305655.0, + "step": 556 + }, + { + "epoch": 4.156716417910448, + "grad_norm": 0.2399419262393838, + "learning_rate": 1.249481837798144e-06, + "loss": 0.360861212015152, + "num_tokens": 503186087.0, + "step": 557 + }, + { + "epoch": 4.164179104477612, + "grad_norm": 0.2356017748084062, + "learning_rate": 1.2418164869297353e-06, + "loss": 0.36369866132736206, + "num_tokens": 504097376.0, + "step": 558 + }, + { + "epoch": 4.1716417910447765, + "grad_norm": 0.239368624704367, + "learning_rate": 1.2342674881507327e-06, + "loss": 0.36475175619125366, + "num_tokens": 505048926.0, + "step": 559 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.24555194813944806, + "learning_rate": 1.2268350477326073e-06, + "loss": 0.3852774500846863, + "num_tokens": 505967694.0, + "step": 560 + }, + { + "epoch": 4.186567164179104, + "grad_norm": 0.24385261062576613, + "learning_rate": 1.2195193687619505e-06, + "loss": 0.3750133812427521, + "num_tokens": 506924348.0, + "step": 561 + }, + { + "epoch": 4.1940298507462686, + "grad_norm": 0.24733441550806298, + "learning_rate": 1.2123206511349212e-06, + "loss": 0.36548683047294617, + "num_tokens": 507837247.0, + "step": 562 + }, + { + "epoch": 4.201492537313433, + "grad_norm": 0.2626516276894915, + "learning_rate": 1.2052390915517881e-06, + "loss": 0.36941125988960266, + "num_tokens": 508615951.0, + "step": 563 + }, + { + "epoch": 4.208955223880597, + "grad_norm": 0.24609691004441409, + "learning_rate": 1.1982748835115512e-06, + "loss": 0.3862428665161133, + "num_tokens": 509598473.0, + "step": 564 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.24842515895556683, + "learning_rate": 1.1914282173066574e-06, + "loss": 0.38270822167396545, + "num_tokens": 510499495.0, + "step": 565 + }, + { + "epoch": 4.223880597014926, + "grad_norm": 0.2407337171765148, + "learning_rate": 1.1846992800177979e-06, + "loss": 0.3664012551307678, + "num_tokens": 511393216.0, + "step": 566 + }, + { + "epoch": 4.231343283582089, + "grad_norm": 0.2442416141258047, + "learning_rate": 1.1780882555087988e-06, + "loss": 0.3886314034461975, + "num_tokens": 512343363.0, + "step": 567 + }, + { + "epoch": 4.2388059701492535, + "grad_norm": 0.2577619381883818, + "learning_rate": 1.1715953244215964e-06, + "loss": 0.3437773585319519, + "num_tokens": 513127609.0, + "step": 568 + }, + { + "epoch": 4.246268656716418, + "grad_norm": 0.25087871697950354, + "learning_rate": 1.165220664171302e-06, + "loss": 0.3734786808490753, + "num_tokens": 514033936.0, + "step": 569 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.2392856334846873, + "learning_rate": 1.1589644489413516e-06, + "loss": 0.35015231370925903, + "num_tokens": 514934044.0, + "step": 570 + }, + { + "epoch": 4.2611940298507465, + "grad_norm": 0.23533059380991045, + "learning_rate": 1.1528268496787498e-06, + "loss": 0.3818935453891754, + "num_tokens": 515909265.0, + "step": 571 + }, + { + "epoch": 4.268656716417911, + "grad_norm": 0.28002873497751246, + "learning_rate": 1.1468080340893958e-06, + "loss": 0.3613874316215515, + "num_tokens": 516712628.0, + "step": 572 + }, + { + "epoch": 4.276119402985074, + "grad_norm": 0.26573428139291055, + "learning_rate": 1.1409081666335035e-06, + "loss": 0.40466490387916565, + "num_tokens": 517664539.0, + "step": 573 + }, + { + "epoch": 4.2835820895522385, + "grad_norm": 0.2622221544713941, + "learning_rate": 1.1351274085211068e-06, + "loss": 0.36875689029693604, + "num_tokens": 518492097.0, + "step": 574 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.8295997519231081, + "learning_rate": 1.1294659177076523e-06, + "loss": 0.343036413192749, + "num_tokens": 519432536.0, + "step": 575 + }, + { + "epoch": 4.298507462686567, + "grad_norm": 0.26477934459538893, + "learning_rate": 1.1239238488896875e-06, + "loss": 0.39276033639907837, + "num_tokens": 520276253.0, + "step": 576 + }, + { + "epoch": 4.3059701492537314, + "grad_norm": 0.2751291575165678, + "learning_rate": 1.118501353500631e-06, + "loss": 0.36554020643234253, + "num_tokens": 521085557.0, + "step": 577 + }, + { + "epoch": 4.313432835820896, + "grad_norm": 0.26704770077542006, + "learning_rate": 1.1131985797066364e-06, + "loss": 0.39840590953826904, + "num_tokens": 521915761.0, + "step": 578 + }, + { + "epoch": 4.32089552238806, + "grad_norm": 0.267325084112826, + "learning_rate": 1.1080156724025409e-06, + "loss": 0.3594783842563629, + "num_tokens": 522783342.0, + "step": 579 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.23810536176661679, + "learning_rate": 1.1029527732079084e-06, + "loss": 0.37440672516822815, + "num_tokens": 523807264.0, + "step": 580 + }, + { + "epoch": 4.335820895522388, + "grad_norm": 0.27369911060242186, + "learning_rate": 1.0980100204631604e-06, + "loss": 0.40351587533950806, + "num_tokens": 524601938.0, + "step": 581 + }, + { + "epoch": 4.343283582089552, + "grad_norm": 0.23536111609123755, + "learning_rate": 1.0931875492257946e-06, + "loss": 0.33745962381362915, + "num_tokens": 525537212.0, + "step": 582 + }, + { + "epoch": 4.350746268656716, + "grad_norm": 0.2600131581237491, + "learning_rate": 1.088485491266694e-06, + "loss": 0.38494178652763367, + "num_tokens": 526347121.0, + "step": 583 + }, + { + "epoch": 4.358208955223881, + "grad_norm": 0.23219951832538527, + "learning_rate": 1.0839039750665292e-06, + "loss": 0.35427361726760864, + "num_tokens": 527281437.0, + "step": 584 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.2489391057817072, + "learning_rate": 1.079443125812243e-06, + "loss": 0.3624609708786011, + "num_tokens": 528208071.0, + "step": 585 + }, + { + "epoch": 4.373134328358209, + "grad_norm": 0.2539695897127002, + "learning_rate": 1.0751030653936356e-06, + "loss": 0.3747778534889221, + "num_tokens": 529032089.0, + "step": 586 + }, + { + "epoch": 4.380597014925373, + "grad_norm": 0.2499880144186626, + "learning_rate": 1.0708839124000287e-06, + "loss": 0.38273054361343384, + "num_tokens": 529947287.0, + "step": 587 + }, + { + "epoch": 4.388059701492537, + "grad_norm": 0.2506974248310357, + "learning_rate": 1.0667857821170282e-06, + "loss": 0.3470362424850464, + "num_tokens": 530728896.0, + "step": 588 + }, + { + "epoch": 4.395522388059701, + "grad_norm": 0.24506418459436066, + "learning_rate": 1.0628087865233737e-06, + "loss": 0.35882338881492615, + "num_tokens": 531620091.0, + "step": 589 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.24329483114740325, + "learning_rate": 1.058953034287877e-06, + "loss": 0.37174564599990845, + "num_tokens": 532460579.0, + "step": 590 + }, + { + "epoch": 4.41044776119403, + "grad_norm": 0.23831984993388738, + "learning_rate": 1.0552186307664567e-06, + "loss": 0.363148033618927, + "num_tokens": 533351390.0, + "step": 591 + }, + { + "epoch": 4.417910447761194, + "grad_norm": 0.26162136426743393, + "learning_rate": 1.0516056779992543e-06, + "loss": 0.38013726472854614, + "num_tokens": 534195605.0, + "step": 592 + }, + { + "epoch": 4.425373134328359, + "grad_norm": 0.2635745464481523, + "learning_rate": 1.0481142747078494e-06, + "loss": 0.3700369596481323, + "num_tokens": 535033541.0, + "step": 593 + }, + { + "epoch": 4.432835820895522, + "grad_norm": 0.25007207032778783, + "learning_rate": 1.0447445162925614e-06, + "loss": 0.3790166974067688, + "num_tokens": 535964895.0, + "step": 594 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.22799545701890034, + "learning_rate": 1.0414964948298436e-06, + "loss": 0.36508986353874207, + "num_tokens": 536941184.0, + "step": 595 + }, + { + "epoch": 4.447761194029851, + "grad_norm": 0.23265306394886567, + "learning_rate": 1.0383702990697657e-06, + "loss": 0.3546326160430908, + "num_tokens": 537896596.0, + "step": 596 + }, + { + "epoch": 4.455223880597015, + "grad_norm": 0.2452826212608677, + "learning_rate": 1.0353660144335892e-06, + "loss": 0.3647281229496002, + "num_tokens": 538748931.0, + "step": 597 + }, + { + "epoch": 4.462686567164179, + "grad_norm": 0.24623855227956742, + "learning_rate": 1.0324837230114332e-06, + "loss": 0.3664322793483734, + "num_tokens": 539622406.0, + "step": 598 + }, + { + "epoch": 4.470149253731344, + "grad_norm": 0.24476867667376634, + "learning_rate": 1.0297235035600337e-06, + "loss": 0.35626494884490967, + "num_tokens": 540561688.0, + "step": 599 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.22411638197357536, + "learning_rate": 1.0270854315005874e-06, + "loss": 0.3493247628211975, + "num_tokens": 541498885.0, + "step": 600 + }, + { + "epoch": 4.485074626865671, + "grad_norm": 0.23854702147816884, + "learning_rate": 1.024569578916695e-06, + "loss": 0.36460673809051514, + "num_tokens": 542468798.0, + "step": 601 + }, + { + "epoch": 4.492537313432836, + "grad_norm": 0.24473776240066009, + "learning_rate": 1.0221760145523876e-06, + "loss": 0.3664558529853821, + "num_tokens": 543354992.0, + "step": 602 + }, + { + "epoch": 4.5, + "grad_norm": 0.3484100772975978, + "learning_rate": 1.0199048038102528e-06, + "loss": 0.3781493902206421, + "num_tokens": 544264190.0, + "step": 603 + }, + { + "epoch": 4.507462686567164, + "grad_norm": 0.23041088788536823, + "learning_rate": 1.0177560087496425e-06, + "loss": 0.36557939648628235, + "num_tokens": 545199765.0, + "step": 604 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.26397201636028744, + "learning_rate": 1.0157296880849826e-06, + "loss": 0.39719897508621216, + "num_tokens": 546061065.0, + "step": 605 + }, + { + "epoch": 4.522388059701493, + "grad_norm": 0.2510378043077616, + "learning_rate": 1.0138258971841642e-06, + "loss": 0.3602595925331116, + "num_tokens": 546928816.0, + "step": 606 + }, + { + "epoch": 4.529850746268656, + "grad_norm": 0.25217406420558186, + "learning_rate": 1.0120446880670326e-06, + "loss": 0.3766353130340576, + "num_tokens": 547847934.0, + "step": 607 + }, + { + "epoch": 4.537313432835821, + "grad_norm": 0.23959568238841403, + "learning_rate": 1.010386109403967e-06, + "loss": 0.3650025725364685, + "num_tokens": 548766636.0, + "step": 608 + }, + { + "epoch": 4.544776119402985, + "grad_norm": 0.2377901920772251, + "learning_rate": 1.008850206514547e-06, + "loss": 0.3625343143939972, + "num_tokens": 549661389.0, + "step": 609 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.26122470845807755, + "learning_rate": 1.0074370213663202e-06, + "loss": 0.3682940602302551, + "num_tokens": 550430887.0, + "step": 610 + }, + { + "epoch": 4.559701492537314, + "grad_norm": 0.2481365703161649, + "learning_rate": 1.0061465925736478e-06, + "loss": 0.36531317234039307, + "num_tokens": 551293916.0, + "step": 611 + }, + { + "epoch": 4.567164179104478, + "grad_norm": 0.23719670021949013, + "learning_rate": 1.004978955396657e-06, + "loss": 0.3669975996017456, + "num_tokens": 552281926.0, + "step": 612 + }, + { + "epoch": 4.574626865671641, + "grad_norm": 0.25803252973725255, + "learning_rate": 1.0039341417402715e-06, + "loss": 0.37066352367401123, + "num_tokens": 553148975.0, + "step": 613 + }, + { + "epoch": 4.582089552238806, + "grad_norm": 0.2476936983459798, + "learning_rate": 1.0030121801533442e-06, + "loss": 0.3824441134929657, + "num_tokens": 554068576.0, + "step": 614 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.2489594826146839, + "learning_rate": 1.002213095827875e-06, + "loss": 0.3596557378768921, + "num_tokens": 554855138.0, + "step": 615 + }, + { + "epoch": 4.597014925373134, + "grad_norm": 0.2550266059020853, + "learning_rate": 1.0015369105983218e-06, + "loss": 0.34850555658340454, + "num_tokens": 555783649.0, + "step": 616 + }, + { + "epoch": 4.604477611940299, + "grad_norm": 0.28933444541800885, + "learning_rate": 1.0009836429410053e-06, + "loss": 0.3593859076499939, + "num_tokens": 556756059.0, + "step": 617 + }, + { + "epoch": 4.611940298507463, + "grad_norm": 0.24100103005251267, + "learning_rate": 1.0005533079736037e-06, + "loss": 0.34157663583755493, + "num_tokens": 557624997.0, + "step": 618 + }, + { + "epoch": 4.619402985074627, + "grad_norm": 0.2434497947580223, + "learning_rate": 1.00024591745474e-06, + "loss": 0.35940393805503845, + "num_tokens": 558551462.0, + "step": 619 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.2334659825308566, + "learning_rate": 1.0000614797836587e-06, + "loss": 0.3954239785671234, + "num_tokens": 559571713.0, + "step": 620 + }, + { + "epoch": 4.6268656716417915, + "step": 620, + "total_flos": 829937030004736.0, + "train_loss": 0.4202386662844689, + "train_runtime": 18585.0074, + "train_samples_per_second": 1.068, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1, + "max_steps": 620, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 62, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 829937030004736.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..6a903ab --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eec048e628a653c916a3717a9f2d295434c00bf772eb742f14f7f0d21a36376 +size 7633