commit b4815209db76d640b1978dab96e64917329ff4b1 Author: ModelHub XC Date: Wed Jun 3 02:34:19 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: gshasiri/SmolLM3-Mid-Second-Round Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..01a5a52 --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +--- +base_model: gshasiri/SmolLM3-Mid +library_name: transformers +model_name: SmolLM3-Mid-Second-Round +tags: +- generated_from_trainer +- trl +- sft +- alignment-handbook +licence: license +--- + +# Model Card for SmolLM3-Mid-Second-Round + +This model is a fine-tuned version of [gshasiri/SmolLM3-Mid](https://huggingface.co/gshasiri/SmolLM3-Mid). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="gshasiri/SmolLM3-Mid-Second-Round", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/shamanework-pl/huggingface/runs/rs9eip31) + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.25.1 +- Transformers: 4.57.1 +- Pytorch: 2.6.0+cu126 +- Datasets: 4.4.1 +- Tokenizers: 0.22.1 + +## Citations + + + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..8ffa2a3 --- /dev/null +++ b/all_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 2.0, + "total_flos": 2.85249316814848e+16, + "train_loss": 0.8500257841618516, + "train_runtime": 594460.0391, + "train_samples": 4779894, + "train_samples_per_second": 3.624, + "train_steps_per_second": 0.028 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..a1ab1be --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,6 @@ +{%- for message in messages -%} + {{- "<|im_start|>" + message.role + "\n" + message.content + "<|im_end|>" + "\n" -}} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{- "<|im_start|>assistant\n" -}} +{%- endif -%} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..45c637a --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128012, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.57.1", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..e8c9e21 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128012, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.1" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..0889c09 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1cc94be3e66bbfa4d1e9016f48c17fb521735a89a209cf43bfa614442895e0d +size 2471645608 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..a24b1f3 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,16 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..f342589 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e7c979daf2c715603b21e094ce7e032280b007311a070cdf98ed708c492d614 +size 17209792 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..7aa903e --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128014": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128015": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128016": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128017": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128018": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|im_end|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..8ffa2a3 --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 2.0, + "total_flos": 2.85249316814848e+16, + "train_loss": 0.8500257841618516, + "train_runtime": 594460.0391, + "train_samples": 4779894, + "train_samples_per_second": 3.624, + "train_steps_per_second": 0.028 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..093f874 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,134683 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 16830, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00011883541295306001, + "grad_norm": 0.206886744291027, + "learning_rate": 0.0, + "loss": 1.0093, + "num_tokens": 4180563.0, + "step": 1 + }, + { + "epoch": 0.00023767082590612002, + "grad_norm": 0.20639821568636793, + "learning_rate": 3.960396039603961e-08, + "loss": 0.9724, + "num_tokens": 8370494.0, + "step": 2 + }, + { + "epoch": 0.00035650623885918, + "grad_norm": 0.21176249644883066, + "learning_rate": 7.920792079207921e-08, + "loss": 1.0189, + "num_tokens": 12559168.0, + "step": 3 + }, + { + "epoch": 0.00047534165181224003, + "grad_norm": 0.21797003738748602, + "learning_rate": 1.1881188118811883e-07, + "loss": 1.0364, + "num_tokens": 16726606.0, + "step": 4 + }, + { + "epoch": 0.0005941770647653001, + "grad_norm": 0.2079646306036497, + "learning_rate": 1.5841584158415843e-07, + "loss": 0.9931, + "num_tokens": 20909336.0, + "step": 5 + }, + { + "epoch": 0.00071301247771836, + "grad_norm": 0.21655916119580834, + "learning_rate": 1.9801980198019803e-07, + "loss": 0.9919, + "num_tokens": 25097878.0, + "step": 6 + }, + { + "epoch": 0.0008318478906714201, + "grad_norm": 0.20783485508367122, + "learning_rate": 2.3762376237623766e-07, + "loss": 1.0331, + "num_tokens": 29287351.0, + "step": 7 + }, + { + "epoch": 0.0009506833036244801, + "grad_norm": 0.20884755137926633, + "learning_rate": 2.7722772277227726e-07, + "loss": 1.0278, + "num_tokens": 33474708.0, + "step": 8 + }, + { + "epoch": 0.0010695187165775401, + "grad_norm": 0.20613489143267122, + "learning_rate": 3.1683168316831686e-07, + "loss": 1.0045, + "num_tokens": 37663506.0, + "step": 9 + }, + { + "epoch": 0.0011883541295306002, + "grad_norm": 0.20838075507916007, + "learning_rate": 3.5643564356435646e-07, + "loss": 1.0358, + "num_tokens": 41852551.0, + "step": 10 + }, + { + "epoch": 0.00130718954248366, + "grad_norm": 0.20887624893192688, + "learning_rate": 3.9603960396039606e-07, + "loss": 0.9962, + "num_tokens": 46029460.0, + "step": 11 + }, + { + "epoch": 0.00142602495543672, + "grad_norm": 0.21519909405373167, + "learning_rate": 4.3564356435643566e-07, + "loss": 1.0067, + "num_tokens": 50219852.0, + "step": 12 + }, + { + "epoch": 0.0015448603683897802, + "grad_norm": 0.20877772629212457, + "learning_rate": 4.752475247524753e-07, + "loss": 0.9946, + "num_tokens": 54338743.0, + "step": 13 + }, + { + "epoch": 0.0016636957813428402, + "grad_norm": 0.2114727192294147, + "learning_rate": 5.148514851485149e-07, + "loss": 1.0003, + "num_tokens": 58512275.0, + "step": 14 + }, + { + "epoch": 0.0017825311942959, + "grad_norm": 0.2117918175510505, + "learning_rate": 5.544554455445545e-07, + "loss": 1.0208, + "num_tokens": 62677046.0, + "step": 15 + }, + { + "epoch": 0.0019013666072489601, + "grad_norm": 0.2122421235048402, + "learning_rate": 5.940594059405941e-07, + "loss": 1.0366, + "num_tokens": 66812104.0, + "step": 16 + }, + { + "epoch": 0.00202020202020202, + "grad_norm": 0.20816339402853784, + "learning_rate": 6.336633663366337e-07, + "loss": 0.9805, + "num_tokens": 70956773.0, + "step": 17 + }, + { + "epoch": 0.0021390374331550803, + "grad_norm": 0.20907910797861032, + "learning_rate": 6.732673267326734e-07, + "loss": 1.0015, + "num_tokens": 75147491.0, + "step": 18 + }, + { + "epoch": 0.0022578728461081403, + "grad_norm": 0.20837494657331002, + "learning_rate": 7.128712871287129e-07, + "loss": 1.0353, + "num_tokens": 79311111.0, + "step": 19 + }, + { + "epoch": 0.0023767082590612004, + "grad_norm": 0.21418718017214491, + "learning_rate": 7.524752475247525e-07, + "loss": 1.0165, + "num_tokens": 83469191.0, + "step": 20 + }, + { + "epoch": 0.0024955436720142605, + "grad_norm": 0.21586939101224675, + "learning_rate": 7.920792079207921e-07, + "loss": 1.0271, + "num_tokens": 87658665.0, + "step": 21 + }, + { + "epoch": 0.00261437908496732, + "grad_norm": 0.2165727555851177, + "learning_rate": 8.316831683168318e-07, + "loss": 1.0047, + "num_tokens": 91847475.0, + "step": 22 + }, + { + "epoch": 0.00273321449792038, + "grad_norm": 0.22282086398107254, + "learning_rate": 8.712871287128713e-07, + "loss": 1.0434, + "num_tokens": 96037750.0, + "step": 23 + }, + { + "epoch": 0.00285204991087344, + "grad_norm": 0.21065052236070922, + "learning_rate": 9.10891089108911e-07, + "loss": 0.9913, + "num_tokens": 100214773.0, + "step": 24 + }, + { + "epoch": 0.0029708853238265003, + "grad_norm": 0.22126154125001418, + "learning_rate": 9.504950495049506e-07, + "loss": 0.9948, + "num_tokens": 104396675.0, + "step": 25 + }, + { + "epoch": 0.0030897207367795603, + "grad_norm": 0.21355756130489525, + "learning_rate": 9.900990099009902e-07, + "loss": 1.0007, + "num_tokens": 108585850.0, + "step": 26 + }, + { + "epoch": 0.0032085561497326204, + "grad_norm": 0.22573959379060807, + "learning_rate": 1.0297029702970297e-06, + "loss": 1.0343, + "num_tokens": 112747701.0, + "step": 27 + }, + { + "epoch": 0.0033273915626856805, + "grad_norm": 0.22286738505225148, + "learning_rate": 1.0693069306930693e-06, + "loss": 1.0198, + "num_tokens": 116913858.0, + "step": 28 + }, + { + "epoch": 0.0034462269756387405, + "grad_norm": 0.22273613783374246, + "learning_rate": 1.108910891089109e-06, + "loss": 0.996, + "num_tokens": 121101848.0, + "step": 29 + }, + { + "epoch": 0.0035650623885918, + "grad_norm": 0.22246689485095852, + "learning_rate": 1.1485148514851486e-06, + "loss": 1.0146, + "num_tokens": 125283500.0, + "step": 30 + }, + { + "epoch": 0.00368389780154486, + "grad_norm": 0.23347694173737266, + "learning_rate": 1.1881188118811881e-06, + "loss": 1.0079, + "num_tokens": 129473295.0, + "step": 31 + }, + { + "epoch": 0.0038027332144979203, + "grad_norm": 0.21309929782004863, + "learning_rate": 1.2277227722772279e-06, + "loss": 1.0035, + "num_tokens": 133663892.0, + "step": 32 + }, + { + "epoch": 0.00392156862745098, + "grad_norm": 0.22141760041776753, + "learning_rate": 1.2673267326732674e-06, + "loss": 1.0325, + "num_tokens": 137834036.0, + "step": 33 + }, + { + "epoch": 0.00404040404040404, + "grad_norm": 0.22569225622717465, + "learning_rate": 1.3069306930693072e-06, + "loss": 1.0225, + "num_tokens": 142022234.0, + "step": 34 + }, + { + "epoch": 0.0041592394533571005, + "grad_norm": 0.2217284209428393, + "learning_rate": 1.3465346534653467e-06, + "loss": 1.0355, + "num_tokens": 146210698.0, + "step": 35 + }, + { + "epoch": 0.0042780748663101605, + "grad_norm": 0.22940969111796924, + "learning_rate": 1.3861386138613863e-06, + "loss": 1.0225, + "num_tokens": 150400377.0, + "step": 36 + }, + { + "epoch": 0.004396910279263221, + "grad_norm": 0.2293804419363582, + "learning_rate": 1.4257425742574258e-06, + "loss": 1.0318, + "num_tokens": 154581849.0, + "step": 37 + }, + { + "epoch": 0.004515745692216281, + "grad_norm": 0.23614259575508975, + "learning_rate": 1.4653465346534654e-06, + "loss": 1.0175, + "num_tokens": 158771120.0, + "step": 38 + }, + { + "epoch": 0.004634581105169341, + "grad_norm": 0.23211298612351547, + "learning_rate": 1.504950495049505e-06, + "loss": 0.9816, + "num_tokens": 162960410.0, + "step": 39 + }, + { + "epoch": 0.004753416518122401, + "grad_norm": 0.2339151109711963, + "learning_rate": 1.5445544554455447e-06, + "loss": 1.015, + "num_tokens": 167149159.0, + "step": 40 + }, + { + "epoch": 0.004872251931075461, + "grad_norm": 0.23028160380332297, + "learning_rate": 1.5841584158415842e-06, + "loss": 1.0106, + "num_tokens": 171315008.0, + "step": 41 + }, + { + "epoch": 0.004991087344028521, + "grad_norm": 0.22846894631375544, + "learning_rate": 1.623762376237624e-06, + "loss": 1.0287, + "num_tokens": 175499905.0, + "step": 42 + }, + { + "epoch": 0.00510992275698158, + "grad_norm": 0.24225185639441213, + "learning_rate": 1.6633663366336635e-06, + "loss": 1.0126, + "num_tokens": 179689550.0, + "step": 43 + }, + { + "epoch": 0.00522875816993464, + "grad_norm": 0.23763029756423795, + "learning_rate": 1.7029702970297033e-06, + "loss": 1.0154, + "num_tokens": 183878965.0, + "step": 44 + }, + { + "epoch": 0.0053475935828877, + "grad_norm": 0.23405999706073116, + "learning_rate": 1.7425742574257426e-06, + "loss": 1.0064, + "num_tokens": 188067708.0, + "step": 45 + }, + { + "epoch": 0.00546642899584076, + "grad_norm": 0.23148831900037234, + "learning_rate": 1.7821782178217822e-06, + "loss": 1.0435, + "num_tokens": 192243651.0, + "step": 46 + }, + { + "epoch": 0.00558526440879382, + "grad_norm": 0.24368543664446649, + "learning_rate": 1.821782178217822e-06, + "loss": 1.0223, + "num_tokens": 196433733.0, + "step": 47 + }, + { + "epoch": 0.00570409982174688, + "grad_norm": 0.2478285097193804, + "learning_rate": 1.8613861386138615e-06, + "loss": 1.0381, + "num_tokens": 200623526.0, + "step": 48 + }, + { + "epoch": 0.0058229352346999405, + "grad_norm": 0.2379535584340292, + "learning_rate": 1.9009900990099013e-06, + "loss": 1.0076, + "num_tokens": 204813333.0, + "step": 49 + }, + { + "epoch": 0.0059417706476530005, + "grad_norm": 0.25557206511039776, + "learning_rate": 1.940594059405941e-06, + "loss": 1.0088, + "num_tokens": 208975383.0, + "step": 50 + }, + { + "epoch": 0.006060606060606061, + "grad_norm": 0.26103007061709144, + "learning_rate": 1.9801980198019803e-06, + "loss": 1.0362, + "num_tokens": 213166951.0, + "step": 51 + }, + { + "epoch": 0.006179441473559121, + "grad_norm": 0.24412063771479792, + "learning_rate": 2.01980198019802e-06, + "loss": 1.0084, + "num_tokens": 217355561.0, + "step": 52 + }, + { + "epoch": 0.006298276886512181, + "grad_norm": 0.23709465634955332, + "learning_rate": 2.0594059405940594e-06, + "loss": 1.0317, + "num_tokens": 221544680.0, + "step": 53 + }, + { + "epoch": 0.006417112299465241, + "grad_norm": 0.27098588414325625, + "learning_rate": 2.099009900990099e-06, + "loss": 1.0243, + "num_tokens": 225735101.0, + "step": 54 + }, + { + "epoch": 0.006535947712418301, + "grad_norm": 0.25532134680813146, + "learning_rate": 2.1386138613861385e-06, + "loss": 1.0422, + "num_tokens": 229841736.0, + "step": 55 + }, + { + "epoch": 0.006654783125371361, + "grad_norm": 0.24581036911081983, + "learning_rate": 2.1782178217821785e-06, + "loss": 1.0335, + "num_tokens": 234032283.0, + "step": 56 + }, + { + "epoch": 0.006773618538324421, + "grad_norm": 0.26088816418807864, + "learning_rate": 2.217821782178218e-06, + "loss": 1.0269, + "num_tokens": 238221216.0, + "step": 57 + }, + { + "epoch": 0.006892453951277481, + "grad_norm": 0.2442372055543288, + "learning_rate": 2.2574257425742576e-06, + "loss": 1.0276, + "num_tokens": 242389546.0, + "step": 58 + }, + { + "epoch": 0.007011289364230541, + "grad_norm": 0.27075813135161636, + "learning_rate": 2.297029702970297e-06, + "loss": 1.0199, + "num_tokens": 246578657.0, + "step": 59 + }, + { + "epoch": 0.0071301247771836, + "grad_norm": 0.2800399277324682, + "learning_rate": 2.336633663366337e-06, + "loss": 1.0069, + "num_tokens": 250767574.0, + "step": 60 + }, + { + "epoch": 0.00724896019013666, + "grad_norm": 0.24144789423570256, + "learning_rate": 2.3762376237623762e-06, + "loss": 1.0241, + "num_tokens": 254957351.0, + "step": 61 + }, + { + "epoch": 0.00736779560308972, + "grad_norm": 0.2514528757316017, + "learning_rate": 2.415841584158416e-06, + "loss": 0.9979, + "num_tokens": 259148213.0, + "step": 62 + }, + { + "epoch": 0.0074866310160427805, + "grad_norm": 0.2863020556075366, + "learning_rate": 2.4554455445544558e-06, + "loss": 1.0545, + "num_tokens": 263337990.0, + "step": 63 + }, + { + "epoch": 0.0076054664289958405, + "grad_norm": 0.27302569909231783, + "learning_rate": 2.4950495049504953e-06, + "loss": 0.9801, + "num_tokens": 267514951.0, + "step": 64 + }, + { + "epoch": 0.007724301841948901, + "grad_norm": 0.26279799337693327, + "learning_rate": 2.534653465346535e-06, + "loss": 1.027, + "num_tokens": 271678857.0, + "step": 65 + }, + { + "epoch": 0.00784313725490196, + "grad_norm": 0.2654566611463024, + "learning_rate": 2.5742574257425744e-06, + "loss": 1.0312, + "num_tokens": 275835527.0, + "step": 66 + }, + { + "epoch": 0.00796197266785502, + "grad_norm": 0.27807560351128224, + "learning_rate": 2.6138613861386144e-06, + "loss": 1.0324, + "num_tokens": 279993283.0, + "step": 67 + }, + { + "epoch": 0.00808080808080808, + "grad_norm": 0.269800643267211, + "learning_rate": 2.653465346534654e-06, + "loss": 0.9797, + "num_tokens": 284157761.0, + "step": 68 + }, + { + "epoch": 0.00819964349376114, + "grad_norm": 0.27082987223950306, + "learning_rate": 2.6930693069306935e-06, + "loss": 1.03, + "num_tokens": 288346544.0, + "step": 69 + }, + { + "epoch": 0.008318478906714201, + "grad_norm": 0.2540668634477303, + "learning_rate": 2.732673267326733e-06, + "loss": 1.0021, + "num_tokens": 292528763.0, + "step": 70 + }, + { + "epoch": 0.008437314319667261, + "grad_norm": 0.28263275844921354, + "learning_rate": 2.7722772277227726e-06, + "loss": 1.0125, + "num_tokens": 296717395.0, + "step": 71 + }, + { + "epoch": 0.008556149732620321, + "grad_norm": 0.2647856038901977, + "learning_rate": 2.8118811881188125e-06, + "loss": 0.986, + "num_tokens": 300906709.0, + "step": 72 + }, + { + "epoch": 0.008674985145573381, + "grad_norm": 0.2529821279959578, + "learning_rate": 2.8514851485148517e-06, + "loss": 1.0527, + "num_tokens": 305097160.0, + "step": 73 + }, + { + "epoch": 0.008793820558526441, + "grad_norm": 0.2646181165517474, + "learning_rate": 2.8910891089108912e-06, + "loss": 0.9943, + "num_tokens": 309284945.0, + "step": 74 + }, + { + "epoch": 0.008912655971479501, + "grad_norm": 0.26959702784291356, + "learning_rate": 2.9306930693069308e-06, + "loss": 0.9812, + "num_tokens": 313473221.0, + "step": 75 + }, + { + "epoch": 0.009031491384432561, + "grad_norm": 0.3138808413577322, + "learning_rate": 2.9702970297029703e-06, + "loss": 0.9813, + "num_tokens": 317662606.0, + "step": 76 + }, + { + "epoch": 0.009150326797385621, + "grad_norm": 0.28408517876330386, + "learning_rate": 3.00990099009901e-06, + "loss": 1.032, + "num_tokens": 321852483.0, + "step": 77 + }, + { + "epoch": 0.009269162210338681, + "grad_norm": 0.2868438213538743, + "learning_rate": 3.04950495049505e-06, + "loss": 1.0378, + "num_tokens": 325987809.0, + "step": 78 + }, + { + "epoch": 0.009387997623291741, + "grad_norm": 0.29157050535581663, + "learning_rate": 3.0891089108910894e-06, + "loss": 1.0647, + "num_tokens": 330144622.0, + "step": 79 + }, + { + "epoch": 0.009506833036244802, + "grad_norm": 0.3001782630435508, + "learning_rate": 3.128712871287129e-06, + "loss": 1.0182, + "num_tokens": 334331207.0, + "step": 80 + }, + { + "epoch": 0.009625668449197862, + "grad_norm": 0.291465235364328, + "learning_rate": 3.1683168316831685e-06, + "loss": 1.0163, + "num_tokens": 338490797.0, + "step": 81 + }, + { + "epoch": 0.009744503862150922, + "grad_norm": 0.33509397522012213, + "learning_rate": 3.2079207920792084e-06, + "loss": 1.0223, + "num_tokens": 342679983.0, + "step": 82 + }, + { + "epoch": 0.009863339275103982, + "grad_norm": 0.29561346765607516, + "learning_rate": 3.247524752475248e-06, + "loss": 1.0194, + "num_tokens": 346869205.0, + "step": 83 + }, + { + "epoch": 0.009982174688057042, + "grad_norm": 0.3066379714562275, + "learning_rate": 3.2871287128712875e-06, + "loss": 0.9991, + "num_tokens": 351057230.0, + "step": 84 + }, + { + "epoch": 0.010101010101010102, + "grad_norm": 0.2775624967671524, + "learning_rate": 3.326732673267327e-06, + "loss": 1.0062, + "num_tokens": 355245512.0, + "step": 85 + }, + { + "epoch": 0.01021984551396316, + "grad_norm": 0.3531819398272045, + "learning_rate": 3.3663366336633666e-06, + "loss": 0.9892, + "num_tokens": 359434144.0, + "step": 86 + }, + { + "epoch": 0.01033868092691622, + "grad_norm": 0.27853640749796726, + "learning_rate": 3.4059405940594066e-06, + "loss": 1.0639, + "num_tokens": 363610239.0, + "step": 87 + }, + { + "epoch": 0.01045751633986928, + "grad_norm": 0.32156008818707293, + "learning_rate": 3.4455445544554457e-06, + "loss": 1.0112, + "num_tokens": 367788486.0, + "step": 88 + }, + { + "epoch": 0.01057635175282234, + "grad_norm": 0.2744051703450598, + "learning_rate": 3.4851485148514853e-06, + "loss": 1.0364, + "num_tokens": 371971558.0, + "step": 89 + }, + { + "epoch": 0.0106951871657754, + "grad_norm": 0.37663968956319455, + "learning_rate": 3.524752475247525e-06, + "loss": 0.9791, + "num_tokens": 376161770.0, + "step": 90 + }, + { + "epoch": 0.01081402257872846, + "grad_norm": 0.31274199290462107, + "learning_rate": 3.5643564356435644e-06, + "loss": 1.0178, + "num_tokens": 380348189.0, + "step": 91 + }, + { + "epoch": 0.01093285799168152, + "grad_norm": 0.3782840014730868, + "learning_rate": 3.603960396039604e-06, + "loss": 1.0056, + "num_tokens": 384501267.0, + "step": 92 + }, + { + "epoch": 0.01105169340463458, + "grad_norm": 0.3180513047204037, + "learning_rate": 3.643564356435644e-06, + "loss": 1.0379, + "num_tokens": 388674195.0, + "step": 93 + }, + { + "epoch": 0.01117052881758764, + "grad_norm": 0.2885332030887668, + "learning_rate": 3.6831683168316834e-06, + "loss": 1.0185, + "num_tokens": 392854270.0, + "step": 94 + }, + { + "epoch": 0.0112893642305407, + "grad_norm": 0.31476074601872095, + "learning_rate": 3.722772277227723e-06, + "loss": 1.0073, + "num_tokens": 397044021.0, + "step": 95 + }, + { + "epoch": 0.01140819964349376, + "grad_norm": 0.3225728124537213, + "learning_rate": 3.7623762376237625e-06, + "loss": 1.03, + "num_tokens": 401217660.0, + "step": 96 + }, + { + "epoch": 0.011527035056446821, + "grad_norm": 0.3169891557623559, + "learning_rate": 3.8019801980198025e-06, + "loss": 1.0029, + "num_tokens": 405406938.0, + "step": 97 + }, + { + "epoch": 0.011645870469399881, + "grad_norm": 0.3181878175947025, + "learning_rate": 3.841584158415842e-06, + "loss": 1.0542, + "num_tokens": 409587258.0, + "step": 98 + }, + { + "epoch": 0.011764705882352941, + "grad_norm": 0.34238447929026283, + "learning_rate": 3.881188118811882e-06, + "loss": 1.0363, + "num_tokens": 413775442.0, + "step": 99 + }, + { + "epoch": 0.011883541295306001, + "grad_norm": 0.3515778112797284, + "learning_rate": 3.9207920792079216e-06, + "loss": 1.0035, + "num_tokens": 417964241.0, + "step": 100 + }, + { + "epoch": 0.012002376708259061, + "grad_norm": 0.3399832493969732, + "learning_rate": 3.960396039603961e-06, + "loss": 0.984, + "num_tokens": 422139615.0, + "step": 101 + }, + { + "epoch": 0.012121212121212121, + "grad_norm": 0.34316252795261887, + "learning_rate": 4.000000000000001e-06, + "loss": 1.0065, + "num_tokens": 426327760.0, + "step": 102 + }, + { + "epoch": 0.012240047534165181, + "grad_norm": 0.36528466048355646, + "learning_rate": 4.03960396039604e-06, + "loss": 1.0011, + "num_tokens": 430510321.0, + "step": 103 + }, + { + "epoch": 0.012358882947118241, + "grad_norm": 0.33686292280924085, + "learning_rate": 4.079207920792079e-06, + "loss": 0.9748, + "num_tokens": 434700400.0, + "step": 104 + }, + { + "epoch": 0.012477718360071301, + "grad_norm": 0.38124095675063846, + "learning_rate": 4.118811881188119e-06, + "loss": 1.0258, + "num_tokens": 438889548.0, + "step": 105 + }, + { + "epoch": 0.012596553773024361, + "grad_norm": 0.3570544225089495, + "learning_rate": 4.158415841584159e-06, + "loss": 0.9863, + "num_tokens": 443077149.0, + "step": 106 + }, + { + "epoch": 0.012715389185977422, + "grad_norm": 0.4506822319509594, + "learning_rate": 4.198019801980198e-06, + "loss": 1.0175, + "num_tokens": 447266048.0, + "step": 107 + }, + { + "epoch": 0.012834224598930482, + "grad_norm": 0.35437843124351215, + "learning_rate": 4.237623762376238e-06, + "loss": 1.0155, + "num_tokens": 451438688.0, + "step": 108 + }, + { + "epoch": 0.012953060011883542, + "grad_norm": 0.49026774204947215, + "learning_rate": 4.277227722772277e-06, + "loss": 1.0023, + "num_tokens": 455627000.0, + "step": 109 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 0.4104826358751632, + "learning_rate": 4.316831683168317e-06, + "loss": 0.9913, + "num_tokens": 459816596.0, + "step": 110 + }, + { + "epoch": 0.013190730837789662, + "grad_norm": 0.4835685163604746, + "learning_rate": 4.356435643564357e-06, + "loss": 1.0131, + "num_tokens": 463981553.0, + "step": 111 + }, + { + "epoch": 0.013309566250742722, + "grad_norm": 0.4381446355057508, + "learning_rate": 4.396039603960396e-06, + "loss": 1.0239, + "num_tokens": 468152592.0, + "step": 112 + }, + { + "epoch": 0.013428401663695782, + "grad_norm": 0.4745696033221631, + "learning_rate": 4.435643564356436e-06, + "loss": 1.0156, + "num_tokens": 472330760.0, + "step": 113 + }, + { + "epoch": 0.013547237076648842, + "grad_norm": 0.4144658106687397, + "learning_rate": 4.475247524752476e-06, + "loss": 1.0547, + "num_tokens": 476512013.0, + "step": 114 + }, + { + "epoch": 0.013666072489601902, + "grad_norm": 0.44143670217595543, + "learning_rate": 4.514851485148515e-06, + "loss": 1.0269, + "num_tokens": 480701056.0, + "step": 115 + }, + { + "epoch": 0.013784907902554962, + "grad_norm": 0.4655141700849995, + "learning_rate": 4.554455445544555e-06, + "loss": 1.0608, + "num_tokens": 484879058.0, + "step": 116 + }, + { + "epoch": 0.013903743315508022, + "grad_norm": 0.37130314071186993, + "learning_rate": 4.594059405940594e-06, + "loss": 1.0191, + "num_tokens": 489064608.0, + "step": 117 + }, + { + "epoch": 0.014022578728461082, + "grad_norm": 0.6275744846166159, + "learning_rate": 4.633663366336634e-06, + "loss": 1.0056, + "num_tokens": 493214868.0, + "step": 118 + }, + { + "epoch": 0.014141414141414142, + "grad_norm": 0.4037966924861907, + "learning_rate": 4.673267326732674e-06, + "loss": 0.9895, + "num_tokens": 497404152.0, + "step": 119 + }, + { + "epoch": 0.0142602495543672, + "grad_norm": 0.6595220092254291, + "learning_rate": 4.712871287128713e-06, + "loss": 1.0167, + "num_tokens": 501592130.0, + "step": 120 + }, + { + "epoch": 0.01437908496732026, + "grad_norm": 0.5238938351175167, + "learning_rate": 4.7524752475247525e-06, + "loss": 1.0019, + "num_tokens": 505742988.0, + "step": 121 + }, + { + "epoch": 0.01449792038027332, + "grad_norm": 0.5658309477488866, + "learning_rate": 4.7920792079207925e-06, + "loss": 0.9924, + "num_tokens": 509932039.0, + "step": 122 + }, + { + "epoch": 0.01461675579322638, + "grad_norm": 0.4996155069569706, + "learning_rate": 4.831683168316832e-06, + "loss": 1.0006, + "num_tokens": 514092903.0, + "step": 123 + }, + { + "epoch": 0.01473559120617944, + "grad_norm": 0.5194325572344087, + "learning_rate": 4.8712871287128716e-06, + "loss": 1.019, + "num_tokens": 518282989.0, + "step": 124 + }, + { + "epoch": 0.014854426619132501, + "grad_norm": 0.4692171493571607, + "learning_rate": 4.9108910891089115e-06, + "loss": 1.0168, + "num_tokens": 522471224.0, + "step": 125 + }, + { + "epoch": 0.014973262032085561, + "grad_norm": 0.5908543401392924, + "learning_rate": 4.950495049504951e-06, + "loss": 1.0185, + "num_tokens": 526660198.0, + "step": 126 + }, + { + "epoch": 0.015092097445038621, + "grad_norm": 0.49487677525888235, + "learning_rate": 4.990099009900991e-06, + "loss": 1.0105, + "num_tokens": 530849284.0, + "step": 127 + }, + { + "epoch": 0.015210932857991681, + "grad_norm": 0.528924423961576, + "learning_rate": 5.02970297029703e-06, + "loss": 1.0048, + "num_tokens": 535038800.0, + "step": 128 + }, + { + "epoch": 0.015329768270944741, + "grad_norm": 0.47995187288223745, + "learning_rate": 5.06930693069307e-06, + "loss": 0.9943, + "num_tokens": 539226511.0, + "step": 129 + }, + { + "epoch": 0.015448603683897801, + "grad_norm": 0.5932436393606627, + "learning_rate": 5.10891089108911e-06, + "loss": 0.9979, + "num_tokens": 543415744.0, + "step": 130 + }, + { + "epoch": 0.015567439096850861, + "grad_norm": 0.5165808373172965, + "learning_rate": 5.148514851485149e-06, + "loss": 1.0311, + "num_tokens": 547565879.0, + "step": 131 + }, + { + "epoch": 0.01568627450980392, + "grad_norm": 0.46914675350693846, + "learning_rate": 5.188118811881189e-06, + "loss": 0.9954, + "num_tokens": 551734258.0, + "step": 132 + }, + { + "epoch": 0.015805109922756983, + "grad_norm": 0.48062654374740027, + "learning_rate": 5.227722772277229e-06, + "loss": 0.9843, + "num_tokens": 555926080.0, + "step": 133 + }, + { + "epoch": 0.01592394533571004, + "grad_norm": 0.44547284945934174, + "learning_rate": 5.267326732673268e-06, + "loss": 1.0336, + "num_tokens": 560116183.0, + "step": 134 + }, + { + "epoch": 0.016042780748663103, + "grad_norm": 0.5387045462528127, + "learning_rate": 5.306930693069308e-06, + "loss": 1.0472, + "num_tokens": 564306064.0, + "step": 135 + }, + { + "epoch": 0.01616161616161616, + "grad_norm": 0.4149721892434823, + "learning_rate": 5.346534653465347e-06, + "loss": 0.9916, + "num_tokens": 568494281.0, + "step": 136 + }, + { + "epoch": 0.01628045157456922, + "grad_norm": 0.6420544857624245, + "learning_rate": 5.386138613861387e-06, + "loss": 1.006, + "num_tokens": 572659591.0, + "step": 137 + }, + { + "epoch": 0.01639928698752228, + "grad_norm": 0.4715546497084676, + "learning_rate": 5.425742574257427e-06, + "loss": 1.0053, + "num_tokens": 576826780.0, + "step": 138 + }, + { + "epoch": 0.01651812240047534, + "grad_norm": 0.6558874957166373, + "learning_rate": 5.465346534653466e-06, + "loss": 1.0537, + "num_tokens": 580997148.0, + "step": 139 + }, + { + "epoch": 0.016636957813428402, + "grad_norm": 0.6062478239527904, + "learning_rate": 5.504950495049506e-06, + "loss": 1.0167, + "num_tokens": 585178790.0, + "step": 140 + }, + { + "epoch": 0.01675579322638146, + "grad_norm": 0.540476452050986, + "learning_rate": 5.544554455445545e-06, + "loss": 0.9777, + "num_tokens": 589342905.0, + "step": 141 + }, + { + "epoch": 0.016874628639334522, + "grad_norm": 0.5409847709991051, + "learning_rate": 5.584158415841585e-06, + "loss": 1.03, + "num_tokens": 593508857.0, + "step": 142 + }, + { + "epoch": 0.01699346405228758, + "grad_norm": 0.5817150812075775, + "learning_rate": 5.623762376237625e-06, + "loss": 0.9875, + "num_tokens": 597699052.0, + "step": 143 + }, + { + "epoch": 0.017112299465240642, + "grad_norm": 0.522382004077733, + "learning_rate": 5.663366336633663e-06, + "loss": 1.0374, + "num_tokens": 601840868.0, + "step": 144 + }, + { + "epoch": 0.0172311348781937, + "grad_norm": 0.4920491374889655, + "learning_rate": 5.702970297029703e-06, + "loss": 1.008, + "num_tokens": 606011032.0, + "step": 145 + }, + { + "epoch": 0.017349970291146762, + "grad_norm": 0.4597652622932631, + "learning_rate": 5.7425742574257425e-06, + "loss": 0.9954, + "num_tokens": 610170762.0, + "step": 146 + }, + { + "epoch": 0.01746880570409982, + "grad_norm": 0.4823264150221867, + "learning_rate": 5.7821782178217824e-06, + "loss": 1.0382, + "num_tokens": 614328405.0, + "step": 147 + }, + { + "epoch": 0.017587641117052882, + "grad_norm": 0.4535763526065789, + "learning_rate": 5.8217821782178216e-06, + "loss": 1.0176, + "num_tokens": 618518098.0, + "step": 148 + }, + { + "epoch": 0.01770647653000594, + "grad_norm": 0.5034775564082216, + "learning_rate": 5.8613861386138615e-06, + "loss": 1.0249, + "num_tokens": 622707550.0, + "step": 149 + }, + { + "epoch": 0.017825311942959002, + "grad_norm": 0.44518215519464127, + "learning_rate": 5.9009900990099015e-06, + "loss": 1.011, + "num_tokens": 626896780.0, + "step": 150 + }, + { + "epoch": 0.01794414735591206, + "grad_norm": 0.4994935012207737, + "learning_rate": 5.940594059405941e-06, + "loss": 1.0201, + "num_tokens": 631068736.0, + "step": 151 + }, + { + "epoch": 0.018062982768865123, + "grad_norm": 0.46656315348529076, + "learning_rate": 5.980198019801981e-06, + "loss": 1.0491, + "num_tokens": 635255692.0, + "step": 152 + }, + { + "epoch": 0.01818181818181818, + "grad_norm": 0.5053856684621402, + "learning_rate": 6.01980198019802e-06, + "loss": 1.0596, + "num_tokens": 639420440.0, + "step": 153 + }, + { + "epoch": 0.018300653594771243, + "grad_norm": 0.46534912296542624, + "learning_rate": 6.05940594059406e-06, + "loss": 1.0273, + "num_tokens": 643609947.0, + "step": 154 + }, + { + "epoch": 0.0184194890077243, + "grad_norm": 0.3875613432101906, + "learning_rate": 6.0990099009901e-06, + "loss": 0.9843, + "num_tokens": 647768382.0, + "step": 155 + }, + { + "epoch": 0.018538324420677363, + "grad_norm": 0.5802352733606384, + "learning_rate": 6.138613861386139e-06, + "loss": 1.0456, + "num_tokens": 651956873.0, + "step": 156 + }, + { + "epoch": 0.01865715983363042, + "grad_norm": 0.4981006905428535, + "learning_rate": 6.178217821782179e-06, + "loss": 1.0018, + "num_tokens": 656138533.0, + "step": 157 + }, + { + "epoch": 0.018775995246583483, + "grad_norm": 0.5442248247547994, + "learning_rate": 6.217821782178219e-06, + "loss": 0.999, + "num_tokens": 660327155.0, + "step": 158 + }, + { + "epoch": 0.01889483065953654, + "grad_norm": 0.436558842456156, + "learning_rate": 6.257425742574258e-06, + "loss": 1.0181, + "num_tokens": 664512641.0, + "step": 159 + }, + { + "epoch": 0.019013666072489603, + "grad_norm": 0.6006789401743763, + "learning_rate": 6.297029702970298e-06, + "loss": 1.016, + "num_tokens": 668668402.0, + "step": 160 + }, + { + "epoch": 0.01913250148544266, + "grad_norm": 0.46333303144775795, + "learning_rate": 6.336633663366337e-06, + "loss": 1.0071, + "num_tokens": 672845905.0, + "step": 161 + }, + { + "epoch": 0.019251336898395723, + "grad_norm": 0.6027986279024985, + "learning_rate": 6.376237623762377e-06, + "loss": 1.0371, + "num_tokens": 677033564.0, + "step": 162 + }, + { + "epoch": 0.01937017231134878, + "grad_norm": 0.4694518423289992, + "learning_rate": 6.415841584158417e-06, + "loss": 1.018, + "num_tokens": 681222461.0, + "step": 163 + }, + { + "epoch": 0.019489007724301843, + "grad_norm": 0.4959403104315456, + "learning_rate": 6.455445544554456e-06, + "loss": 1.0082, + "num_tokens": 685412779.0, + "step": 164 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 0.5249374658255329, + "learning_rate": 6.495049504950496e-06, + "loss": 1.0273, + "num_tokens": 689590318.0, + "step": 165 + }, + { + "epoch": 0.019726678550207963, + "grad_norm": 0.4272761548403561, + "learning_rate": 6.534653465346535e-06, + "loss": 0.9898, + "num_tokens": 693781216.0, + "step": 166 + }, + { + "epoch": 0.019845513963161022, + "grad_norm": 0.5439200838859192, + "learning_rate": 6.574257425742575e-06, + "loss": 1.0103, + "num_tokens": 697967891.0, + "step": 167 + }, + { + "epoch": 0.019964349376114084, + "grad_norm": 0.4208543083170359, + "learning_rate": 6.613861386138615e-06, + "loss": 1.0249, + "num_tokens": 702156753.0, + "step": 168 + }, + { + "epoch": 0.020083184789067142, + "grad_norm": 0.5412470302947126, + "learning_rate": 6.653465346534654e-06, + "loss": 0.9769, + "num_tokens": 706346769.0, + "step": 169 + }, + { + "epoch": 0.020202020202020204, + "grad_norm": 0.49175023907237625, + "learning_rate": 6.693069306930694e-06, + "loss": 1.0281, + "num_tokens": 710519060.0, + "step": 170 + }, + { + "epoch": 0.020320855614973262, + "grad_norm": 0.6978330365006626, + "learning_rate": 6.732673267326733e-06, + "loss": 1.0003, + "num_tokens": 714709562.0, + "step": 171 + }, + { + "epoch": 0.02043969102792632, + "grad_norm": 0.5580269023136903, + "learning_rate": 6.772277227722773e-06, + "loss": 1.0076, + "num_tokens": 718875090.0, + "step": 172 + }, + { + "epoch": 0.020558526440879382, + "grad_norm": 0.5174332770221874, + "learning_rate": 6.811881188118813e-06, + "loss": 1.0428, + "num_tokens": 723053279.0, + "step": 173 + }, + { + "epoch": 0.02067736185383244, + "grad_norm": 0.611888692726984, + "learning_rate": 6.851485148514852e-06, + "loss": 1.0732, + "num_tokens": 727242042.0, + "step": 174 + }, + { + "epoch": 0.020796197266785502, + "grad_norm": 0.6049316740924217, + "learning_rate": 6.8910891089108915e-06, + "loss": 1.0564, + "num_tokens": 731431015.0, + "step": 175 + }, + { + "epoch": 0.02091503267973856, + "grad_norm": 0.6248934090973675, + "learning_rate": 6.930693069306931e-06, + "loss": 1.0047, + "num_tokens": 735590010.0, + "step": 176 + }, + { + "epoch": 0.021033868092691622, + "grad_norm": 0.49947990982442925, + "learning_rate": 6.9702970297029706e-06, + "loss": 0.9883, + "num_tokens": 739778789.0, + "step": 177 + }, + { + "epoch": 0.02115270350564468, + "grad_norm": 0.48364863591670265, + "learning_rate": 7.00990099009901e-06, + "loss": 1.028, + "num_tokens": 743967689.0, + "step": 178 + }, + { + "epoch": 0.021271538918597743, + "grad_norm": 0.7262505543090816, + "learning_rate": 7.04950495049505e-06, + "loss": 0.9999, + "num_tokens": 748140159.0, + "step": 179 + }, + { + "epoch": 0.0213903743315508, + "grad_norm": 0.4510215541330937, + "learning_rate": 7.08910891089109e-06, + "loss": 1.0429, + "num_tokens": 752329559.0, + "step": 180 + }, + { + "epoch": 0.021509209744503863, + "grad_norm": 0.5785518935375089, + "learning_rate": 7.128712871287129e-06, + "loss": 1.0434, + "num_tokens": 756517558.0, + "step": 181 + }, + { + "epoch": 0.02162804515745692, + "grad_norm": 0.6629154818431642, + "learning_rate": 7.168316831683169e-06, + "loss": 1.0423, + "num_tokens": 760682401.0, + "step": 182 + }, + { + "epoch": 0.021746880570409983, + "grad_norm": 0.48102155937043856, + "learning_rate": 7.207920792079208e-06, + "loss": 1.0516, + "num_tokens": 764869908.0, + "step": 183 + }, + { + "epoch": 0.02186571598336304, + "grad_norm": 0.588086330906233, + "learning_rate": 7.247524752475248e-06, + "loss": 1.0116, + "num_tokens": 769059107.0, + "step": 184 + }, + { + "epoch": 0.021984551396316103, + "grad_norm": 0.6023936321964533, + "learning_rate": 7.287128712871288e-06, + "loss": 1.0469, + "num_tokens": 773247642.0, + "step": 185 + }, + { + "epoch": 0.02210338680926916, + "grad_norm": 0.45894214768337976, + "learning_rate": 7.326732673267327e-06, + "loss": 1.0422, + "num_tokens": 777436569.0, + "step": 186 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 0.47475890567017354, + "learning_rate": 7.366336633663367e-06, + "loss": 1.0285, + "num_tokens": 781626411.0, + "step": 187 + }, + { + "epoch": 0.02234105763517528, + "grad_norm": 0.6782604342651615, + "learning_rate": 7.405940594059407e-06, + "loss": 1.028, + "num_tokens": 785802703.0, + "step": 188 + }, + { + "epoch": 0.022459893048128343, + "grad_norm": 0.7343237285660992, + "learning_rate": 7.445544554455446e-06, + "loss": 1.0191, + "num_tokens": 789993471.0, + "step": 189 + }, + { + "epoch": 0.0225787284610814, + "grad_norm": 0.35652971339822603, + "learning_rate": 7.485148514851486e-06, + "loss": 1.0388, + "num_tokens": 794150775.0, + "step": 190 + }, + { + "epoch": 0.022697563874034463, + "grad_norm": 0.43332834151629507, + "learning_rate": 7.524752475247525e-06, + "loss": 1.0224, + "num_tokens": 798339301.0, + "step": 191 + }, + { + "epoch": 0.02281639928698752, + "grad_norm": 0.9258647671036654, + "learning_rate": 7.564356435643565e-06, + "loss": 0.9848, + "num_tokens": 802528556.0, + "step": 192 + }, + { + "epoch": 0.022935234699940583, + "grad_norm": 0.47073660346411383, + "learning_rate": 7.603960396039605e-06, + "loss": 1.017, + "num_tokens": 806710578.0, + "step": 193 + }, + { + "epoch": 0.023054070112893642, + "grad_norm": 0.5808781719053012, + "learning_rate": 7.643564356435644e-06, + "loss": 1.0116, + "num_tokens": 810900295.0, + "step": 194 + }, + { + "epoch": 0.023172905525846704, + "grad_norm": 0.585060542878499, + "learning_rate": 7.683168316831683e-06, + "loss": 1.0334, + "num_tokens": 815084224.0, + "step": 195 + }, + { + "epoch": 0.023291740938799762, + "grad_norm": 0.6734805049393392, + "learning_rate": 7.722772277227724e-06, + "loss": 0.9827, + "num_tokens": 819255928.0, + "step": 196 + }, + { + "epoch": 0.023410576351752824, + "grad_norm": 0.49224844310839544, + "learning_rate": 7.762376237623763e-06, + "loss": 1.0158, + "num_tokens": 823445497.0, + "step": 197 + }, + { + "epoch": 0.023529411764705882, + "grad_norm": 0.7494525060605995, + "learning_rate": 7.801980198019802e-06, + "loss": 1.0064, + "num_tokens": 827634226.0, + "step": 198 + }, + { + "epoch": 0.023648247177658944, + "grad_norm": 0.6640495790785501, + "learning_rate": 7.841584158415843e-06, + "loss": 1.0124, + "num_tokens": 831823626.0, + "step": 199 + }, + { + "epoch": 0.023767082590612002, + "grad_norm": 0.5667718298463225, + "learning_rate": 7.881188118811882e-06, + "loss": 1.0248, + "num_tokens": 836010068.0, + "step": 200 + }, + { + "epoch": 0.023885918003565064, + "grad_norm": 0.7854684394010504, + "learning_rate": 7.920792079207921e-06, + "loss": 1.0558, + "num_tokens": 840199801.0, + "step": 201 + }, + { + "epoch": 0.024004753416518122, + "grad_norm": 0.5909423065177295, + "learning_rate": 7.960396039603962e-06, + "loss": 0.9996, + "num_tokens": 844375497.0, + "step": 202 + }, + { + "epoch": 0.024123588829471184, + "grad_norm": 0.660740432472102, + "learning_rate": 8.000000000000001e-06, + "loss": 1.0063, + "num_tokens": 848565196.0, + "step": 203 + }, + { + "epoch": 0.024242424242424242, + "grad_norm": 0.6864967148567281, + "learning_rate": 8.03960396039604e-06, + "loss": 0.995, + "num_tokens": 852754119.0, + "step": 204 + }, + { + "epoch": 0.0243612596553773, + "grad_norm": 0.8409546788464619, + "learning_rate": 8.07920792079208e-06, + "loss": 1.0121, + "num_tokens": 856936841.0, + "step": 205 + }, + { + "epoch": 0.024480095068330363, + "grad_norm": 0.4459459148367304, + "learning_rate": 8.11881188118812e-06, + "loss": 0.9791, + "num_tokens": 861095420.0, + "step": 206 + }, + { + "epoch": 0.02459893048128342, + "grad_norm": 0.9720314947064326, + "learning_rate": 8.158415841584158e-06, + "loss": 1.0137, + "num_tokens": 865257907.0, + "step": 207 + }, + { + "epoch": 0.024717765894236483, + "grad_norm": 0.6040973445255612, + "learning_rate": 8.198019801980199e-06, + "loss": 1.0019, + "num_tokens": 869442773.0, + "step": 208 + }, + { + "epoch": 0.02483660130718954, + "grad_norm": 0.8127328141953055, + "learning_rate": 8.237623762376238e-06, + "loss": 1.0184, + "num_tokens": 873632912.0, + "step": 209 + }, + { + "epoch": 0.024955436720142603, + "grad_norm": 0.6992789928624441, + "learning_rate": 8.277227722772277e-06, + "loss": 1.0248, + "num_tokens": 877804319.0, + "step": 210 + }, + { + "epoch": 0.02507427213309566, + "grad_norm": 0.6086831552186913, + "learning_rate": 8.316831683168318e-06, + "loss": 1.0215, + "num_tokens": 881991533.0, + "step": 211 + }, + { + "epoch": 0.025193107546048723, + "grad_norm": 0.6611177164856036, + "learning_rate": 8.356435643564357e-06, + "loss": 0.9924, + "num_tokens": 886180583.0, + "step": 212 + }, + { + "epoch": 0.02531194295900178, + "grad_norm": 0.6603356894798172, + "learning_rate": 8.396039603960396e-06, + "loss": 1.0537, + "num_tokens": 890369768.0, + "step": 213 + }, + { + "epoch": 0.025430778371954843, + "grad_norm": 0.6791499694744586, + "learning_rate": 8.435643564356437e-06, + "loss": 1.0396, + "num_tokens": 894551454.0, + "step": 214 + }, + { + "epoch": 0.0255496137849079, + "grad_norm": 0.7787800860453032, + "learning_rate": 8.475247524752476e-06, + "loss": 1.0003, + "num_tokens": 898741745.0, + "step": 215 + }, + { + "epoch": 0.025668449197860963, + "grad_norm": 0.6678540902349615, + "learning_rate": 8.514851485148515e-06, + "loss": 1.0315, + "num_tokens": 902873495.0, + "step": 216 + }, + { + "epoch": 0.02578728461081402, + "grad_norm": 0.6307737432431259, + "learning_rate": 8.554455445544554e-06, + "loss": 1.0534, + "num_tokens": 907033684.0, + "step": 217 + }, + { + "epoch": 0.025906120023767083, + "grad_norm": 0.7833016431060976, + "learning_rate": 8.594059405940595e-06, + "loss": 1.02, + "num_tokens": 911223623.0, + "step": 218 + }, + { + "epoch": 0.02602495543672014, + "grad_norm": 0.6096829966057153, + "learning_rate": 8.633663366336634e-06, + "loss": 1.0368, + "num_tokens": 915400001.0, + "step": 219 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 0.569031177270888, + "learning_rate": 8.673267326732673e-06, + "loss": 0.9792, + "num_tokens": 919588247.0, + "step": 220 + }, + { + "epoch": 0.026262626262626262, + "grad_norm": 0.6339379125320957, + "learning_rate": 8.712871287128714e-06, + "loss": 1.0128, + "num_tokens": 923746727.0, + "step": 221 + }, + { + "epoch": 0.026381461675579324, + "grad_norm": 0.8433650759264993, + "learning_rate": 8.752475247524753e-06, + "loss": 1.0087, + "num_tokens": 927933306.0, + "step": 222 + }, + { + "epoch": 0.026500297088532382, + "grad_norm": 0.6962789048903255, + "learning_rate": 8.792079207920792e-06, + "loss": 0.9859, + "num_tokens": 932122650.0, + "step": 223 + }, + { + "epoch": 0.026619132501485444, + "grad_norm": 0.8634741376775652, + "learning_rate": 8.831683168316833e-06, + "loss": 1.0254, + "num_tokens": 936286142.0, + "step": 224 + }, + { + "epoch": 0.026737967914438502, + "grad_norm": 0.5953098977840278, + "learning_rate": 8.871287128712872e-06, + "loss": 0.9987, + "num_tokens": 940474646.0, + "step": 225 + }, + { + "epoch": 0.026856803327391564, + "grad_norm": 0.6441800702726047, + "learning_rate": 8.910891089108911e-06, + "loss": 0.9808, + "num_tokens": 944663022.0, + "step": 226 + }, + { + "epoch": 0.026975638740344622, + "grad_norm": 0.5425881361508264, + "learning_rate": 8.950495049504952e-06, + "loss": 1.0047, + "num_tokens": 948853794.0, + "step": 227 + }, + { + "epoch": 0.027094474153297684, + "grad_norm": 0.8607136093399018, + "learning_rate": 8.990099009900991e-06, + "loss": 1.0086, + "num_tokens": 953027120.0, + "step": 228 + }, + { + "epoch": 0.027213309566250742, + "grad_norm": 0.6168347080369987, + "learning_rate": 9.02970297029703e-06, + "loss": 0.9831, + "num_tokens": 957216181.0, + "step": 229 + }, + { + "epoch": 0.027332144979203804, + "grad_norm": 0.8758851066003743, + "learning_rate": 9.06930693069307e-06, + "loss": 0.996, + "num_tokens": 961403610.0, + "step": 230 + }, + { + "epoch": 0.027450980392156862, + "grad_norm": 0.727725645627956, + "learning_rate": 9.10891089108911e-06, + "loss": 1.026, + "num_tokens": 965593416.0, + "step": 231 + }, + { + "epoch": 0.027569815805109924, + "grad_norm": 0.5570636085299149, + "learning_rate": 9.14851485148515e-06, + "loss": 0.9888, + "num_tokens": 969752811.0, + "step": 232 + }, + { + "epoch": 0.027688651218062982, + "grad_norm": 0.9393573623284593, + "learning_rate": 9.188118811881189e-06, + "loss": 1.0464, + "num_tokens": 973942206.0, + "step": 233 + }, + { + "epoch": 0.027807486631016044, + "grad_norm": 0.6585303956694029, + "learning_rate": 9.22772277227723e-06, + "loss": 1.0192, + "num_tokens": 978078292.0, + "step": 234 + }, + { + "epoch": 0.027926322043969103, + "grad_norm": 0.6787018389377634, + "learning_rate": 9.267326732673269e-06, + "loss": 1.0331, + "num_tokens": 982235554.0, + "step": 235 + }, + { + "epoch": 0.028045157456922164, + "grad_norm": 0.7798327918407351, + "learning_rate": 9.306930693069308e-06, + "loss": 1.0312, + "num_tokens": 986425599.0, + "step": 236 + }, + { + "epoch": 0.028163992869875223, + "grad_norm": 0.647607618345348, + "learning_rate": 9.346534653465348e-06, + "loss": 1.0064, + "num_tokens": 990594553.0, + "step": 237 + }, + { + "epoch": 0.028282828282828285, + "grad_norm": 0.7799392175539815, + "learning_rate": 9.386138613861386e-06, + "loss": 1.0376, + "num_tokens": 994762354.0, + "step": 238 + }, + { + "epoch": 0.028401663695781343, + "grad_norm": 0.7951568640467539, + "learning_rate": 9.425742574257427e-06, + "loss": 1.0244, + "num_tokens": 998952434.0, + "step": 239 + }, + { + "epoch": 0.0285204991087344, + "grad_norm": 0.9123943251709518, + "learning_rate": 9.465346534653466e-06, + "loss": 1.0223, + "num_tokens": 1003126839.0, + "step": 240 + }, + { + "epoch": 0.028639334521687463, + "grad_norm": 0.5704551276604305, + "learning_rate": 9.504950495049505e-06, + "loss": 1.0162, + "num_tokens": 1007314520.0, + "step": 241 + }, + { + "epoch": 0.02875816993464052, + "grad_norm": 0.7093998305705889, + "learning_rate": 9.544554455445544e-06, + "loss": 0.9959, + "num_tokens": 1011490811.0, + "step": 242 + }, + { + "epoch": 0.028877005347593583, + "grad_norm": 0.9554972244674053, + "learning_rate": 9.584158415841585e-06, + "loss": 0.9878, + "num_tokens": 1015649909.0, + "step": 243 + }, + { + "epoch": 0.02899584076054664, + "grad_norm": 0.6773342795753089, + "learning_rate": 9.623762376237624e-06, + "loss": 1.0258, + "num_tokens": 1019837224.0, + "step": 244 + }, + { + "epoch": 0.029114676173499703, + "grad_norm": 0.595394408954933, + "learning_rate": 9.663366336633663e-06, + "loss": 0.9721, + "num_tokens": 1024027335.0, + "step": 245 + }, + { + "epoch": 0.02923351158645276, + "grad_norm": 0.720180611391478, + "learning_rate": 9.702970297029704e-06, + "loss": 1.0043, + "num_tokens": 1028216582.0, + "step": 246 + }, + { + "epoch": 0.029352346999405823, + "grad_norm": 0.9421265476212224, + "learning_rate": 9.742574257425743e-06, + "loss": 0.9954, + "num_tokens": 1032405272.0, + "step": 247 + }, + { + "epoch": 0.02947118241235888, + "grad_norm": 0.703767331825798, + "learning_rate": 9.782178217821782e-06, + "loss": 1.0525, + "num_tokens": 1036594492.0, + "step": 248 + }, + { + "epoch": 0.029590017825311943, + "grad_norm": 0.5461191292469952, + "learning_rate": 9.821782178217823e-06, + "loss": 1.0093, + "num_tokens": 1040730823.0, + "step": 249 + }, + { + "epoch": 0.029708853238265002, + "grad_norm": 1.0967390547003595, + "learning_rate": 9.861386138613862e-06, + "loss": 1.0234, + "num_tokens": 1044902785.0, + "step": 250 + }, + { + "epoch": 0.029827688651218064, + "grad_norm": 0.6336666936271298, + "learning_rate": 9.900990099009901e-06, + "loss": 0.9891, + "num_tokens": 1049086295.0, + "step": 251 + }, + { + "epoch": 0.029946524064171122, + "grad_norm": 1.0602298578812226, + "learning_rate": 9.940594059405942e-06, + "loss": 1.0043, + "num_tokens": 1053269558.0, + "step": 252 + }, + { + "epoch": 0.030065359477124184, + "grad_norm": 0.5831881179451864, + "learning_rate": 9.980198019801981e-06, + "loss": 1.0008, + "num_tokens": 1057425936.0, + "step": 253 + }, + { + "epoch": 0.030184194890077242, + "grad_norm": 1.070610401613548, + "learning_rate": 1.001980198019802e-05, + "loss": 1.0136, + "num_tokens": 1061610617.0, + "step": 254 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 0.7203823171427136, + "learning_rate": 1.005940594059406e-05, + "loss": 1.0307, + "num_tokens": 1065797537.0, + "step": 255 + }, + { + "epoch": 0.030421865715983362, + "grad_norm": 0.7674475077953384, + "learning_rate": 1.00990099009901e-05, + "loss": 1.0153, + "num_tokens": 1069955464.0, + "step": 256 + }, + { + "epoch": 0.030540701128936424, + "grad_norm": 0.9419656176631803, + "learning_rate": 1.013861386138614e-05, + "loss": 1.0059, + "num_tokens": 1074145039.0, + "step": 257 + }, + { + "epoch": 0.030659536541889482, + "grad_norm": 0.6479033789211038, + "learning_rate": 1.0178217821782179e-05, + "loss": 1.0132, + "num_tokens": 1078334293.0, + "step": 258 + }, + { + "epoch": 0.030778371954842544, + "grad_norm": 0.9364743889605371, + "learning_rate": 1.021782178217822e-05, + "loss": 1.0208, + "num_tokens": 1082524209.0, + "step": 259 + }, + { + "epoch": 0.030897207367795602, + "grad_norm": 0.7376190879646657, + "learning_rate": 1.0257425742574259e-05, + "loss": 1.0096, + "num_tokens": 1086684995.0, + "step": 260 + }, + { + "epoch": 0.031016042780748664, + "grad_norm": 0.8519845075927946, + "learning_rate": 1.0297029702970298e-05, + "loss": 0.9865, + "num_tokens": 1090843379.0, + "step": 261 + }, + { + "epoch": 0.031134878193701723, + "grad_norm": 0.8282110744894987, + "learning_rate": 1.0336633663366338e-05, + "loss": 1.0067, + "num_tokens": 1095033231.0, + "step": 262 + }, + { + "epoch": 0.031253713606654784, + "grad_norm": 0.8393415175729813, + "learning_rate": 1.0376237623762378e-05, + "loss": 1.0092, + "num_tokens": 1099222678.0, + "step": 263 + }, + { + "epoch": 0.03137254901960784, + "grad_norm": 0.725695840467483, + "learning_rate": 1.0415841584158417e-05, + "loss": 0.9968, + "num_tokens": 1103413530.0, + "step": 264 + }, + { + "epoch": 0.0314913844325609, + "grad_norm": 0.6553377120093324, + "learning_rate": 1.0455445544554458e-05, + "loss": 1.0264, + "num_tokens": 1107582953.0, + "step": 265 + }, + { + "epoch": 0.031610219845513966, + "grad_norm": 0.8324441874046035, + "learning_rate": 1.0495049504950497e-05, + "loss": 0.996, + "num_tokens": 1111761495.0, + "step": 266 + }, + { + "epoch": 0.031729055258467025, + "grad_norm": 0.9153443523392923, + "learning_rate": 1.0534653465346536e-05, + "loss": 1.0209, + "num_tokens": 1115949650.0, + "step": 267 + }, + { + "epoch": 0.03184789067142008, + "grad_norm": 0.8555680808475898, + "learning_rate": 1.0574257425742575e-05, + "loss": 0.9995, + "num_tokens": 1120137606.0, + "step": 268 + }, + { + "epoch": 0.03196672608437314, + "grad_norm": 0.7375392214953541, + "learning_rate": 1.0613861386138616e-05, + "loss": 0.9997, + "num_tokens": 1124314620.0, + "step": 269 + }, + { + "epoch": 0.03208556149732621, + "grad_norm": 0.9581400630517881, + "learning_rate": 1.0653465346534655e-05, + "loss": 1.0389, + "num_tokens": 1128490252.0, + "step": 270 + }, + { + "epoch": 0.032204396910279265, + "grad_norm": 0.8422926638534483, + "learning_rate": 1.0693069306930694e-05, + "loss": 0.9889, + "num_tokens": 1132679699.0, + "step": 271 + }, + { + "epoch": 0.03232323232323232, + "grad_norm": 0.6925882554646645, + "learning_rate": 1.0732673267326735e-05, + "loss": 1.0638, + "num_tokens": 1136842662.0, + "step": 272 + }, + { + "epoch": 0.03244206773618538, + "grad_norm": 0.7754630941363464, + "learning_rate": 1.0772277227722774e-05, + "loss": 0.987, + "num_tokens": 1141031919.0, + "step": 273 + }, + { + "epoch": 0.03256090314913844, + "grad_norm": 0.8754029223124477, + "learning_rate": 1.0811881188118813e-05, + "loss": 0.9927, + "num_tokens": 1145208741.0, + "step": 274 + }, + { + "epoch": 0.032679738562091505, + "grad_norm": 0.8521362790760199, + "learning_rate": 1.0851485148514854e-05, + "loss": 1.0072, + "num_tokens": 1149398304.0, + "step": 275 + }, + { + "epoch": 0.03279857397504456, + "grad_norm": 0.7850000381403789, + "learning_rate": 1.0891089108910893e-05, + "loss": 0.9972, + "num_tokens": 1153588288.0, + "step": 276 + }, + { + "epoch": 0.03291740938799762, + "grad_norm": 0.7549022952279772, + "learning_rate": 1.0930693069306932e-05, + "loss": 0.9935, + "num_tokens": 1157770250.0, + "step": 277 + }, + { + "epoch": 0.03303624480095068, + "grad_norm": 1.0845035015546336, + "learning_rate": 1.0970297029702971e-05, + "loss": 0.978, + "num_tokens": 1161944168.0, + "step": 278 + }, + { + "epoch": 0.033155080213903745, + "grad_norm": 0.6415163647450259, + "learning_rate": 1.1009900990099012e-05, + "loss": 1.0276, + "num_tokens": 1166133503.0, + "step": 279 + }, + { + "epoch": 0.033273915626856804, + "grad_norm": 1.058663221980277, + "learning_rate": 1.1049504950495051e-05, + "loss": 1.0183, + "num_tokens": 1170323157.0, + "step": 280 + }, + { + "epoch": 0.03339275103980986, + "grad_norm": 0.5648136060458717, + "learning_rate": 1.108910891089109e-05, + "loss": 0.95, + "num_tokens": 1174513193.0, + "step": 281 + }, + { + "epoch": 0.03351158645276292, + "grad_norm": 1.0539614798013301, + "learning_rate": 1.1128712871287131e-05, + "loss": 1.0224, + "num_tokens": 1178676037.0, + "step": 282 + }, + { + "epoch": 0.033630421865715986, + "grad_norm": 0.6858822798445854, + "learning_rate": 1.116831683168317e-05, + "loss": 0.9642, + "num_tokens": 1182864114.0, + "step": 283 + }, + { + "epoch": 0.033749257278669044, + "grad_norm": 1.0618626929094162, + "learning_rate": 1.120792079207921e-05, + "loss": 1.0113, + "num_tokens": 1187053597.0, + "step": 284 + }, + { + "epoch": 0.0338680926916221, + "grad_norm": 0.7237828121561906, + "learning_rate": 1.124752475247525e-05, + "loss": 1.0228, + "num_tokens": 1191212043.0, + "step": 285 + }, + { + "epoch": 0.03398692810457516, + "grad_norm": 0.7821103185199086, + "learning_rate": 1.1287128712871288e-05, + "loss": 1.0143, + "num_tokens": 1195402388.0, + "step": 286 + }, + { + "epoch": 0.034105763517528226, + "grad_norm": 0.6509027000323482, + "learning_rate": 1.1326732673267327e-05, + "loss": 1.0035, + "num_tokens": 1199591681.0, + "step": 287 + }, + { + "epoch": 0.034224598930481284, + "grad_norm": 0.7749604113586289, + "learning_rate": 1.1366336633663366e-05, + "loss": 1.022, + "num_tokens": 1203767799.0, + "step": 288 + }, + { + "epoch": 0.03434343434343434, + "grad_norm": 0.961195923319006, + "learning_rate": 1.1405940594059407e-05, + "loss": 0.9914, + "num_tokens": 1207957553.0, + "step": 289 + }, + { + "epoch": 0.0344622697563874, + "grad_norm": 0.8455634561642372, + "learning_rate": 1.1445544554455446e-05, + "loss": 1.0308, + "num_tokens": 1212145573.0, + "step": 290 + }, + { + "epoch": 0.034581105169340466, + "grad_norm": 0.822720735644106, + "learning_rate": 1.1485148514851485e-05, + "loss": 1.0397, + "num_tokens": 1216317849.0, + "step": 291 + }, + { + "epoch": 0.034699940582293524, + "grad_norm": 0.8844501537362013, + "learning_rate": 1.1524752475247524e-05, + "loss": 1.035, + "num_tokens": 1220506215.0, + "step": 292 + }, + { + "epoch": 0.03481877599524658, + "grad_norm": 0.8942003780166746, + "learning_rate": 1.1564356435643565e-05, + "loss": 1.0096, + "num_tokens": 1224694908.0, + "step": 293 + }, + { + "epoch": 0.03493761140819964, + "grad_norm": 0.9843642440898817, + "learning_rate": 1.1603960396039604e-05, + "loss": 0.9953, + "num_tokens": 1228862316.0, + "step": 294 + }, + { + "epoch": 0.035056446821152706, + "grad_norm": 0.8035696973561515, + "learning_rate": 1.1643564356435643e-05, + "loss": 1.0318, + "num_tokens": 1233042748.0, + "step": 295 + }, + { + "epoch": 0.035175282234105765, + "grad_norm": 0.6567289282514936, + "learning_rate": 1.1683168316831684e-05, + "loss": 1.0005, + "num_tokens": 1237229431.0, + "step": 296 + }, + { + "epoch": 0.03529411764705882, + "grad_norm": 1.089071027363801, + "learning_rate": 1.1722772277227723e-05, + "loss": 1.0253, + "num_tokens": 1241419025.0, + "step": 297 + }, + { + "epoch": 0.03541295306001188, + "grad_norm": 0.7190853796750405, + "learning_rate": 1.1762376237623762e-05, + "loss": 1.0131, + "num_tokens": 1245608848.0, + "step": 298 + }, + { + "epoch": 0.03553178847296495, + "grad_norm": 1.039314701406848, + "learning_rate": 1.1801980198019803e-05, + "loss": 0.9831, + "num_tokens": 1249788844.0, + "step": 299 + }, + { + "epoch": 0.035650623885918005, + "grad_norm": 0.7317367741341476, + "learning_rate": 1.1841584158415842e-05, + "loss": 0.9783, + "num_tokens": 1253977821.0, + "step": 300 + }, + { + "epoch": 0.03576945929887106, + "grad_norm": 1.1443046134066197, + "learning_rate": 1.1881188118811881e-05, + "loss": 0.9829, + "num_tokens": 1258151069.0, + "step": 301 + }, + { + "epoch": 0.03588829471182412, + "grad_norm": 0.6733196347828493, + "learning_rate": 1.1920792079207922e-05, + "loss": 1.032, + "num_tokens": 1262340352.0, + "step": 302 + }, + { + "epoch": 0.03600713012477719, + "grad_norm": 0.8493720818190518, + "learning_rate": 1.1960396039603961e-05, + "loss": 1.0196, + "num_tokens": 1266528913.0, + "step": 303 + }, + { + "epoch": 0.036125965537730245, + "grad_norm": 0.8176063067958872, + "learning_rate": 1.2e-05, + "loss": 1.0133, + "num_tokens": 1270719530.0, + "step": 304 + }, + { + "epoch": 0.036244800950683304, + "grad_norm": 0.8810404200121977, + "learning_rate": 1.203960396039604e-05, + "loss": 0.9894, + "num_tokens": 1274908932.0, + "step": 305 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 1.060324378998406, + "learning_rate": 1.207920792079208e-05, + "loss": 0.9993, + "num_tokens": 1279099030.0, + "step": 306 + }, + { + "epoch": 0.03648247177658942, + "grad_norm": 0.6959978889308793, + "learning_rate": 1.211881188118812e-05, + "loss": 1.0182, + "num_tokens": 1283287711.0, + "step": 307 + }, + { + "epoch": 0.036601307189542485, + "grad_norm": 0.8415900260977608, + "learning_rate": 1.2158415841584158e-05, + "loss": 1.0121, + "num_tokens": 1287472165.0, + "step": 308 + }, + { + "epoch": 0.036720142602495544, + "grad_norm": 0.8757485173618159, + "learning_rate": 1.21980198019802e-05, + "loss": 1.014, + "num_tokens": 1291663020.0, + "step": 309 + }, + { + "epoch": 0.0368389780154486, + "grad_norm": 1.0074787045035172, + "learning_rate": 1.2237623762376238e-05, + "loss": 0.9726, + "num_tokens": 1295819145.0, + "step": 310 + }, + { + "epoch": 0.03695781342840166, + "grad_norm": 0.8434293536983399, + "learning_rate": 1.2277227722772278e-05, + "loss": 1.0273, + "num_tokens": 1300008004.0, + "step": 311 + }, + { + "epoch": 0.037076648841354726, + "grad_norm": 0.817195505528395, + "learning_rate": 1.2316831683168318e-05, + "loss": 0.9914, + "num_tokens": 1304199296.0, + "step": 312 + }, + { + "epoch": 0.037195484254307784, + "grad_norm": 0.9345861445984438, + "learning_rate": 1.2356435643564358e-05, + "loss": 1.0148, + "num_tokens": 1308388323.0, + "step": 313 + }, + { + "epoch": 0.03731431966726084, + "grad_norm": 0.9258263172273893, + "learning_rate": 1.2396039603960397e-05, + "loss": 1.0079, + "num_tokens": 1312577656.0, + "step": 314 + }, + { + "epoch": 0.0374331550802139, + "grad_norm": 0.8453836072273264, + "learning_rate": 1.2435643564356437e-05, + "loss": 1.0407, + "num_tokens": 1316764962.0, + "step": 315 + }, + { + "epoch": 0.037551990493166966, + "grad_norm": 0.9321831818940307, + "learning_rate": 1.2475247524752477e-05, + "loss": 0.9841, + "num_tokens": 1320952051.0, + "step": 316 + }, + { + "epoch": 0.037670825906120024, + "grad_norm": 0.8447057770893341, + "learning_rate": 1.2514851485148516e-05, + "loss": 1.0263, + "num_tokens": 1325123995.0, + "step": 317 + }, + { + "epoch": 0.03778966131907308, + "grad_norm": 0.8267222718201922, + "learning_rate": 1.2554455445544555e-05, + "loss": 1.0848, + "num_tokens": 1329286282.0, + "step": 318 + }, + { + "epoch": 0.03790849673202614, + "grad_norm": 0.9466046120622011, + "learning_rate": 1.2594059405940596e-05, + "loss": 1.0279, + "num_tokens": 1333476194.0, + "step": 319 + }, + { + "epoch": 0.038027332144979206, + "grad_norm": 0.8968982584112767, + "learning_rate": 1.2633663366336635e-05, + "loss": 1.0019, + "num_tokens": 1337656905.0, + "step": 320 + }, + { + "epoch": 0.038146167557932265, + "grad_norm": 0.8304799284104998, + "learning_rate": 1.2673267326732674e-05, + "loss": 1.0415, + "num_tokens": 1341847298.0, + "step": 321 + }, + { + "epoch": 0.03826500297088532, + "grad_norm": 1.1429024478138583, + "learning_rate": 1.2712871287128715e-05, + "loss": 1.0042, + "num_tokens": 1346028806.0, + "step": 322 + }, + { + "epoch": 0.03838383838383838, + "grad_norm": 0.6310515062348399, + "learning_rate": 1.2752475247524754e-05, + "loss": 1.0438, + "num_tokens": 1350218086.0, + "step": 323 + }, + { + "epoch": 0.038502673796791446, + "grad_norm": 0.9013586765119171, + "learning_rate": 1.2792079207920793e-05, + "loss": 1.0053, + "num_tokens": 1354388534.0, + "step": 324 + }, + { + "epoch": 0.038621509209744505, + "grad_norm": 0.849574079157539, + "learning_rate": 1.2831683168316834e-05, + "loss": 1.0005, + "num_tokens": 1358511812.0, + "step": 325 + }, + { + "epoch": 0.03874034462269756, + "grad_norm": 1.0694301946134852, + "learning_rate": 1.2871287128712873e-05, + "loss": 1.0286, + "num_tokens": 1362673034.0, + "step": 326 + }, + { + "epoch": 0.03885918003565062, + "grad_norm": 0.8339709169050344, + "learning_rate": 1.2910891089108912e-05, + "loss": 1.0212, + "num_tokens": 1366861946.0, + "step": 327 + }, + { + "epoch": 0.03897801544860369, + "grad_norm": 0.9911903176211008, + "learning_rate": 1.2950495049504951e-05, + "loss": 1.0467, + "num_tokens": 1371028434.0, + "step": 328 + }, + { + "epoch": 0.039096850861556745, + "grad_norm": 0.7537014336869483, + "learning_rate": 1.2990099009900992e-05, + "loss": 0.987, + "num_tokens": 1375217300.0, + "step": 329 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 0.8788145428140455, + "learning_rate": 1.3029702970297031e-05, + "loss": 1.0051, + "num_tokens": 1379377438.0, + "step": 330 + }, + { + "epoch": 0.03933452168746286, + "grad_norm": 0.9603456261287328, + "learning_rate": 1.306930693069307e-05, + "loss": 1.0137, + "num_tokens": 1383566549.0, + "step": 331 + }, + { + "epoch": 0.03945335710041593, + "grad_norm": 0.8105797901165799, + "learning_rate": 1.3108910891089111e-05, + "loss": 1.0031, + "num_tokens": 1387757460.0, + "step": 332 + }, + { + "epoch": 0.039572192513368985, + "grad_norm": 1.09807402978249, + "learning_rate": 1.314851485148515e-05, + "loss": 1.0233, + "num_tokens": 1391917533.0, + "step": 333 + }, + { + "epoch": 0.039691027926322044, + "grad_norm": 0.868894928453025, + "learning_rate": 1.318811881188119e-05, + "loss": 0.9596, + "num_tokens": 1396081627.0, + "step": 334 + }, + { + "epoch": 0.0398098633392751, + "grad_norm": 0.9353812368109335, + "learning_rate": 1.322772277227723e-05, + "loss": 1.0169, + "num_tokens": 1400272063.0, + "step": 335 + }, + { + "epoch": 0.03992869875222817, + "grad_norm": 0.8791612780518521, + "learning_rate": 1.326732673267327e-05, + "loss": 0.979, + "num_tokens": 1404462448.0, + "step": 336 + }, + { + "epoch": 0.040047534165181226, + "grad_norm": 0.9805451780794371, + "learning_rate": 1.3306930693069308e-05, + "loss": 1.0171, + "num_tokens": 1408632031.0, + "step": 337 + }, + { + "epoch": 0.040166369578134284, + "grad_norm": 0.9990717893207497, + "learning_rate": 1.334653465346535e-05, + "loss": 1.013, + "num_tokens": 1412821953.0, + "step": 338 + }, + { + "epoch": 0.04028520499108734, + "grad_norm": 0.7558714535155148, + "learning_rate": 1.3386138613861388e-05, + "loss": 1.0159, + "num_tokens": 1416992995.0, + "step": 339 + }, + { + "epoch": 0.04040404040404041, + "grad_norm": 0.8799046069677313, + "learning_rate": 1.3425742574257427e-05, + "loss": 0.9915, + "num_tokens": 1421160373.0, + "step": 340 + }, + { + "epoch": 0.040522875816993466, + "grad_norm": 0.9778253388305075, + "learning_rate": 1.3465346534653467e-05, + "loss": 0.9974, + "num_tokens": 1425312789.0, + "step": 341 + }, + { + "epoch": 0.040641711229946524, + "grad_norm": 0.7156494188736517, + "learning_rate": 1.3504950495049507e-05, + "loss": 1.0261, + "num_tokens": 1429465053.0, + "step": 342 + }, + { + "epoch": 0.04076054664289958, + "grad_norm": 0.9764217108100968, + "learning_rate": 1.3544554455445546e-05, + "loss": 1.0469, + "num_tokens": 1433654258.0, + "step": 343 + }, + { + "epoch": 0.04087938205585264, + "grad_norm": 0.7517912123691523, + "learning_rate": 1.3584158415841586e-05, + "loss": 0.9913, + "num_tokens": 1437840065.0, + "step": 344 + }, + { + "epoch": 0.040998217468805706, + "grad_norm": 1.0107358966199271, + "learning_rate": 1.3623762376237626e-05, + "loss": 0.9912, + "num_tokens": 1442001975.0, + "step": 345 + }, + { + "epoch": 0.041117052881758764, + "grad_norm": 0.9038423634041649, + "learning_rate": 1.3663366336633666e-05, + "loss": 1.0143, + "num_tokens": 1446191120.0, + "step": 346 + }, + { + "epoch": 0.04123588829471182, + "grad_norm": 0.7962375342681128, + "learning_rate": 1.3702970297029705e-05, + "loss": 0.9601, + "num_tokens": 1450371265.0, + "step": 347 + }, + { + "epoch": 0.04135472370766488, + "grad_norm": 0.9274307305511039, + "learning_rate": 1.3742574257425745e-05, + "loss": 0.9747, + "num_tokens": 1454560071.0, + "step": 348 + }, + { + "epoch": 0.041473559120617946, + "grad_norm": 0.7075989842746666, + "learning_rate": 1.3782178217821783e-05, + "loss": 0.9698, + "num_tokens": 1458749192.0, + "step": 349 + }, + { + "epoch": 0.041592394533571005, + "grad_norm": 1.0098271943461812, + "learning_rate": 1.3821782178217822e-05, + "loss": 0.9941, + "num_tokens": 1462938479.0, + "step": 350 + }, + { + "epoch": 0.04171122994652406, + "grad_norm": 0.9295176889513583, + "learning_rate": 1.3861386138613861e-05, + "loss": 1.0127, + "num_tokens": 1467108116.0, + "step": 351 + }, + { + "epoch": 0.04183006535947712, + "grad_norm": 1.143752443439843, + "learning_rate": 1.3900990099009902e-05, + "loss": 0.9821, + "num_tokens": 1471274779.0, + "step": 352 + }, + { + "epoch": 0.04194890077243019, + "grad_norm": 0.6700050731740764, + "learning_rate": 1.3940594059405941e-05, + "loss": 1.0091, + "num_tokens": 1475462868.0, + "step": 353 + }, + { + "epoch": 0.042067736185383245, + "grad_norm": 1.2437518084770816, + "learning_rate": 1.398019801980198e-05, + "loss": 0.9759, + "num_tokens": 1479647024.0, + "step": 354 + }, + { + "epoch": 0.0421865715983363, + "grad_norm": 0.7514422233670018, + "learning_rate": 1.401980198019802e-05, + "loss": 0.9803, + "num_tokens": 1483818348.0, + "step": 355 + }, + { + "epoch": 0.04230540701128936, + "grad_norm": 1.1169216359070304, + "learning_rate": 1.405940594059406e-05, + "loss": 1.019, + "num_tokens": 1488006980.0, + "step": 356 + }, + { + "epoch": 0.04242424242424243, + "grad_norm": 0.8258985473096501, + "learning_rate": 1.40990099009901e-05, + "loss": 1.0004, + "num_tokens": 1492162990.0, + "step": 357 + }, + { + "epoch": 0.042543077837195485, + "grad_norm": 0.9580535886083303, + "learning_rate": 1.4138613861386138e-05, + "loss": 0.989, + "num_tokens": 1496352416.0, + "step": 358 + }, + { + "epoch": 0.04266191325014854, + "grad_norm": 0.9210813993303903, + "learning_rate": 1.417821782178218e-05, + "loss": 1.0002, + "num_tokens": 1500541990.0, + "step": 359 + }, + { + "epoch": 0.0427807486631016, + "grad_norm": 1.007281384828844, + "learning_rate": 1.4217821782178218e-05, + "loss": 0.9774, + "num_tokens": 1504730797.0, + "step": 360 + }, + { + "epoch": 0.04289958407605467, + "grad_norm": 0.8784017061699159, + "learning_rate": 1.4257425742574257e-05, + "loss": 1.031, + "num_tokens": 1508920869.0, + "step": 361 + }, + { + "epoch": 0.043018419489007725, + "grad_norm": 1.0336808854971116, + "learning_rate": 1.4297029702970298e-05, + "loss": 1.0076, + "num_tokens": 1513096463.0, + "step": 362 + }, + { + "epoch": 0.043137254901960784, + "grad_norm": 0.999963964885968, + "learning_rate": 1.4336633663366337e-05, + "loss": 1.0091, + "num_tokens": 1517284727.0, + "step": 363 + }, + { + "epoch": 0.04325609031491384, + "grad_norm": 0.8547148956773303, + "learning_rate": 1.4376237623762377e-05, + "loss": 1.0066, + "num_tokens": 1521474191.0, + "step": 364 + }, + { + "epoch": 0.04337492572786691, + "grad_norm": 0.9719245116052349, + "learning_rate": 1.4415841584158416e-05, + "loss": 0.9846, + "num_tokens": 1525664137.0, + "step": 365 + }, + { + "epoch": 0.043493761140819966, + "grad_norm": 0.6819096747362497, + "learning_rate": 1.4455445544554456e-05, + "loss": 1.0282, + "num_tokens": 1529853032.0, + "step": 366 + }, + { + "epoch": 0.043612596553773024, + "grad_norm": 1.0862496541498154, + "learning_rate": 1.4495049504950496e-05, + "loss": 0.9742, + "num_tokens": 1534010894.0, + "step": 367 + }, + { + "epoch": 0.04373143196672608, + "grad_norm": 1.0016022849590365, + "learning_rate": 1.4534653465346535e-05, + "loss": 1.056, + "num_tokens": 1538201274.0, + "step": 368 + }, + { + "epoch": 0.04385026737967915, + "grad_norm": 0.8162730757367173, + "learning_rate": 1.4574257425742576e-05, + "loss": 1.0554, + "num_tokens": 1542377248.0, + "step": 369 + }, + { + "epoch": 0.043969102792632206, + "grad_norm": 1.176573198336089, + "learning_rate": 1.4613861386138615e-05, + "loss": 1.0056, + "num_tokens": 1546544525.0, + "step": 370 + }, + { + "epoch": 0.044087938205585264, + "grad_norm": 0.8511550077303873, + "learning_rate": 1.4653465346534654e-05, + "loss": 1.0145, + "num_tokens": 1550734110.0, + "step": 371 + }, + { + "epoch": 0.04420677361853832, + "grad_norm": 0.8606862845695202, + "learning_rate": 1.4693069306930695e-05, + "loss": 1.0295, + "num_tokens": 1554921375.0, + "step": 372 + }, + { + "epoch": 0.04432560903149139, + "grad_norm": 0.9809033469614155, + "learning_rate": 1.4732673267326734e-05, + "loss": 0.9956, + "num_tokens": 1559109675.0, + "step": 373 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.9056185141808526, + "learning_rate": 1.4772277227722773e-05, + "loss": 1.0211, + "num_tokens": 1563298804.0, + "step": 374 + }, + { + "epoch": 0.044563279857397504, + "grad_norm": 1.0439970609133147, + "learning_rate": 1.4811881188118814e-05, + "loss": 1.02, + "num_tokens": 1567487915.0, + "step": 375 + }, + { + "epoch": 0.04468211527035056, + "grad_norm": 0.8100576875190152, + "learning_rate": 1.4851485148514853e-05, + "loss": 1.0376, + "num_tokens": 1571640542.0, + "step": 376 + }, + { + "epoch": 0.04480095068330362, + "grad_norm": 1.1123461976955566, + "learning_rate": 1.4891089108910892e-05, + "loss": 0.9945, + "num_tokens": 1575830617.0, + "step": 377 + }, + { + "epoch": 0.044919786096256686, + "grad_norm": 0.9012406921662964, + "learning_rate": 1.4930693069306931e-05, + "loss": 1.0028, + "num_tokens": 1580019878.0, + "step": 378 + }, + { + "epoch": 0.045038621509209745, + "grad_norm": 0.8845301476380251, + "learning_rate": 1.4970297029702972e-05, + "loss": 1.0088, + "num_tokens": 1584209522.0, + "step": 379 + }, + { + "epoch": 0.0451574569221628, + "grad_norm": 1.0391407683022333, + "learning_rate": 1.5009900990099011e-05, + "loss": 1.0264, + "num_tokens": 1588398485.0, + "step": 380 + }, + { + "epoch": 0.04527629233511586, + "grad_norm": 0.8634699405918895, + "learning_rate": 1.504950495049505e-05, + "loss": 0.9891, + "num_tokens": 1592587815.0, + "step": 381 + }, + { + "epoch": 0.04539512774806893, + "grad_norm": 0.9541683362729494, + "learning_rate": 1.5089108910891091e-05, + "loss": 1.0137, + "num_tokens": 1596777010.0, + "step": 382 + }, + { + "epoch": 0.045513963161021985, + "grad_norm": 1.0337313041548215, + "learning_rate": 1.512871287128713e-05, + "loss": 1.0142, + "num_tokens": 1600939982.0, + "step": 383 + }, + { + "epoch": 0.04563279857397504, + "grad_norm": 0.9062050074708509, + "learning_rate": 1.516831683168317e-05, + "loss": 1.0049, + "num_tokens": 1605103650.0, + "step": 384 + }, + { + "epoch": 0.0457516339869281, + "grad_norm": 1.064496120606584, + "learning_rate": 1.520792079207921e-05, + "loss": 0.9759, + "num_tokens": 1609292276.0, + "step": 385 + }, + { + "epoch": 0.04587046939988117, + "grad_norm": 0.9062265233229498, + "learning_rate": 1.5247524752475249e-05, + "loss": 0.997, + "num_tokens": 1613468178.0, + "step": 386 + }, + { + "epoch": 0.045989304812834225, + "grad_norm": 0.9231703818477335, + "learning_rate": 1.5287128712871288e-05, + "loss": 0.9832, + "num_tokens": 1617646500.0, + "step": 387 + }, + { + "epoch": 0.046108140225787284, + "grad_norm": 0.9435917542985688, + "learning_rate": 1.5326732673267327e-05, + "loss": 1.009, + "num_tokens": 1621787654.0, + "step": 388 + }, + { + "epoch": 0.04622697563874034, + "grad_norm": 0.8875506323117883, + "learning_rate": 1.5366336633663367e-05, + "loss": 1.0296, + "num_tokens": 1625976363.0, + "step": 389 + }, + { + "epoch": 0.04634581105169341, + "grad_norm": 0.856343197278895, + "learning_rate": 1.540594059405941e-05, + "loss": 0.984, + "num_tokens": 1630165141.0, + "step": 390 + }, + { + "epoch": 0.046464646464646465, + "grad_norm": 1.2145878634291678, + "learning_rate": 1.5445544554455448e-05, + "loss": 1.015, + "num_tokens": 1634321416.0, + "step": 391 + }, + { + "epoch": 0.046583481877599524, + "grad_norm": 0.7668787322419878, + "learning_rate": 1.5485148514851487e-05, + "loss": 1.0319, + "num_tokens": 1638510844.0, + "step": 392 + }, + { + "epoch": 0.04670231729055258, + "grad_norm": 1.2987033028964479, + "learning_rate": 1.5524752475247526e-05, + "loss": 0.9995, + "num_tokens": 1642700445.0, + "step": 393 + }, + { + "epoch": 0.04682115270350565, + "grad_norm": 0.9709061263605747, + "learning_rate": 1.5564356435643566e-05, + "loss": 1.0635, + "num_tokens": 1646879033.0, + "step": 394 + }, + { + "epoch": 0.046939988116458706, + "grad_norm": 0.969960269657012, + "learning_rate": 1.5603960396039605e-05, + "loss": 0.9847, + "num_tokens": 1651044638.0, + "step": 395 + }, + { + "epoch": 0.047058823529411764, + "grad_norm": 0.9174598026299835, + "learning_rate": 1.5643564356435644e-05, + "loss": 0.9614, + "num_tokens": 1655233666.0, + "step": 396 + }, + { + "epoch": 0.04717765894236482, + "grad_norm": 1.0022417201886824, + "learning_rate": 1.5683168316831686e-05, + "loss": 0.9976, + "num_tokens": 1659420029.0, + "step": 397 + }, + { + "epoch": 0.04729649435531789, + "grad_norm": 0.8895192197047208, + "learning_rate": 1.5722772277227725e-05, + "loss": 1.0033, + "num_tokens": 1663580463.0, + "step": 398 + }, + { + "epoch": 0.047415329768270946, + "grad_norm": 0.9665884241318633, + "learning_rate": 1.5762376237623765e-05, + "loss": 1.0394, + "num_tokens": 1667747292.0, + "step": 399 + }, + { + "epoch": 0.047534165181224004, + "grad_norm": 0.9471801644567709, + "learning_rate": 1.5801980198019804e-05, + "loss": 0.9944, + "num_tokens": 1671937831.0, + "step": 400 + }, + { + "epoch": 0.04765300059417706, + "grad_norm": 1.0998093834645193, + "learning_rate": 1.5841584158415843e-05, + "loss": 1.0023, + "num_tokens": 1676126729.0, + "step": 401 + }, + { + "epoch": 0.04777183600713013, + "grad_norm": 0.8757340425473612, + "learning_rate": 1.5881188118811882e-05, + "loss": 1.0504, + "num_tokens": 1680313733.0, + "step": 402 + }, + { + "epoch": 0.047890671420083186, + "grad_norm": 1.052149541412391, + "learning_rate": 1.5920792079207924e-05, + "loss": 0.9704, + "num_tokens": 1684503999.0, + "step": 403 + }, + { + "epoch": 0.048009506833036245, + "grad_norm": 0.8936613044387008, + "learning_rate": 1.5960396039603964e-05, + "loss": 1.0183, + "num_tokens": 1688674381.0, + "step": 404 + }, + { + "epoch": 0.0481283422459893, + "grad_norm": 0.9614231729410868, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.0097, + "num_tokens": 1692863435.0, + "step": 405 + }, + { + "epoch": 0.04824717765894237, + "grad_norm": 1.0264197318073163, + "learning_rate": 1.6039603960396042e-05, + "loss": 1.0049, + "num_tokens": 1697051930.0, + "step": 406 + }, + { + "epoch": 0.048366013071895426, + "grad_norm": 1.200005049385442, + "learning_rate": 1.607920792079208e-05, + "loss": 0.9855, + "num_tokens": 1701210407.0, + "step": 407 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 0.830440800423891, + "learning_rate": 1.611881188118812e-05, + "loss": 0.9768, + "num_tokens": 1705350829.0, + "step": 408 + }, + { + "epoch": 0.04860368389780154, + "grad_norm": 1.067297313099946, + "learning_rate": 1.615841584158416e-05, + "loss": 1.0125, + "num_tokens": 1709521309.0, + "step": 409 + }, + { + "epoch": 0.0487225193107546, + "grad_norm": 0.8938314228655156, + "learning_rate": 1.61980198019802e-05, + "loss": 1.0121, + "num_tokens": 1713710616.0, + "step": 410 + }, + { + "epoch": 0.04884135472370767, + "grad_norm": 1.2488303233486089, + "learning_rate": 1.623762376237624e-05, + "loss": 1.0153, + "num_tokens": 1717899405.0, + "step": 411 + }, + { + "epoch": 0.048960190136660725, + "grad_norm": 0.7773872528569518, + "learning_rate": 1.6277227722772277e-05, + "loss": 0.9892, + "num_tokens": 1722090003.0, + "step": 412 + }, + { + "epoch": 0.04907902554961378, + "grad_norm": 1.2604049607422008, + "learning_rate": 1.6316831683168316e-05, + "loss": 1.0077, + "num_tokens": 1726279313.0, + "step": 413 + }, + { + "epoch": 0.04919786096256684, + "grad_norm": 0.8178805868881734, + "learning_rate": 1.6356435643564358e-05, + "loss": 0.9598, + "num_tokens": 1730443453.0, + "step": 414 + }, + { + "epoch": 0.04931669637551991, + "grad_norm": 1.3097663483808555, + "learning_rate": 1.6396039603960397e-05, + "loss": 0.9966, + "num_tokens": 1734623177.0, + "step": 415 + }, + { + "epoch": 0.049435531788472965, + "grad_norm": 0.771168032965109, + "learning_rate": 1.6435643564356436e-05, + "loss": 1.0177, + "num_tokens": 1738778432.0, + "step": 416 + }, + { + "epoch": 0.049554367201426024, + "grad_norm": 1.349741937509474, + "learning_rate": 1.6475247524752476e-05, + "loss": 0.9928, + "num_tokens": 1742967407.0, + "step": 417 + }, + { + "epoch": 0.04967320261437908, + "grad_norm": 0.9987509170660547, + "learning_rate": 1.6514851485148515e-05, + "loss": 1.0363, + "num_tokens": 1747146177.0, + "step": 418 + }, + { + "epoch": 0.04979203802733215, + "grad_norm": 1.0692382923370076, + "learning_rate": 1.6554455445544554e-05, + "loss": 1.0107, + "num_tokens": 1751334468.0, + "step": 419 + }, + { + "epoch": 0.049910873440285206, + "grad_norm": 1.0413630206245814, + "learning_rate": 1.6594059405940596e-05, + "loss": 0.9884, + "num_tokens": 1755482469.0, + "step": 420 + }, + { + "epoch": 0.050029708853238264, + "grad_norm": 0.8834931380576948, + "learning_rate": 1.6633663366336635e-05, + "loss": 1.0073, + "num_tokens": 1759656013.0, + "step": 421 + }, + { + "epoch": 0.05014854426619132, + "grad_norm": 1.101729720717549, + "learning_rate": 1.6673267326732675e-05, + "loss": 1.0314, + "num_tokens": 1763824771.0, + "step": 422 + }, + { + "epoch": 0.05026737967914439, + "grad_norm": 0.8456828871831572, + "learning_rate": 1.6712871287128714e-05, + "loss": 1.0252, + "num_tokens": 1768014156.0, + "step": 423 + }, + { + "epoch": 0.050386215092097446, + "grad_norm": 0.918813030679547, + "learning_rate": 1.6752475247524753e-05, + "loss": 1.0405, + "num_tokens": 1772185384.0, + "step": 424 + }, + { + "epoch": 0.050505050505050504, + "grad_norm": 1.065248143328181, + "learning_rate": 1.6792079207920792e-05, + "loss": 0.9422, + "num_tokens": 1776360637.0, + "step": 425 + }, + { + "epoch": 0.05062388591800356, + "grad_norm": 0.8919751725161477, + "learning_rate": 1.683168316831683e-05, + "loss": 0.9994, + "num_tokens": 1780545776.0, + "step": 426 + }, + { + "epoch": 0.05074272133095663, + "grad_norm": 1.0400981877134023, + "learning_rate": 1.6871287128712874e-05, + "loss": 1.0083, + "num_tokens": 1784724851.0, + "step": 427 + }, + { + "epoch": 0.050861556743909686, + "grad_norm": 0.9738129978359162, + "learning_rate": 1.6910891089108913e-05, + "loss": 1.0059, + "num_tokens": 1788875723.0, + "step": 428 + }, + { + "epoch": 0.050980392156862744, + "grad_norm": 1.1436741697647637, + "learning_rate": 1.6950495049504952e-05, + "loss": 0.9661, + "num_tokens": 1793050606.0, + "step": 429 + }, + { + "epoch": 0.0510992275698158, + "grad_norm": 1.0642127888692825, + "learning_rate": 1.699009900990099e-05, + "loss": 0.9805, + "num_tokens": 1797239911.0, + "step": 430 + }, + { + "epoch": 0.05121806298276887, + "grad_norm": 1.0070722746951517, + "learning_rate": 1.702970297029703e-05, + "loss": 0.9802, + "num_tokens": 1801407687.0, + "step": 431 + }, + { + "epoch": 0.051336898395721926, + "grad_norm": 1.0127441099609888, + "learning_rate": 1.706930693069307e-05, + "loss": 1.033, + "num_tokens": 1805595940.0, + "step": 432 + }, + { + "epoch": 0.051455733808674985, + "grad_norm": 0.7587124817850402, + "learning_rate": 1.7108910891089108e-05, + "loss": 1.0179, + "num_tokens": 1809755887.0, + "step": 433 + }, + { + "epoch": 0.05157456922162804, + "grad_norm": 0.9917974898941643, + "learning_rate": 1.714851485148515e-05, + "loss": 0.9991, + "num_tokens": 1813943977.0, + "step": 434 + }, + { + "epoch": 0.05169340463458111, + "grad_norm": 1.1014221135897115, + "learning_rate": 1.718811881188119e-05, + "loss": 0.9713, + "num_tokens": 1818099011.0, + "step": 435 + }, + { + "epoch": 0.05181224004753417, + "grad_norm": 0.9748573620069786, + "learning_rate": 1.722772277227723e-05, + "loss": 1.049, + "num_tokens": 1822286484.0, + "step": 436 + }, + { + "epoch": 0.051931075460487225, + "grad_norm": 1.2036042706836567, + "learning_rate": 1.7267326732673268e-05, + "loss": 1.0114, + "num_tokens": 1826468967.0, + "step": 437 + }, + { + "epoch": 0.05204991087344028, + "grad_norm": 0.8612208587183801, + "learning_rate": 1.7306930693069307e-05, + "loss": 1.024, + "num_tokens": 1830658300.0, + "step": 438 + }, + { + "epoch": 0.05216874628639335, + "grad_norm": 1.0696163842330118, + "learning_rate": 1.7346534653465346e-05, + "loss": 0.9815, + "num_tokens": 1834795009.0, + "step": 439 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 0.9447071690755646, + "learning_rate": 1.738613861386139e-05, + "loss": 0.9855, + "num_tokens": 1838983809.0, + "step": 440 + }, + { + "epoch": 0.052406417112299465, + "grad_norm": 0.94094206047364, + "learning_rate": 1.7425742574257428e-05, + "loss": 1.0134, + "num_tokens": 1843173582.0, + "step": 441 + }, + { + "epoch": 0.052525252525252523, + "grad_norm": 0.884924138667476, + "learning_rate": 1.7465346534653467e-05, + "loss": 1.0354, + "num_tokens": 1847345564.0, + "step": 442 + }, + { + "epoch": 0.05264408793820558, + "grad_norm": 0.9960983761998882, + "learning_rate": 1.7504950495049506e-05, + "loss": 0.9913, + "num_tokens": 1851496621.0, + "step": 443 + }, + { + "epoch": 0.05276292335115865, + "grad_norm": 0.9482952471248478, + "learning_rate": 1.7544554455445545e-05, + "loss": 0.9857, + "num_tokens": 1855684265.0, + "step": 444 + }, + { + "epoch": 0.052881758764111705, + "grad_norm": 0.847747472492336, + "learning_rate": 1.7584158415841585e-05, + "loss": 0.9988, + "num_tokens": 1859873176.0, + "step": 445 + }, + { + "epoch": 0.053000594177064764, + "grad_norm": 1.0494609834136468, + "learning_rate": 1.7623762376237624e-05, + "loss": 1.0141, + "num_tokens": 1864062408.0, + "step": 446 + }, + { + "epoch": 0.05311942959001782, + "grad_norm": 0.9502129004345691, + "learning_rate": 1.7663366336633666e-05, + "loss": 1.0163, + "num_tokens": 1868252186.0, + "step": 447 + }, + { + "epoch": 0.05323826500297089, + "grad_norm": 1.2180572510172398, + "learning_rate": 1.7702970297029705e-05, + "loss": 1.0044, + "num_tokens": 1872436822.0, + "step": 448 + }, + { + "epoch": 0.053357100415923946, + "grad_norm": 0.9464219376855172, + "learning_rate": 1.7742574257425744e-05, + "loss": 0.9849, + "num_tokens": 1876624306.0, + "step": 449 + }, + { + "epoch": 0.053475935828877004, + "grad_norm": 0.830863738957813, + "learning_rate": 1.7782178217821784e-05, + "loss": 0.9759, + "num_tokens": 1880814283.0, + "step": 450 + }, + { + "epoch": 0.05359477124183006, + "grad_norm": 0.9366567380299547, + "learning_rate": 1.7821782178217823e-05, + "loss": 1.0074, + "num_tokens": 1884979234.0, + "step": 451 + }, + { + "epoch": 0.05371360665478313, + "grad_norm": 1.1601979297053349, + "learning_rate": 1.7861386138613862e-05, + "loss": 1.0162, + "num_tokens": 1889118686.0, + "step": 452 + }, + { + "epoch": 0.053832442067736186, + "grad_norm": 0.8264203436263945, + "learning_rate": 1.7900990099009904e-05, + "loss": 0.967, + "num_tokens": 1893304283.0, + "step": 453 + }, + { + "epoch": 0.053951277480689244, + "grad_norm": 0.9332739341026421, + "learning_rate": 1.7940594059405943e-05, + "loss": 1.0265, + "num_tokens": 1897493333.0, + "step": 454 + }, + { + "epoch": 0.0540701128936423, + "grad_norm": 1.0516294503648969, + "learning_rate": 1.7980198019801983e-05, + "loss": 1.0332, + "num_tokens": 1901684250.0, + "step": 455 + }, + { + "epoch": 0.05418894830659537, + "grad_norm": 1.340052195873645, + "learning_rate": 1.8019801980198022e-05, + "loss": 0.9682, + "num_tokens": 1905873776.0, + "step": 456 + }, + { + "epoch": 0.054307783719548426, + "grad_norm": 0.9248093320556678, + "learning_rate": 1.805940594059406e-05, + "loss": 0.9945, + "num_tokens": 1910064184.0, + "step": 457 + }, + { + "epoch": 0.054426619132501484, + "grad_norm": 1.2611547857782448, + "learning_rate": 1.80990099009901e-05, + "loss": 1.0161, + "num_tokens": 1914238684.0, + "step": 458 + }, + { + "epoch": 0.05454545454545454, + "grad_norm": 0.7417447052662034, + "learning_rate": 1.813861386138614e-05, + "loss": 0.9791, + "num_tokens": 1918428604.0, + "step": 459 + }, + { + "epoch": 0.05466428995840761, + "grad_norm": 1.162663008723681, + "learning_rate": 1.817821782178218e-05, + "loss": 0.9867, + "num_tokens": 1922576813.0, + "step": 460 + }, + { + "epoch": 0.054783125371360666, + "grad_norm": 0.9956240920387516, + "learning_rate": 1.821782178217822e-05, + "loss": 0.9929, + "num_tokens": 1926766698.0, + "step": 461 + }, + { + "epoch": 0.054901960784313725, + "grad_norm": 1.054723021197826, + "learning_rate": 1.825742574257426e-05, + "loss": 0.9778, + "num_tokens": 1930950071.0, + "step": 462 + }, + { + "epoch": 0.05502079619726678, + "grad_norm": 1.2227687973998707, + "learning_rate": 1.82970297029703e-05, + "loss": 0.9818, + "num_tokens": 1935138091.0, + "step": 463 + }, + { + "epoch": 0.05513963161021985, + "grad_norm": 0.6942470598406394, + "learning_rate": 1.8336633663366338e-05, + "loss": 0.9989, + "num_tokens": 1939326099.0, + "step": 464 + }, + { + "epoch": 0.05525846702317291, + "grad_norm": 1.2759092959078266, + "learning_rate": 1.8376237623762377e-05, + "loss": 0.9955, + "num_tokens": 1943513527.0, + "step": 465 + }, + { + "epoch": 0.055377302436125965, + "grad_norm": 1.0742527381247933, + "learning_rate": 1.841584158415842e-05, + "loss": 0.9718, + "num_tokens": 1947701652.0, + "step": 466 + }, + { + "epoch": 0.05549613784907902, + "grad_norm": 1.0293258403266494, + "learning_rate": 1.845544554455446e-05, + "loss": 0.9782, + "num_tokens": 1951890285.0, + "step": 467 + }, + { + "epoch": 0.05561497326203209, + "grad_norm": 1.117631908133165, + "learning_rate": 1.8495049504950498e-05, + "loss": 1.0135, + "num_tokens": 1956063672.0, + "step": 468 + }, + { + "epoch": 0.05573380867498515, + "grad_norm": 1.023149470844736, + "learning_rate": 1.8534653465346537e-05, + "loss": 0.9699, + "num_tokens": 1960252788.0, + "step": 469 + }, + { + "epoch": 0.055852644087938205, + "grad_norm": 0.8530381031275566, + "learning_rate": 1.8574257425742576e-05, + "loss": 1.0213, + "num_tokens": 1964442620.0, + "step": 470 + }, + { + "epoch": 0.055971479500891264, + "grad_norm": 1.2303618421678983, + "learning_rate": 1.8613861386138615e-05, + "loss": 1.0334, + "num_tokens": 1968591216.0, + "step": 471 + }, + { + "epoch": 0.05609031491384433, + "grad_norm": 0.8765373387169477, + "learning_rate": 1.8653465346534654e-05, + "loss": 0.9842, + "num_tokens": 1972772972.0, + "step": 472 + }, + { + "epoch": 0.05620915032679739, + "grad_norm": 1.0488262721530088, + "learning_rate": 1.8693069306930697e-05, + "loss": 1.011, + "num_tokens": 1976961598.0, + "step": 473 + }, + { + "epoch": 0.056327985739750445, + "grad_norm": 1.0463056912247353, + "learning_rate": 1.8732673267326736e-05, + "loss": 0.9684, + "num_tokens": 1981151630.0, + "step": 474 + }, + { + "epoch": 0.056446821152703504, + "grad_norm": 1.008820716525339, + "learning_rate": 1.8772277227722772e-05, + "loss": 0.996, + "num_tokens": 1985340280.0, + "step": 475 + }, + { + "epoch": 0.05656565656565657, + "grad_norm": 1.2476632213719439, + "learning_rate": 1.881188118811881e-05, + "loss": 1.027, + "num_tokens": 1989519003.0, + "step": 476 + }, + { + "epoch": 0.05668449197860963, + "grad_norm": 0.835947218244666, + "learning_rate": 1.8851485148514853e-05, + "loss": 0.9681, + "num_tokens": 1993707079.0, + "step": 477 + }, + { + "epoch": 0.056803327391562686, + "grad_norm": 1.0599645310435315, + "learning_rate": 1.8891089108910893e-05, + "loss": 1.0096, + "num_tokens": 1997863696.0, + "step": 478 + }, + { + "epoch": 0.056922162804515744, + "grad_norm": 0.9563941398083633, + "learning_rate": 1.8930693069306932e-05, + "loss": 1.0372, + "num_tokens": 2002020290.0, + "step": 479 + }, + { + "epoch": 0.0570409982174688, + "grad_norm": 1.0827962640253064, + "learning_rate": 1.897029702970297e-05, + "loss": 0.9968, + "num_tokens": 2006208307.0, + "step": 480 + }, + { + "epoch": 0.05715983363042187, + "grad_norm": 1.09910865418692, + "learning_rate": 1.900990099009901e-05, + "loss": 0.9901, + "num_tokens": 2010367206.0, + "step": 481 + }, + { + "epoch": 0.057278669043374926, + "grad_norm": 1.0007276104523486, + "learning_rate": 1.904950495049505e-05, + "loss": 0.9772, + "num_tokens": 2014556947.0, + "step": 482 + }, + { + "epoch": 0.057397504456327984, + "grad_norm": 1.1763349813538189, + "learning_rate": 1.9089108910891088e-05, + "loss": 1.0047, + "num_tokens": 2018695709.0, + "step": 483 + }, + { + "epoch": 0.05751633986928104, + "grad_norm": 0.8417385523942812, + "learning_rate": 1.912871287128713e-05, + "loss": 1.0044, + "num_tokens": 2022883977.0, + "step": 484 + }, + { + "epoch": 0.05763517528223411, + "grad_norm": 1.3198024255167593, + "learning_rate": 1.916831683168317e-05, + "loss": 1.0139, + "num_tokens": 2027047217.0, + "step": 485 + }, + { + "epoch": 0.057754010695187166, + "grad_norm": 0.8108321247735294, + "learning_rate": 1.920792079207921e-05, + "loss": 0.99, + "num_tokens": 2031232945.0, + "step": 486 + }, + { + "epoch": 0.057872846108140225, + "grad_norm": 1.0487951572248548, + "learning_rate": 1.9247524752475248e-05, + "loss": 0.9965, + "num_tokens": 2035407178.0, + "step": 487 + }, + { + "epoch": 0.05799168152109328, + "grad_norm": 1.0280558394592938, + "learning_rate": 1.9287128712871287e-05, + "loss": 1.0188, + "num_tokens": 2039595383.0, + "step": 488 + }, + { + "epoch": 0.05811051693404635, + "grad_norm": 1.030152033571584, + "learning_rate": 1.9326732673267326e-05, + "loss": 0.9988, + "num_tokens": 2043749798.0, + "step": 489 + }, + { + "epoch": 0.058229352346999406, + "grad_norm": 0.8175842346132645, + "learning_rate": 1.936633663366337e-05, + "loss": 0.9836, + "num_tokens": 2047939749.0, + "step": 490 + }, + { + "epoch": 0.058348187759952465, + "grad_norm": 1.0347725445503018, + "learning_rate": 1.9405940594059408e-05, + "loss": 1.0148, + "num_tokens": 2052127578.0, + "step": 491 + }, + { + "epoch": 0.05846702317290552, + "grad_norm": 0.9136395270938487, + "learning_rate": 1.9445544554455447e-05, + "loss": 1.013, + "num_tokens": 2056317717.0, + "step": 492 + }, + { + "epoch": 0.05858585858585859, + "grad_norm": 0.9776734163773239, + "learning_rate": 1.9485148514851486e-05, + "loss": 0.9675, + "num_tokens": 2060506247.0, + "step": 493 + }, + { + "epoch": 0.05870469399881165, + "grad_norm": 1.141146792214047, + "learning_rate": 1.9524752475247525e-05, + "loss": 1.0335, + "num_tokens": 2064695430.0, + "step": 494 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.9476979551176866, + "learning_rate": 1.9564356435643564e-05, + "loss": 0.9828, + "num_tokens": 2068884172.0, + "step": 495 + }, + { + "epoch": 0.05894236482471776, + "grad_norm": 1.2765610795392286, + "learning_rate": 1.9603960396039604e-05, + "loss": 1.0082, + "num_tokens": 2073069817.0, + "step": 496 + }, + { + "epoch": 0.05906120023767083, + "grad_norm": 0.8765295706122909, + "learning_rate": 1.9643564356435646e-05, + "loss": 0.9914, + "num_tokens": 2077178359.0, + "step": 497 + }, + { + "epoch": 0.05918003565062389, + "grad_norm": 1.1699865435801549, + "learning_rate": 1.9683168316831685e-05, + "loss": 0.9986, + "num_tokens": 2081367043.0, + "step": 498 + }, + { + "epoch": 0.059298871063576945, + "grad_norm": 1.101903600197394, + "learning_rate": 1.9722772277227724e-05, + "loss": 1.0055, + "num_tokens": 2085552935.0, + "step": 499 + }, + { + "epoch": 0.059417706476530004, + "grad_norm": 1.027291730368695, + "learning_rate": 1.9762376237623764e-05, + "loss": 1.0422, + "num_tokens": 2089734414.0, + "step": 500 + }, + { + "epoch": 0.05953654188948307, + "grad_norm": 1.1237705967579137, + "learning_rate": 1.9801980198019803e-05, + "loss": 0.9697, + "num_tokens": 2093922816.0, + "step": 501 + }, + { + "epoch": 0.05965537730243613, + "grad_norm": 0.8544268292122418, + "learning_rate": 1.9841584158415842e-05, + "loss": 0.9659, + "num_tokens": 2098112844.0, + "step": 502 + }, + { + "epoch": 0.059774212715389186, + "grad_norm": 1.3108465906220692, + "learning_rate": 1.9881188118811884e-05, + "loss": 1.0, + "num_tokens": 2102302874.0, + "step": 503 + }, + { + "epoch": 0.059893048128342244, + "grad_norm": 0.906125530289828, + "learning_rate": 1.9920792079207923e-05, + "loss": 1.0022, + "num_tokens": 2106492570.0, + "step": 504 + }, + { + "epoch": 0.06001188354129531, + "grad_norm": 1.1151787723335196, + "learning_rate": 1.9960396039603963e-05, + "loss": 1.0037, + "num_tokens": 2110636611.0, + "step": 505 + }, + { + "epoch": 0.06013071895424837, + "grad_norm": 0.9305436184011975, + "learning_rate": 2e-05, + "loss": 1.0265, + "num_tokens": 2114825831.0, + "step": 506 + }, + { + "epoch": 0.060249554367201426, + "grad_norm": 1.277024064944511, + "learning_rate": 1.999999983334979e-05, + "loss": 0.9902, + "num_tokens": 2119014342.0, + "step": 507 + }, + { + "epoch": 0.060368389780154484, + "grad_norm": 0.7431133951158614, + "learning_rate": 1.9999999333399146e-05, + "loss": 1.0128, + "num_tokens": 2123203194.0, + "step": 508 + }, + { + "epoch": 0.06048722519310755, + "grad_norm": 1.4543166514036685, + "learning_rate": 1.9999998500148106e-05, + "loss": 0.9789, + "num_tokens": 2127392532.0, + "step": 509 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 0.7537367919625798, + "learning_rate": 1.999999733359669e-05, + "loss": 1.0365, + "num_tokens": 2131581995.0, + "step": 510 + }, + { + "epoch": 0.060724896019013666, + "grad_norm": 1.3822944372274082, + "learning_rate": 1.999999583374494e-05, + "loss": 1.0091, + "num_tokens": 2135741409.0, + "step": 511 + }, + { + "epoch": 0.060843731431966724, + "grad_norm": 1.0737211786699694, + "learning_rate": 1.9999994000592915e-05, + "loss": 0.9995, + "num_tokens": 2139929032.0, + "step": 512 + }, + { + "epoch": 0.06096256684491978, + "grad_norm": 1.1369489425096384, + "learning_rate": 1.9999991834140683e-05, + "loss": 0.9431, + "num_tokens": 2144103410.0, + "step": 513 + }, + { + "epoch": 0.06108140225787285, + "grad_norm": 1.2970683525619, + "learning_rate": 1.9999989334388324e-05, + "loss": 1.0096, + "num_tokens": 2148292838.0, + "step": 514 + }, + { + "epoch": 0.061200237670825906, + "grad_norm": 0.9035499798118491, + "learning_rate": 1.999998650133593e-05, + "loss": 0.9715, + "num_tokens": 2152482190.0, + "step": 515 + }, + { + "epoch": 0.061319073083778965, + "grad_norm": 1.1686445131713212, + "learning_rate": 1.9999983334983606e-05, + "loss": 0.9958, + "num_tokens": 2156670693.0, + "step": 516 + }, + { + "epoch": 0.06143790849673202, + "grad_norm": 0.9378806842083853, + "learning_rate": 1.9999979835331473e-05, + "loss": 0.9829, + "num_tokens": 2160834341.0, + "step": 517 + }, + { + "epoch": 0.06155674390968509, + "grad_norm": 0.9246806814223588, + "learning_rate": 1.999997600237965e-05, + "loss": 0.988, + "num_tokens": 2165015716.0, + "step": 518 + }, + { + "epoch": 0.06167557932263815, + "grad_norm": 1.1532305903747908, + "learning_rate": 1.999997183612829e-05, + "loss": 1.0122, + "num_tokens": 2169205513.0, + "step": 519 + }, + { + "epoch": 0.061794414735591205, + "grad_norm": 0.8874284771215121, + "learning_rate": 1.9999967336577545e-05, + "loss": 0.9914, + "num_tokens": 2173395447.0, + "step": 520 + }, + { + "epoch": 0.06191325014854426, + "grad_norm": 1.1059447369460003, + "learning_rate": 1.9999962503727578e-05, + "loss": 1.0064, + "num_tokens": 2177583574.0, + "step": 521 + }, + { + "epoch": 0.06203208556149733, + "grad_norm": 1.0134838796844707, + "learning_rate": 1.999995733757857e-05, + "loss": 1.0161, + "num_tokens": 2181773029.0, + "step": 522 + }, + { + "epoch": 0.06215092097445039, + "grad_norm": 0.8921826857889071, + "learning_rate": 1.9999951838130713e-05, + "loss": 0.9984, + "num_tokens": 2185963869.0, + "step": 523 + }, + { + "epoch": 0.062269756387403445, + "grad_norm": 1.2134168102252116, + "learning_rate": 1.999994600538421e-05, + "loss": 1.0077, + "num_tokens": 2190142709.0, + "step": 524 + }, + { + "epoch": 0.062388591800356503, + "grad_norm": 0.9108265722745886, + "learning_rate": 1.9999939839339277e-05, + "loss": 0.9883, + "num_tokens": 2194306175.0, + "step": 525 + }, + { + "epoch": 0.06250742721330957, + "grad_norm": 1.1429271005780177, + "learning_rate": 1.999993333999614e-05, + "loss": 0.9893, + "num_tokens": 2198489046.0, + "step": 526 + }, + { + "epoch": 0.06262626262626263, + "grad_norm": 0.92403907240122, + "learning_rate": 1.9999926507355042e-05, + "loss": 0.9825, + "num_tokens": 2202657775.0, + "step": 527 + }, + { + "epoch": 0.06274509803921569, + "grad_norm": 1.1855768762765453, + "learning_rate": 1.999991934141624e-05, + "loss": 0.9995, + "num_tokens": 2206813445.0, + "step": 528 + }, + { + "epoch": 0.06286393345216874, + "grad_norm": 0.8420701841345412, + "learning_rate": 1.999991184217999e-05, + "loss": 0.9908, + "num_tokens": 2211003619.0, + "step": 529 + }, + { + "epoch": 0.0629827688651218, + "grad_norm": 1.0977054833483164, + "learning_rate": 1.999990400964658e-05, + "loss": 0.9802, + "num_tokens": 2215191971.0, + "step": 530 + }, + { + "epoch": 0.06310160427807486, + "grad_norm": 0.8278475113126589, + "learning_rate": 1.9999895843816293e-05, + "loss": 0.9908, + "num_tokens": 2219370391.0, + "step": 531 + }, + { + "epoch": 0.06322043969102793, + "grad_norm": 1.0678151596511756, + "learning_rate": 1.9999887344689432e-05, + "loss": 1.0147, + "num_tokens": 2223536743.0, + "step": 532 + }, + { + "epoch": 0.06333927510398099, + "grad_norm": 0.8674099815377254, + "learning_rate": 1.9999878512266314e-05, + "loss": 0.9937, + "num_tokens": 2227699083.0, + "step": 533 + }, + { + "epoch": 0.06345811051693405, + "grad_norm": 1.0286567745816728, + "learning_rate": 1.9999869346547264e-05, + "loss": 0.9654, + "num_tokens": 2231888521.0, + "step": 534 + }, + { + "epoch": 0.06357694592988711, + "grad_norm": 1.0206622488454271, + "learning_rate": 1.9999859847532625e-05, + "loss": 0.9749, + "num_tokens": 2236058507.0, + "step": 535 + }, + { + "epoch": 0.06369578134284017, + "grad_norm": 0.9701613006756303, + "learning_rate": 1.999985001522275e-05, + "loss": 0.9974, + "num_tokens": 2240241935.0, + "step": 536 + }, + { + "epoch": 0.06381461675579322, + "grad_norm": 1.160322168068658, + "learning_rate": 1.9999839849617995e-05, + "loss": 1.0195, + "num_tokens": 2244431291.0, + "step": 537 + }, + { + "epoch": 0.06393345216874628, + "grad_norm": 0.830064370227406, + "learning_rate": 1.9999829350718742e-05, + "loss": 0.9835, + "num_tokens": 2248620850.0, + "step": 538 + }, + { + "epoch": 0.06405228758169934, + "grad_norm": 1.0141513205415962, + "learning_rate": 1.9999818518525378e-05, + "loss": 0.99, + "num_tokens": 2252806290.0, + "step": 539 + }, + { + "epoch": 0.06417112299465241, + "grad_norm": 0.8138778470096792, + "learning_rate": 1.9999807353038308e-05, + "loss": 1.0225, + "num_tokens": 2256979125.0, + "step": 540 + }, + { + "epoch": 0.06428995840760547, + "grad_norm": 1.239728607084859, + "learning_rate": 1.999979585425794e-05, + "loss": 0.9801, + "num_tokens": 2261130458.0, + "step": 541 + }, + { + "epoch": 0.06440879382055853, + "grad_norm": 0.9452824650938275, + "learning_rate": 1.99997840221847e-05, + "loss": 1.0043, + "num_tokens": 2265319826.0, + "step": 542 + }, + { + "epoch": 0.06452762923351159, + "grad_norm": 0.8584602557285886, + "learning_rate": 1.9999771856819036e-05, + "loss": 1.0157, + "num_tokens": 2269493834.0, + "step": 543 + }, + { + "epoch": 0.06464646464646465, + "grad_norm": 1.2899646185123712, + "learning_rate": 1.9999759358161386e-05, + "loss": 0.9701, + "num_tokens": 2273683809.0, + "step": 544 + }, + { + "epoch": 0.0647653000594177, + "grad_norm": 0.818229786512621, + "learning_rate": 1.999974652621222e-05, + "loss": 0.9934, + "num_tokens": 2277872184.0, + "step": 545 + }, + { + "epoch": 0.06488413547237076, + "grad_norm": 1.2357106170627234, + "learning_rate": 1.9999733360972013e-05, + "loss": 1.0229, + "num_tokens": 2282057937.0, + "step": 546 + }, + { + "epoch": 0.06500297088532382, + "grad_norm": 0.9194625350644626, + "learning_rate": 1.999971986244125e-05, + "loss": 0.9558, + "num_tokens": 2286247606.0, + "step": 547 + }, + { + "epoch": 0.06512180629827688, + "grad_norm": 0.9748769275383025, + "learning_rate": 1.9999706030620428e-05, + "loss": 0.972, + "num_tokens": 2290433767.0, + "step": 548 + }, + { + "epoch": 0.06524064171122995, + "grad_norm": 1.0329528443211695, + "learning_rate": 1.9999691865510066e-05, + "loss": 1.0059, + "num_tokens": 2294595412.0, + "step": 549 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 0.9066224834222771, + "learning_rate": 1.9999677367110684e-05, + "loss": 0.9554, + "num_tokens": 2298761156.0, + "step": 550 + }, + { + "epoch": 0.06547831253713607, + "grad_norm": 1.161529603260094, + "learning_rate": 1.9999662535422818e-05, + "loss": 0.9659, + "num_tokens": 2302951043.0, + "step": 551 + }, + { + "epoch": 0.06559714795008913, + "grad_norm": 1.0495176217896605, + "learning_rate": 1.9999647370447026e-05, + "loss": 0.9548, + "num_tokens": 2307138341.0, + "step": 552 + }, + { + "epoch": 0.06571598336304219, + "grad_norm": 0.7948034787466988, + "learning_rate": 1.999963187218386e-05, + "loss": 0.9852, + "num_tokens": 2311327014.0, + "step": 553 + }, + { + "epoch": 0.06583481877599524, + "grad_norm": 0.9658682303486338, + "learning_rate": 1.9999616040633896e-05, + "loss": 1.0353, + "num_tokens": 2315517435.0, + "step": 554 + }, + { + "epoch": 0.0659536541889483, + "grad_norm": 0.9933980688685567, + "learning_rate": 1.9999599875797723e-05, + "loss": 1.0149, + "num_tokens": 2319697107.0, + "step": 555 + }, + { + "epoch": 0.06607248960190136, + "grad_norm": 1.1607260690936247, + "learning_rate": 1.999958337767594e-05, + "loss": 0.989, + "num_tokens": 2323879634.0, + "step": 556 + }, + { + "epoch": 0.06619132501485443, + "grad_norm": 0.9307826886364474, + "learning_rate": 1.9999566546269154e-05, + "loss": 1.0251, + "num_tokens": 2328035810.0, + "step": 557 + }, + { + "epoch": 0.06631016042780749, + "grad_norm": 1.2254952878656296, + "learning_rate": 1.9999549381577993e-05, + "loss": 0.9933, + "num_tokens": 2332224770.0, + "step": 558 + }, + { + "epoch": 0.06642899584076055, + "grad_norm": 0.8257656303172576, + "learning_rate": 1.999953188360309e-05, + "loss": 0.9926, + "num_tokens": 2336414778.0, + "step": 559 + }, + { + "epoch": 0.06654783125371361, + "grad_norm": 1.3059477976980414, + "learning_rate": 1.999951405234509e-05, + "loss": 0.9725, + "num_tokens": 2340604011.0, + "step": 560 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.7143600779523541, + "learning_rate": 1.9999495887804658e-05, + "loss": 0.9953, + "num_tokens": 2344794726.0, + "step": 561 + }, + { + "epoch": 0.06678550207961972, + "grad_norm": 1.2426928331191915, + "learning_rate": 1.9999477389982464e-05, + "loss": 0.9683, + "num_tokens": 2348984362.0, + "step": 562 + }, + { + "epoch": 0.06690433749257278, + "grad_norm": 0.923334082123573, + "learning_rate": 1.9999458558879195e-05, + "loss": 1.0264, + "num_tokens": 2353174062.0, + "step": 563 + }, + { + "epoch": 0.06702317290552584, + "grad_norm": 1.1899708882668296, + "learning_rate": 1.999943939449555e-05, + "loss": 0.9928, + "num_tokens": 2357363089.0, + "step": 564 + }, + { + "epoch": 0.06714200831847891, + "grad_norm": 0.9671524077138673, + "learning_rate": 1.9999419896832236e-05, + "loss": 1.0346, + "num_tokens": 2361552591.0, + "step": 565 + }, + { + "epoch": 0.06726084373143197, + "grad_norm": 0.9854883877340567, + "learning_rate": 1.999940006588997e-05, + "loss": 0.9828, + "num_tokens": 2365741387.0, + "step": 566 + }, + { + "epoch": 0.06737967914438503, + "grad_norm": 0.9634590765306578, + "learning_rate": 1.99993799016695e-05, + "loss": 0.9819, + "num_tokens": 2369902541.0, + "step": 567 + }, + { + "epoch": 0.06749851455733809, + "grad_norm": 1.082624623679767, + "learning_rate": 1.9999359404171557e-05, + "loss": 1.0239, + "num_tokens": 2374091219.0, + "step": 568 + }, + { + "epoch": 0.06761734997029115, + "grad_norm": 0.9478617567569775, + "learning_rate": 1.9999338573396915e-05, + "loss": 1.0122, + "num_tokens": 2378280178.0, + "step": 569 + }, + { + "epoch": 0.0677361853832442, + "grad_norm": 1.0087992702302322, + "learning_rate": 1.9999317409346334e-05, + "loss": 0.9603, + "num_tokens": 2382459692.0, + "step": 570 + }, + { + "epoch": 0.06785502079619726, + "grad_norm": 0.8017002099359061, + "learning_rate": 1.99992959120206e-05, + "loss": 0.9755, + "num_tokens": 2386647740.0, + "step": 571 + }, + { + "epoch": 0.06797385620915032, + "grad_norm": 1.0554851260559064, + "learning_rate": 1.9999274081420516e-05, + "loss": 0.9913, + "num_tokens": 2390836822.0, + "step": 572 + }, + { + "epoch": 0.0680926916221034, + "grad_norm": 1.167690166248365, + "learning_rate": 1.999925191754688e-05, + "loss": 0.9967, + "num_tokens": 2395026022.0, + "step": 573 + }, + { + "epoch": 0.06821152703505645, + "grad_norm": 0.8053515100128363, + "learning_rate": 1.9999229420400523e-05, + "loss": 1.013, + "num_tokens": 2399215070.0, + "step": 574 + }, + { + "epoch": 0.06833036244800951, + "grad_norm": 1.0445159944637117, + "learning_rate": 1.9999206589982273e-05, + "loss": 0.9855, + "num_tokens": 2403404082.0, + "step": 575 + }, + { + "epoch": 0.06844919786096257, + "grad_norm": 0.8304711922045406, + "learning_rate": 1.9999183426292973e-05, + "loss": 0.9548, + "num_tokens": 2407594452.0, + "step": 576 + }, + { + "epoch": 0.06856803327391563, + "grad_norm": 1.0485420067351643, + "learning_rate": 1.999915992933349e-05, + "loss": 0.9983, + "num_tokens": 2411783547.0, + "step": 577 + }, + { + "epoch": 0.06868686868686869, + "grad_norm": 0.7534124345263852, + "learning_rate": 1.9999136099104683e-05, + "loss": 0.9656, + "num_tokens": 2415974765.0, + "step": 578 + }, + { + "epoch": 0.06880570409982174, + "grad_norm": 1.0874128769132052, + "learning_rate": 1.999911193560744e-05, + "loss": 1.0091, + "num_tokens": 2420165727.0, + "step": 579 + }, + { + "epoch": 0.0689245395127748, + "grad_norm": 0.9439421620387204, + "learning_rate": 1.999908743884265e-05, + "loss": 1.0079, + "num_tokens": 2424353649.0, + "step": 580 + }, + { + "epoch": 0.06904337492572786, + "grad_norm": 0.7479507605982884, + "learning_rate": 1.9999062608811232e-05, + "loss": 0.991, + "num_tokens": 2428543542.0, + "step": 581 + }, + { + "epoch": 0.06916221033868093, + "grad_norm": 1.4973453455390504, + "learning_rate": 1.99990374455141e-05, + "loss": 0.9968, + "num_tokens": 2432733828.0, + "step": 582 + }, + { + "epoch": 0.06928104575163399, + "grad_norm": 0.7422771591558817, + "learning_rate": 1.999901194895218e-05, + "loss": 0.9688, + "num_tokens": 2436922889.0, + "step": 583 + }, + { + "epoch": 0.06939988116458705, + "grad_norm": 1.0969438336772774, + "learning_rate": 1.9998986119126423e-05, + "loss": 0.962, + "num_tokens": 2441112031.0, + "step": 584 + }, + { + "epoch": 0.06951871657754011, + "grad_norm": 0.9043132075964827, + "learning_rate": 1.9998959956037787e-05, + "loss": 1.0104, + "num_tokens": 2445302845.0, + "step": 585 + }, + { + "epoch": 0.06963755199049317, + "grad_norm": 0.8451505181589982, + "learning_rate": 1.9998933459687236e-05, + "loss": 0.9645, + "num_tokens": 2449482232.0, + "step": 586 + }, + { + "epoch": 0.06975638740344622, + "grad_norm": 1.0817198630807554, + "learning_rate": 1.9998906630075753e-05, + "loss": 0.988, + "num_tokens": 2453642801.0, + "step": 587 + }, + { + "epoch": 0.06987522281639928, + "grad_norm": 0.7835308626160816, + "learning_rate": 1.9998879467204327e-05, + "loss": 1.003, + "num_tokens": 2457784963.0, + "step": 588 + }, + { + "epoch": 0.06999405822935234, + "grad_norm": 1.0258307447175854, + "learning_rate": 1.9998851971073975e-05, + "loss": 1.0043, + "num_tokens": 2461949816.0, + "step": 589 + }, + { + "epoch": 0.07011289364230541, + "grad_norm": 1.0595768606360716, + "learning_rate": 1.9998824141685703e-05, + "loss": 0.9981, + "num_tokens": 2466137462.0, + "step": 590 + }, + { + "epoch": 0.07023172905525847, + "grad_norm": 0.8407672731294995, + "learning_rate": 1.999879597904055e-05, + "loss": 0.9791, + "num_tokens": 2470325441.0, + "step": 591 + }, + { + "epoch": 0.07035056446821153, + "grad_norm": 1.1973601476864653, + "learning_rate": 1.999876748313956e-05, + "loss": 0.9932, + "num_tokens": 2474497381.0, + "step": 592 + }, + { + "epoch": 0.07046939988116459, + "grad_norm": 0.8871409741173518, + "learning_rate": 1.9998738653983777e-05, + "loss": 1.0232, + "num_tokens": 2478686186.0, + "step": 593 + }, + { + "epoch": 0.07058823529411765, + "grad_norm": 0.9880325044433343, + "learning_rate": 1.999870949157428e-05, + "loss": 0.969, + "num_tokens": 2482843318.0, + "step": 594 + }, + { + "epoch": 0.0707070707070707, + "grad_norm": 0.8258164275038922, + "learning_rate": 1.9998679995912145e-05, + "loss": 0.9653, + "num_tokens": 2487016896.0, + "step": 595 + }, + { + "epoch": 0.07082590612002376, + "grad_norm": 1.0673588693681197, + "learning_rate": 1.9998650166998463e-05, + "loss": 1.0311, + "num_tokens": 2491204391.0, + "step": 596 + }, + { + "epoch": 0.07094474153297682, + "grad_norm": 0.7778938099136748, + "learning_rate": 1.9998620004834342e-05, + "loss": 0.9798, + "num_tokens": 2495390660.0, + "step": 597 + }, + { + "epoch": 0.0710635769459299, + "grad_norm": 1.0985292861656715, + "learning_rate": 1.9998589509420897e-05, + "loss": 0.9825, + "num_tokens": 2499578079.0, + "step": 598 + }, + { + "epoch": 0.07118241235888295, + "grad_norm": 0.8362242696517288, + "learning_rate": 1.9998558680759255e-05, + "loss": 0.992, + "num_tokens": 2503736939.0, + "step": 599 + }, + { + "epoch": 0.07130124777183601, + "grad_norm": 0.8497613154152251, + "learning_rate": 1.9998527518850566e-05, + "loss": 0.9932, + "num_tokens": 2507902463.0, + "step": 600 + }, + { + "epoch": 0.07142008318478907, + "grad_norm": 0.9127102408488601, + "learning_rate": 1.9998496023695974e-05, + "loss": 1.0052, + "num_tokens": 2512049553.0, + "step": 601 + }, + { + "epoch": 0.07153891859774213, + "grad_norm": 1.1275112181948304, + "learning_rate": 1.999846419529665e-05, + "loss": 0.9955, + "num_tokens": 2516240295.0, + "step": 602 + }, + { + "epoch": 0.07165775401069518, + "grad_norm": 0.7828149242122373, + "learning_rate": 1.9998432033653773e-05, + "loss": 0.9955, + "num_tokens": 2520429784.0, + "step": 603 + }, + { + "epoch": 0.07177658942364824, + "grad_norm": 1.0307237977534678, + "learning_rate": 1.9998399538768532e-05, + "loss": 0.9828, + "num_tokens": 2524595997.0, + "step": 604 + }, + { + "epoch": 0.0718954248366013, + "grad_norm": 0.8700647488327659, + "learning_rate": 1.9998366710642132e-05, + "loss": 0.9947, + "num_tokens": 2528767386.0, + "step": 605 + }, + { + "epoch": 0.07201426024955437, + "grad_norm": 1.0370631492905944, + "learning_rate": 1.9998333549275792e-05, + "loss": 0.9638, + "num_tokens": 2532957337.0, + "step": 606 + }, + { + "epoch": 0.07213309566250743, + "grad_norm": 0.9768537542567992, + "learning_rate": 1.9998300054670733e-05, + "loss": 0.9933, + "num_tokens": 2537128492.0, + "step": 607 + }, + { + "epoch": 0.07225193107546049, + "grad_norm": 1.1000169724228614, + "learning_rate": 1.99982662268282e-05, + "loss": 1.0038, + "num_tokens": 2541317879.0, + "step": 608 + }, + { + "epoch": 0.07237076648841355, + "grad_norm": 0.8013952603675053, + "learning_rate": 1.9998232065749447e-05, + "loss": 1.0162, + "num_tokens": 2545486574.0, + "step": 609 + }, + { + "epoch": 0.07248960190136661, + "grad_norm": 1.01522994902192, + "learning_rate": 1.999819757143573e-05, + "loss": 1.0323, + "num_tokens": 2549675479.0, + "step": 610 + }, + { + "epoch": 0.07260843731431967, + "grad_norm": 1.0807711352126672, + "learning_rate": 1.999816274388834e-05, + "loss": 0.9726, + "num_tokens": 2553805472.0, + "step": 611 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 0.7430471956943735, + "learning_rate": 1.9998127583108558e-05, + "loss": 1.0023, + "num_tokens": 2557951259.0, + "step": 612 + }, + { + "epoch": 0.07284610814022578, + "grad_norm": 1.1560437229490803, + "learning_rate": 1.9998092089097687e-05, + "loss": 0.9827, + "num_tokens": 2562141405.0, + "step": 613 + }, + { + "epoch": 0.07296494355317884, + "grad_norm": 0.7784931184872178, + "learning_rate": 1.9998056261857045e-05, + "loss": 0.9989, + "num_tokens": 2566330624.0, + "step": 614 + }, + { + "epoch": 0.07308377896613191, + "grad_norm": 1.0898243391468947, + "learning_rate": 1.9998020101387955e-05, + "loss": 0.9403, + "num_tokens": 2570498805.0, + "step": 615 + }, + { + "epoch": 0.07320261437908497, + "grad_norm": 0.866809334085863, + "learning_rate": 1.9997983607691758e-05, + "loss": 0.9855, + "num_tokens": 2574689439.0, + "step": 616 + }, + { + "epoch": 0.07332144979203803, + "grad_norm": 1.094003394696701, + "learning_rate": 1.9997946780769807e-05, + "loss": 1.0027, + "num_tokens": 2578879259.0, + "step": 617 + }, + { + "epoch": 0.07344028520499109, + "grad_norm": 0.924166629550328, + "learning_rate": 1.9997909620623458e-05, + "loss": 0.9757, + "num_tokens": 2583069599.0, + "step": 618 + }, + { + "epoch": 0.07355912061794415, + "grad_norm": 0.7517850980553057, + "learning_rate": 1.99978721272541e-05, + "loss": 0.9682, + "num_tokens": 2587225921.0, + "step": 619 + }, + { + "epoch": 0.0736779560308972, + "grad_norm": 1.2671725150020128, + "learning_rate": 1.999783430066311e-05, + "loss": 0.9957, + "num_tokens": 2591391633.0, + "step": 620 + }, + { + "epoch": 0.07379679144385026, + "grad_norm": 0.7765351823586298, + "learning_rate": 1.9997796140851895e-05, + "loss": 0.9869, + "num_tokens": 2595580785.0, + "step": 621 + }, + { + "epoch": 0.07391562685680332, + "grad_norm": 0.9862687689669966, + "learning_rate": 1.9997757647821866e-05, + "loss": 0.9962, + "num_tokens": 2599751464.0, + "step": 622 + }, + { + "epoch": 0.0740344622697564, + "grad_norm": 0.7856246938381422, + "learning_rate": 1.999771882157445e-05, + "loss": 0.9602, + "num_tokens": 2603941247.0, + "step": 623 + }, + { + "epoch": 0.07415329768270945, + "grad_norm": 1.1307116487228903, + "learning_rate": 1.9997679662111085e-05, + "loss": 1.0066, + "num_tokens": 2608130223.0, + "step": 624 + }, + { + "epoch": 0.07427213309566251, + "grad_norm": 0.9409513192478081, + "learning_rate": 1.9997640169433216e-05, + "loss": 0.9858, + "num_tokens": 2612314705.0, + "step": 625 + }, + { + "epoch": 0.07439096850861557, + "grad_norm": 0.9380137133276806, + "learning_rate": 1.9997600343542316e-05, + "loss": 0.9844, + "num_tokens": 2616490132.0, + "step": 626 + }, + { + "epoch": 0.07450980392156863, + "grad_norm": 0.8291448927348025, + "learning_rate": 1.999756018443985e-05, + "loss": 0.9715, + "num_tokens": 2620660465.0, + "step": 627 + }, + { + "epoch": 0.07462863933452168, + "grad_norm": 0.9017267490118319, + "learning_rate": 1.999751969212731e-05, + "loss": 0.9675, + "num_tokens": 2624850194.0, + "step": 628 + }, + { + "epoch": 0.07474747474747474, + "grad_norm": 0.9750728824643783, + "learning_rate": 1.9997478866606194e-05, + "loss": 0.9764, + "num_tokens": 2629040286.0, + "step": 629 + }, + { + "epoch": 0.0748663101604278, + "grad_norm": 1.4320428452877931, + "learning_rate": 1.9997437707878015e-05, + "loss": 1.0148, + "num_tokens": 2633231007.0, + "step": 630 + }, + { + "epoch": 0.07498514557338087, + "grad_norm": 0.707752773375284, + "learning_rate": 1.9997396215944296e-05, + "loss": 0.9901, + "num_tokens": 2637420632.0, + "step": 631 + }, + { + "epoch": 0.07510398098633393, + "grad_norm": 1.708125285611259, + "learning_rate": 1.9997354390806572e-05, + "loss": 0.9882, + "num_tokens": 2641553669.0, + "step": 632 + }, + { + "epoch": 0.07522281639928699, + "grad_norm": 1.021050024024448, + "learning_rate": 1.99973122324664e-05, + "loss": 1.0029, + "num_tokens": 2645743282.0, + "step": 633 + }, + { + "epoch": 0.07534165181224005, + "grad_norm": 1.3722067699542175, + "learning_rate": 1.9997269740925327e-05, + "loss": 1.0062, + "num_tokens": 2649933029.0, + "step": 634 + }, + { + "epoch": 0.0754604872251931, + "grad_norm": 1.045417307827853, + "learning_rate": 1.999722691618494e-05, + "loss": 0.9963, + "num_tokens": 2654123189.0, + "step": 635 + }, + { + "epoch": 0.07557932263814617, + "grad_norm": 1.202285333023687, + "learning_rate": 1.999718375824682e-05, + "loss": 0.9843, + "num_tokens": 2658292348.0, + "step": 636 + }, + { + "epoch": 0.07569815805109922, + "grad_norm": 1.098131827013832, + "learning_rate": 1.9997140267112566e-05, + "loss": 1.0037, + "num_tokens": 2662452730.0, + "step": 637 + }, + { + "epoch": 0.07581699346405228, + "grad_norm": 0.832658074166596, + "learning_rate": 1.9997096442783787e-05, + "loss": 1.0178, + "num_tokens": 2666642774.0, + "step": 638 + }, + { + "epoch": 0.07593582887700535, + "grad_norm": 1.26549133041555, + "learning_rate": 1.9997052285262107e-05, + "loss": 0.9691, + "num_tokens": 2670803855.0, + "step": 639 + }, + { + "epoch": 0.07605466428995841, + "grad_norm": 0.8862833238904796, + "learning_rate": 1.999700779454916e-05, + "loss": 0.9792, + "num_tokens": 2674977497.0, + "step": 640 + }, + { + "epoch": 0.07617349970291147, + "grad_norm": 1.2037660952438718, + "learning_rate": 1.9996962970646596e-05, + "loss": 1.0077, + "num_tokens": 2679165561.0, + "step": 641 + }, + { + "epoch": 0.07629233511586453, + "grad_norm": 1.0889430234501234, + "learning_rate": 1.9996917813556072e-05, + "loss": 1.0028, + "num_tokens": 2683355261.0, + "step": 642 + }, + { + "epoch": 0.07641117052881759, + "grad_norm": 1.08992552214955, + "learning_rate": 1.9996872323279264e-05, + "loss": 1.031, + "num_tokens": 2687542859.0, + "step": 643 + }, + { + "epoch": 0.07653000594177065, + "grad_norm": 0.9311640507397052, + "learning_rate": 1.999682649981785e-05, + "loss": 0.9872, + "num_tokens": 2691732412.0, + "step": 644 + }, + { + "epoch": 0.0766488413547237, + "grad_norm": 1.133349093931421, + "learning_rate": 1.999678034317354e-05, + "loss": 0.9893, + "num_tokens": 2695920561.0, + "step": 645 + }, + { + "epoch": 0.07676767676767676, + "grad_norm": 0.9657683769983242, + "learning_rate": 1.999673385334803e-05, + "loss": 0.9793, + "num_tokens": 2700108831.0, + "step": 646 + }, + { + "epoch": 0.07688651218062983, + "grad_norm": 1.175120617783434, + "learning_rate": 1.9996687030343047e-05, + "loss": 1.0234, + "num_tokens": 2704299032.0, + "step": 647 + }, + { + "epoch": 0.07700534759358289, + "grad_norm": 0.8361014507627768, + "learning_rate": 1.9996639874160324e-05, + "loss": 0.9932, + "num_tokens": 2708469419.0, + "step": 648 + }, + { + "epoch": 0.07712418300653595, + "grad_norm": 1.0984008141306603, + "learning_rate": 1.9996592384801612e-05, + "loss": 0.9765, + "num_tokens": 2712644948.0, + "step": 649 + }, + { + "epoch": 0.07724301841948901, + "grad_norm": 0.9024411232103469, + "learning_rate": 1.9996544562268664e-05, + "loss": 1.0031, + "num_tokens": 2716834028.0, + "step": 650 + }, + { + "epoch": 0.07736185383244207, + "grad_norm": 0.9046444345235584, + "learning_rate": 1.999649640656325e-05, + "loss": 0.9688, + "num_tokens": 2721024108.0, + "step": 651 + }, + { + "epoch": 0.07748068924539513, + "grad_norm": 0.8253903554616617, + "learning_rate": 1.999644791768716e-05, + "loss": 0.9697, + "num_tokens": 2725198753.0, + "step": 652 + }, + { + "epoch": 0.07759952465834818, + "grad_norm": 0.7251072097136918, + "learning_rate": 1.999639909564218e-05, + "loss": 0.996, + "num_tokens": 2729386942.0, + "step": 653 + }, + { + "epoch": 0.07771836007130124, + "grad_norm": 1.1219231837667651, + "learning_rate": 1.9996349940430128e-05, + "loss": 0.9888, + "num_tokens": 2733561364.0, + "step": 654 + }, + { + "epoch": 0.0778371954842543, + "grad_norm": 0.7892535060470279, + "learning_rate": 1.999630045205282e-05, + "loss": 0.9706, + "num_tokens": 2737750382.0, + "step": 655 + }, + { + "epoch": 0.07795603089720737, + "grad_norm": 1.0044971635056434, + "learning_rate": 1.9996250630512092e-05, + "loss": 0.9792, + "num_tokens": 2741938431.0, + "step": 656 + }, + { + "epoch": 0.07807486631016043, + "grad_norm": 0.8209345042965077, + "learning_rate": 1.999620047580978e-05, + "loss": 0.9947, + "num_tokens": 2746128330.0, + "step": 657 + }, + { + "epoch": 0.07819370172311349, + "grad_norm": 0.9956231115525314, + "learning_rate": 1.999614998794775e-05, + "loss": 0.9689, + "num_tokens": 2750318689.0, + "step": 658 + }, + { + "epoch": 0.07831253713606655, + "grad_norm": 0.8049786134671247, + "learning_rate": 1.9996099166927873e-05, + "loss": 0.9893, + "num_tokens": 2754475687.0, + "step": 659 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 1.0698262083004588, + "learning_rate": 1.9996048012752024e-05, + "loss": 1.0172, + "num_tokens": 2758635247.0, + "step": 660 + }, + { + "epoch": 0.07855020796197267, + "grad_norm": 0.9142724427498621, + "learning_rate": 1.99959965254221e-05, + "loss": 0.9694, + "num_tokens": 2762814918.0, + "step": 661 + }, + { + "epoch": 0.07866904337492572, + "grad_norm": 0.9364221850848549, + "learning_rate": 1.9995944704940012e-05, + "loss": 0.9917, + "num_tokens": 2767005111.0, + "step": 662 + }, + { + "epoch": 0.07878787878787878, + "grad_norm": 0.9495608576443219, + "learning_rate": 1.999589255130767e-05, + "loss": 0.9611, + "num_tokens": 2771195102.0, + "step": 663 + }, + { + "epoch": 0.07890671420083185, + "grad_norm": 0.9045615408239004, + "learning_rate": 1.999584006452702e-05, + "loss": 1.0066, + "num_tokens": 2775383331.0, + "step": 664 + }, + { + "epoch": 0.07902554961378491, + "grad_norm": 0.845288142731402, + "learning_rate": 1.9995787244599988e-05, + "loss": 0.9846, + "num_tokens": 2779573386.0, + "step": 665 + }, + { + "epoch": 0.07914438502673797, + "grad_norm": 0.9403803237311391, + "learning_rate": 1.9995734091528543e-05, + "loss": 0.976, + "num_tokens": 2783737425.0, + "step": 666 + }, + { + "epoch": 0.07926322043969103, + "grad_norm": 0.8275644280447944, + "learning_rate": 1.9995680605314648e-05, + "loss": 1.034, + "num_tokens": 2787926313.0, + "step": 667 + }, + { + "epoch": 0.07938205585264409, + "grad_norm": 1.045528177215657, + "learning_rate": 1.9995626785960286e-05, + "loss": 1.0136, + "num_tokens": 2792114753.0, + "step": 668 + }, + { + "epoch": 0.07950089126559715, + "grad_norm": 0.8313618407459651, + "learning_rate": 1.9995572633467446e-05, + "loss": 0.9806, + "num_tokens": 2796303252.0, + "step": 669 + }, + { + "epoch": 0.0796197266785502, + "grad_norm": 0.7271179427479677, + "learning_rate": 1.999551814783814e-05, + "loss": 0.989, + "num_tokens": 2800466235.0, + "step": 670 + }, + { + "epoch": 0.07973856209150326, + "grad_norm": 0.8471829261369231, + "learning_rate": 1.999546332907438e-05, + "loss": 1.0584, + "num_tokens": 2804614191.0, + "step": 671 + }, + { + "epoch": 0.07985739750445633, + "grad_norm": 0.9554642608067724, + "learning_rate": 1.99954081771782e-05, + "loss": 0.9654, + "num_tokens": 2808791482.0, + "step": 672 + }, + { + "epoch": 0.07997623291740939, + "grad_norm": 0.8530266948017565, + "learning_rate": 1.9995352692151642e-05, + "loss": 1.0028, + "num_tokens": 2812981296.0, + "step": 673 + }, + { + "epoch": 0.08009506833036245, + "grad_norm": 0.9826243471647051, + "learning_rate": 1.9995296873996758e-05, + "loss": 0.9752, + "num_tokens": 2817170855.0, + "step": 674 + }, + { + "epoch": 0.08021390374331551, + "grad_norm": 0.8307230447735579, + "learning_rate": 1.999524072271562e-05, + "loss": 0.9561, + "num_tokens": 2821359818.0, + "step": 675 + }, + { + "epoch": 0.08033273915626857, + "grad_norm": 0.849921075200608, + "learning_rate": 1.99951842383103e-05, + "loss": 0.9463, + "num_tokens": 2825549483.0, + "step": 676 + }, + { + "epoch": 0.08045157456922163, + "grad_norm": 0.8567514878197661, + "learning_rate": 1.9995127420782897e-05, + "loss": 0.9862, + "num_tokens": 2829739193.0, + "step": 677 + }, + { + "epoch": 0.08057040998217468, + "grad_norm": 0.9305623165048367, + "learning_rate": 1.999507027013551e-05, + "loss": 1.0047, + "num_tokens": 2833928311.0, + "step": 678 + }, + { + "epoch": 0.08068924539512774, + "grad_norm": 0.8846198634348584, + "learning_rate": 1.999501278637026e-05, + "loss": 0.9891, + "num_tokens": 2838091392.0, + "step": 679 + }, + { + "epoch": 0.08080808080808081, + "grad_norm": 0.7994130038619215, + "learning_rate": 1.9994954969489275e-05, + "loss": 0.9719, + "num_tokens": 2842280135.0, + "step": 680 + }, + { + "epoch": 0.08092691622103387, + "grad_norm": 1.0532304676951965, + "learning_rate": 1.9994896819494693e-05, + "loss": 0.9871, + "num_tokens": 2846469958.0, + "step": 681 + }, + { + "epoch": 0.08104575163398693, + "grad_norm": 0.8885599837971827, + "learning_rate": 1.999483833638867e-05, + "loss": 0.9759, + "num_tokens": 2850653752.0, + "step": 682 + }, + { + "epoch": 0.08116458704693999, + "grad_norm": 1.2003667108872191, + "learning_rate": 1.9994779520173368e-05, + "loss": 0.9774, + "num_tokens": 2854843104.0, + "step": 683 + }, + { + "epoch": 0.08128342245989305, + "grad_norm": 0.6967095245525575, + "learning_rate": 1.9994720370850972e-05, + "loss": 0.9795, + "num_tokens": 2859018194.0, + "step": 684 + }, + { + "epoch": 0.0814022578728461, + "grad_norm": 1.1109487723512441, + "learning_rate": 1.999466088842367e-05, + "loss": 0.9795, + "num_tokens": 2863178118.0, + "step": 685 + }, + { + "epoch": 0.08152109328579916, + "grad_norm": 0.8176414542454502, + "learning_rate": 1.999460107289366e-05, + "loss": 0.9975, + "num_tokens": 2867348743.0, + "step": 686 + }, + { + "epoch": 0.08163992869875222, + "grad_norm": 1.2802778400449297, + "learning_rate": 1.999454092426316e-05, + "loss": 0.9788, + "num_tokens": 2871506379.0, + "step": 687 + }, + { + "epoch": 0.08175876411170528, + "grad_norm": 0.726127729309202, + "learning_rate": 1.9994480442534405e-05, + "loss": 1.0008, + "num_tokens": 2875696594.0, + "step": 688 + }, + { + "epoch": 0.08187759952465835, + "grad_norm": 1.1742355662797648, + "learning_rate": 1.9994419627709617e-05, + "loss": 0.963, + "num_tokens": 2879885541.0, + "step": 689 + }, + { + "epoch": 0.08199643493761141, + "grad_norm": 0.9487545337021355, + "learning_rate": 1.999435847979107e-05, + "loss": 0.9601, + "num_tokens": 2884075155.0, + "step": 690 + }, + { + "epoch": 0.08211527035056447, + "grad_norm": 1.0195602790106255, + "learning_rate": 1.9994296998781012e-05, + "loss": 0.8977, + "num_tokens": 2888234167.0, + "step": 691 + }, + { + "epoch": 0.08223410576351753, + "grad_norm": 0.8723202821086766, + "learning_rate": 1.9994235184681726e-05, + "loss": 0.9899, + "num_tokens": 2892423813.0, + "step": 692 + }, + { + "epoch": 0.08235294117647059, + "grad_norm": 0.9153373985735197, + "learning_rate": 1.9994173037495504e-05, + "loss": 0.9831, + "num_tokens": 2896557761.0, + "step": 693 + }, + { + "epoch": 0.08247177658942365, + "grad_norm": 0.8197266809187718, + "learning_rate": 1.9994110557224642e-05, + "loss": 0.9893, + "num_tokens": 2900719321.0, + "step": 694 + }, + { + "epoch": 0.0825906120023767, + "grad_norm": 1.0305155413238876, + "learning_rate": 1.999404774387146e-05, + "loss": 0.9401, + "num_tokens": 2904895722.0, + "step": 695 + }, + { + "epoch": 0.08270944741532976, + "grad_norm": 0.9220330996709163, + "learning_rate": 1.9993984597438275e-05, + "loss": 0.9941, + "num_tokens": 2909085545.0, + "step": 696 + }, + { + "epoch": 0.08282828282828283, + "grad_norm": 0.8162018524763643, + "learning_rate": 1.9993921117927433e-05, + "loss": 0.9631, + "num_tokens": 2913276438.0, + "step": 697 + }, + { + "epoch": 0.08294711824123589, + "grad_norm": 0.8197353841614413, + "learning_rate": 1.9993857305341284e-05, + "loss": 0.983, + "num_tokens": 2917437072.0, + "step": 698 + }, + { + "epoch": 0.08306595365418895, + "grad_norm": 0.8878022091053079, + "learning_rate": 1.999379315968219e-05, + "loss": 1.0178, + "num_tokens": 2921625368.0, + "step": 699 + }, + { + "epoch": 0.08318478906714201, + "grad_norm": 0.9921384212857545, + "learning_rate": 1.9993728680952522e-05, + "loss": 1.0237, + "num_tokens": 2925814302.0, + "step": 700 + }, + { + "epoch": 0.08330362448009507, + "grad_norm": 0.8825805658382617, + "learning_rate": 1.9993663869154676e-05, + "loss": 0.9949, + "num_tokens": 2929988708.0, + "step": 701 + }, + { + "epoch": 0.08342245989304813, + "grad_norm": 0.8694199821688399, + "learning_rate": 1.999359872429105e-05, + "loss": 0.9749, + "num_tokens": 2934177248.0, + "step": 702 + }, + { + "epoch": 0.08354129530600118, + "grad_norm": 0.9226783187936644, + "learning_rate": 1.9993533246364054e-05, + "loss": 0.9523, + "num_tokens": 2938366043.0, + "step": 703 + }, + { + "epoch": 0.08366013071895424, + "grad_norm": 0.9421029184522437, + "learning_rate": 1.999346743537611e-05, + "loss": 1.0087, + "num_tokens": 2942554596.0, + "step": 704 + }, + { + "epoch": 0.08377896613190731, + "grad_norm": 1.0832008464187202, + "learning_rate": 1.9993401291329667e-05, + "loss": 0.9314, + "num_tokens": 2946733756.0, + "step": 705 + }, + { + "epoch": 0.08389780154486037, + "grad_norm": 0.6260694478332809, + "learning_rate": 1.9993334814227165e-05, + "loss": 0.9694, + "num_tokens": 2950918650.0, + "step": 706 + }, + { + "epoch": 0.08401663695781343, + "grad_norm": 1.1044589182421614, + "learning_rate": 1.9993268004071064e-05, + "loss": 0.9919, + "num_tokens": 2955108325.0, + "step": 707 + }, + { + "epoch": 0.08413547237076649, + "grad_norm": 0.8504896488377706, + "learning_rate": 1.9993200860863844e-05, + "loss": 1.002, + "num_tokens": 2959295060.0, + "step": 708 + }, + { + "epoch": 0.08425430778371955, + "grad_norm": 0.8766143152351106, + "learning_rate": 1.999313338460799e-05, + "loss": 0.9675, + "num_tokens": 2963482539.0, + "step": 709 + }, + { + "epoch": 0.0843731431966726, + "grad_norm": 0.803288728323924, + "learning_rate": 1.9993065575305995e-05, + "loss": 0.9784, + "num_tokens": 2967671324.0, + "step": 710 + }, + { + "epoch": 0.08449197860962566, + "grad_norm": 0.8663436073744805, + "learning_rate": 1.9992997432960384e-05, + "loss": 1.0184, + "num_tokens": 2971858443.0, + "step": 711 + }, + { + "epoch": 0.08461081402257872, + "grad_norm": 0.9886053590041541, + "learning_rate": 1.9992928957573664e-05, + "loss": 0.9701, + "num_tokens": 2976048105.0, + "step": 712 + }, + { + "epoch": 0.0847296494355318, + "grad_norm": 1.0781378474516157, + "learning_rate": 1.9992860149148384e-05, + "loss": 0.9909, + "num_tokens": 2980237206.0, + "step": 713 + }, + { + "epoch": 0.08484848484848485, + "grad_norm": 0.7820736417442933, + "learning_rate": 1.9992791007687086e-05, + "loss": 1.0575, + "num_tokens": 2984426195.0, + "step": 714 + }, + { + "epoch": 0.08496732026143791, + "grad_norm": 0.9647265588754995, + "learning_rate": 1.9992721533192332e-05, + "loss": 1.0053, + "num_tokens": 2988613791.0, + "step": 715 + }, + { + "epoch": 0.08508615567439097, + "grad_norm": 0.847353659260934, + "learning_rate": 1.999265172566669e-05, + "loss": 0.9555, + "num_tokens": 2992772808.0, + "step": 716 + }, + { + "epoch": 0.08520499108734403, + "grad_norm": 0.7589777313102866, + "learning_rate": 1.9992581585112756e-05, + "loss": 1.015, + "num_tokens": 2996943477.0, + "step": 717 + }, + { + "epoch": 0.08532382650029709, + "grad_norm": 1.0673708220889044, + "learning_rate": 1.9992511111533116e-05, + "loss": 0.9967, + "num_tokens": 3001132410.0, + "step": 718 + }, + { + "epoch": 0.08544266191325015, + "grad_norm": 0.6426574752254651, + "learning_rate": 1.9992440304930392e-05, + "loss": 0.9992, + "num_tokens": 3005268705.0, + "step": 719 + }, + { + "epoch": 0.0855614973262032, + "grad_norm": 1.1207377726604384, + "learning_rate": 1.9992369165307194e-05, + "loss": 1.0117, + "num_tokens": 3009440456.0, + "step": 720 + }, + { + "epoch": 0.08568033273915626, + "grad_norm": 0.7323136153923633, + "learning_rate": 1.9992297692666163e-05, + "loss": 0.9434, + "num_tokens": 3013603656.0, + "step": 721 + }, + { + "epoch": 0.08579916815210933, + "grad_norm": 1.0563243782947707, + "learning_rate": 1.9992225887009948e-05, + "loss": 0.9859, + "num_tokens": 3017793628.0, + "step": 722 + }, + { + "epoch": 0.08591800356506239, + "grad_norm": 0.904107919846161, + "learning_rate": 1.9992153748341198e-05, + "loss": 1.0123, + "num_tokens": 3021957797.0, + "step": 723 + }, + { + "epoch": 0.08603683897801545, + "grad_norm": 0.7785114836804949, + "learning_rate": 1.99920812766626e-05, + "loss": 0.9447, + "num_tokens": 3026099095.0, + "step": 724 + }, + { + "epoch": 0.08615567439096851, + "grad_norm": 0.9658116431128839, + "learning_rate": 1.999200847197682e-05, + "loss": 0.9877, + "num_tokens": 3030289549.0, + "step": 725 + }, + { + "epoch": 0.08627450980392157, + "grad_norm": 0.7866218527709771, + "learning_rate": 1.999193533428657e-05, + "loss": 1.0536, + "num_tokens": 3034476950.0, + "step": 726 + }, + { + "epoch": 0.08639334521687463, + "grad_norm": 1.0509309081254898, + "learning_rate": 1.999186186359455e-05, + "loss": 0.9907, + "num_tokens": 3038654066.0, + "step": 727 + }, + { + "epoch": 0.08651218062982768, + "grad_norm": 0.748396918013241, + "learning_rate": 1.9991788059903486e-05, + "loss": 0.9293, + "num_tokens": 3042837135.0, + "step": 728 + }, + { + "epoch": 0.08663101604278074, + "grad_norm": 0.753794905688729, + "learning_rate": 1.9991713923216103e-05, + "loss": 0.9854, + "num_tokens": 3046997793.0, + "step": 729 + }, + { + "epoch": 0.08674985145573381, + "grad_norm": 0.9508728574609998, + "learning_rate": 1.9991639453535155e-05, + "loss": 0.9775, + "num_tokens": 3051185312.0, + "step": 730 + }, + { + "epoch": 0.08686868686868687, + "grad_norm": 0.8009920171118428, + "learning_rate": 1.9991564650863395e-05, + "loss": 0.9526, + "num_tokens": 3055349040.0, + "step": 731 + }, + { + "epoch": 0.08698752228163993, + "grad_norm": 0.9641775872122788, + "learning_rate": 1.9991489515203596e-05, + "loss": 0.9714, + "num_tokens": 3059539366.0, + "step": 732 + }, + { + "epoch": 0.08710635769459299, + "grad_norm": 0.7507082125280816, + "learning_rate": 1.9991414046558536e-05, + "loss": 0.9682, + "num_tokens": 3063724046.0, + "step": 733 + }, + { + "epoch": 0.08722519310754605, + "grad_norm": 0.8203135078574947, + "learning_rate": 1.999133824493102e-05, + "loss": 0.9941, + "num_tokens": 3067912359.0, + "step": 734 + }, + { + "epoch": 0.0873440285204991, + "grad_norm": 0.8745198367725862, + "learning_rate": 1.9991262110323845e-05, + "loss": 0.9819, + "num_tokens": 3072076740.0, + "step": 735 + }, + { + "epoch": 0.08746286393345216, + "grad_norm": 0.8774414340081887, + "learning_rate": 1.9991185642739834e-05, + "loss": 0.9921, + "num_tokens": 3076264745.0, + "step": 736 + }, + { + "epoch": 0.08758169934640522, + "grad_norm": 0.7548615994512958, + "learning_rate": 1.9991108842181814e-05, + "loss": 0.9813, + "num_tokens": 3080425220.0, + "step": 737 + }, + { + "epoch": 0.0877005347593583, + "grad_norm": 0.853859257030167, + "learning_rate": 1.9991031708652636e-05, + "loss": 0.9895, + "num_tokens": 3084589977.0, + "step": 738 + }, + { + "epoch": 0.08781937017231135, + "grad_norm": 0.8646938037845462, + "learning_rate": 1.999095424215516e-05, + "loss": 0.9571, + "num_tokens": 3088753068.0, + "step": 739 + }, + { + "epoch": 0.08793820558526441, + "grad_norm": 0.9659670052627928, + "learning_rate": 1.9990876442692245e-05, + "loss": 1.0012, + "num_tokens": 3092942858.0, + "step": 740 + }, + { + "epoch": 0.08805704099821747, + "grad_norm": 0.8098972629391531, + "learning_rate": 1.9990798310266776e-05, + "loss": 0.9413, + "num_tokens": 3097132536.0, + "step": 741 + }, + { + "epoch": 0.08817587641117053, + "grad_norm": 0.9376616035452747, + "learning_rate": 1.999071984488165e-05, + "loss": 0.9738, + "num_tokens": 3101322698.0, + "step": 742 + }, + { + "epoch": 0.08829471182412359, + "grad_norm": 0.6509785790541581, + "learning_rate": 1.9990641046539768e-05, + "loss": 0.9818, + "num_tokens": 3105509860.0, + "step": 743 + }, + { + "epoch": 0.08841354723707665, + "grad_norm": 1.1752662011526247, + "learning_rate": 1.9990561915244048e-05, + "loss": 0.9826, + "num_tokens": 3109698054.0, + "step": 744 + }, + { + "epoch": 0.0885323826500297, + "grad_norm": 0.7407201616578195, + "learning_rate": 1.9990482450997426e-05, + "loss": 0.9716, + "num_tokens": 3113855673.0, + "step": 745 + }, + { + "epoch": 0.08865121806298278, + "grad_norm": 0.8827858299101906, + "learning_rate": 1.999040265380284e-05, + "loss": 0.9769, + "num_tokens": 3118045382.0, + "step": 746 + }, + { + "epoch": 0.08877005347593583, + "grad_norm": 0.8073528003291093, + "learning_rate": 1.9990322523663243e-05, + "loss": 1.0196, + "num_tokens": 3122210420.0, + "step": 747 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 1.0710513152743462, + "learning_rate": 1.999024206058161e-05, + "loss": 0.9982, + "num_tokens": 3126398791.0, + "step": 748 + }, + { + "epoch": 0.08900772430184195, + "grad_norm": 0.7601631414495912, + "learning_rate": 1.9990161264560916e-05, + "loss": 0.9985, + "num_tokens": 3130564032.0, + "step": 749 + }, + { + "epoch": 0.08912655971479501, + "grad_norm": 0.8915972061066586, + "learning_rate": 1.9990080135604156e-05, + "loss": 1.0165, + "num_tokens": 3134752025.0, + "step": 750 + }, + { + "epoch": 0.08924539512774807, + "grad_norm": 0.7826461630862808, + "learning_rate": 1.9989998673714327e-05, + "loss": 0.9975, + "num_tokens": 3138907014.0, + "step": 751 + }, + { + "epoch": 0.08936423054070113, + "grad_norm": 0.9368692442460843, + "learning_rate": 1.9989916878894458e-05, + "loss": 0.9851, + "num_tokens": 3143069470.0, + "step": 752 + }, + { + "epoch": 0.08948306595365418, + "grad_norm": 0.7788032694387118, + "learning_rate": 1.9989834751147567e-05, + "loss": 0.9972, + "num_tokens": 3147207014.0, + "step": 753 + }, + { + "epoch": 0.08960190136660724, + "grad_norm": 0.8364889767916424, + "learning_rate": 1.99897522904767e-05, + "loss": 0.9394, + "num_tokens": 3151394606.0, + "step": 754 + }, + { + "epoch": 0.08972073677956031, + "grad_norm": 0.880095712116036, + "learning_rate": 1.998966949688491e-05, + "loss": 1.0094, + "num_tokens": 3155556432.0, + "step": 755 + }, + { + "epoch": 0.08983957219251337, + "grad_norm": 0.9665589597953431, + "learning_rate": 1.9989586370375264e-05, + "loss": 0.9981, + "num_tokens": 3159731142.0, + "step": 756 + }, + { + "epoch": 0.08995840760546643, + "grad_norm": 0.8617206702533924, + "learning_rate": 1.998950291095084e-05, + "loss": 0.9946, + "num_tokens": 3163920352.0, + "step": 757 + }, + { + "epoch": 0.09007724301841949, + "grad_norm": 0.9537897770347452, + "learning_rate": 1.9989419118614734e-05, + "loss": 1.0383, + "num_tokens": 3168094326.0, + "step": 758 + }, + { + "epoch": 0.09019607843137255, + "grad_norm": 0.9643746366446022, + "learning_rate": 1.998933499337004e-05, + "loss": 0.982, + "num_tokens": 3172282917.0, + "step": 759 + }, + { + "epoch": 0.0903149138443256, + "grad_norm": 0.6971087089949083, + "learning_rate": 1.9989250535219874e-05, + "loss": 0.9644, + "num_tokens": 3176437323.0, + "step": 760 + }, + { + "epoch": 0.09043374925727866, + "grad_norm": 1.0164888806945118, + "learning_rate": 1.998916574416737e-05, + "loss": 0.9984, + "num_tokens": 3180619829.0, + "step": 761 + }, + { + "epoch": 0.09055258467023172, + "grad_norm": 0.8242747084952322, + "learning_rate": 1.9989080620215667e-05, + "loss": 0.9415, + "num_tokens": 3184806757.0, + "step": 762 + }, + { + "epoch": 0.0906714200831848, + "grad_norm": 1.0126699489487279, + "learning_rate": 1.998899516336792e-05, + "loss": 0.9777, + "num_tokens": 3188985021.0, + "step": 763 + }, + { + "epoch": 0.09079025549613785, + "grad_norm": 0.7061987402203622, + "learning_rate": 1.9988909373627282e-05, + "loss": 0.9813, + "num_tokens": 3193140742.0, + "step": 764 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 0.970637785973205, + "learning_rate": 1.998882325099694e-05, + "loss": 0.9781, + "num_tokens": 3197329120.0, + "step": 765 + }, + { + "epoch": 0.09102792632204397, + "grad_norm": 0.7213452718601799, + "learning_rate": 1.998873679548008e-05, + "loss": 0.9869, + "num_tokens": 3201516764.0, + "step": 766 + }, + { + "epoch": 0.09114676173499703, + "grad_norm": 0.8864700969456814, + "learning_rate": 1.998865000707991e-05, + "loss": 0.9669, + "num_tokens": 3205699417.0, + "step": 767 + }, + { + "epoch": 0.09126559714795009, + "grad_norm": 0.8739166024564046, + "learning_rate": 1.9988562885799632e-05, + "loss": 0.9759, + "num_tokens": 3209857879.0, + "step": 768 + }, + { + "epoch": 0.09138443256090314, + "grad_norm": 0.7677000977807592, + "learning_rate": 1.9988475431642486e-05, + "loss": 0.9523, + "num_tokens": 3214046581.0, + "step": 769 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 0.8871580734160754, + "learning_rate": 1.99883876446117e-05, + "loss": 0.9943, + "num_tokens": 3218236423.0, + "step": 770 + }, + { + "epoch": 0.09162210338680928, + "grad_norm": 0.8953138733669775, + "learning_rate": 1.9988299524710528e-05, + "loss": 1.0249, + "num_tokens": 3222425526.0, + "step": 771 + }, + { + "epoch": 0.09174093879976233, + "grad_norm": 0.7483968920865591, + "learning_rate": 1.9988211071942235e-05, + "loss": 0.9701, + "num_tokens": 3226615016.0, + "step": 772 + }, + { + "epoch": 0.09185977421271539, + "grad_norm": 0.7708479954216567, + "learning_rate": 1.99881222863101e-05, + "loss": 0.98, + "num_tokens": 3230803925.0, + "step": 773 + }, + { + "epoch": 0.09197860962566845, + "grad_norm": 0.9724216915481813, + "learning_rate": 1.9988033167817406e-05, + "loss": 0.9671, + "num_tokens": 3234961672.0, + "step": 774 + }, + { + "epoch": 0.09209744503862151, + "grad_norm": 0.7346917238605186, + "learning_rate": 1.998794371646745e-05, + "loss": 1.0225, + "num_tokens": 3239119869.0, + "step": 775 + }, + { + "epoch": 0.09221628045157457, + "grad_norm": 0.9021431094034519, + "learning_rate": 1.998785393226355e-05, + "loss": 0.9717, + "num_tokens": 3243263663.0, + "step": 776 + }, + { + "epoch": 0.09233511586452763, + "grad_norm": 0.7702168168783676, + "learning_rate": 1.9987763815209032e-05, + "loss": 0.9731, + "num_tokens": 3247433762.0, + "step": 777 + }, + { + "epoch": 0.09245395127748068, + "grad_norm": 0.7349553377982673, + "learning_rate": 1.9987673365307236e-05, + "loss": 0.9877, + "num_tokens": 3251622932.0, + "step": 778 + }, + { + "epoch": 0.09257278669043376, + "grad_norm": 0.9492095352572457, + "learning_rate": 1.99875825825615e-05, + "loss": 0.9786, + "num_tokens": 3255812128.0, + "step": 779 + }, + { + "epoch": 0.09269162210338681, + "grad_norm": 0.7192016534557254, + "learning_rate": 1.9987491466975196e-05, + "loss": 0.9913, + "num_tokens": 3260001096.0, + "step": 780 + }, + { + "epoch": 0.09281045751633987, + "grad_norm": 1.0216072887104792, + "learning_rate": 1.99874000185517e-05, + "loss": 0.9774, + "num_tokens": 3264191949.0, + "step": 781 + }, + { + "epoch": 0.09292929292929293, + "grad_norm": 0.7919519968550489, + "learning_rate": 1.998730823729439e-05, + "loss": 0.9771, + "num_tokens": 3268382021.0, + "step": 782 + }, + { + "epoch": 0.09304812834224599, + "grad_norm": 0.8903727311784021, + "learning_rate": 1.9987216123206666e-05, + "loss": 1.0096, + "num_tokens": 3272570813.0, + "step": 783 + }, + { + "epoch": 0.09316696375519905, + "grad_norm": 0.8385156033259328, + "learning_rate": 1.998712367629195e-05, + "loss": 0.9745, + "num_tokens": 3276759944.0, + "step": 784 + }, + { + "epoch": 0.0932857991681521, + "grad_norm": 0.9374726617684337, + "learning_rate": 1.9987030896553653e-05, + "loss": 0.9502, + "num_tokens": 3280949676.0, + "step": 785 + }, + { + "epoch": 0.09340463458110516, + "grad_norm": 0.759326136676509, + "learning_rate": 1.998693778399522e-05, + "loss": 0.9914, + "num_tokens": 3285138240.0, + "step": 786 + }, + { + "epoch": 0.09352346999405822, + "grad_norm": 1.0121585505158641, + "learning_rate": 1.9986844338620086e-05, + "loss": 0.9647, + "num_tokens": 3289301350.0, + "step": 787 + }, + { + "epoch": 0.0936423054070113, + "grad_norm": 0.7819438603217598, + "learning_rate": 1.9986750560431726e-05, + "loss": 0.971, + "num_tokens": 3293446602.0, + "step": 788 + }, + { + "epoch": 0.09376114081996435, + "grad_norm": 0.9578589383526377, + "learning_rate": 1.998665644943361e-05, + "loss": 0.9552, + "num_tokens": 3297602994.0, + "step": 789 + }, + { + "epoch": 0.09387997623291741, + "grad_norm": 0.7134468867896002, + "learning_rate": 1.998656200562922e-05, + "loss": 1.0001, + "num_tokens": 3301790859.0, + "step": 790 + }, + { + "epoch": 0.09399881164587047, + "grad_norm": 1.0853548481918864, + "learning_rate": 1.9986467229022055e-05, + "loss": 1.0214, + "num_tokens": 3305957628.0, + "step": 791 + }, + { + "epoch": 0.09411764705882353, + "grad_norm": 0.8730224008540879, + "learning_rate": 1.9986372119615624e-05, + "loss": 0.991, + "num_tokens": 3310146028.0, + "step": 792 + }, + { + "epoch": 0.09423648247177659, + "grad_norm": 0.7659943186768062, + "learning_rate": 1.998627667741345e-05, + "loss": 0.9751, + "num_tokens": 3314332951.0, + "step": 793 + }, + { + "epoch": 0.09435531788472964, + "grad_norm": 0.8476417054633055, + "learning_rate": 1.9986180902419067e-05, + "loss": 0.9451, + "num_tokens": 3318523147.0, + "step": 794 + }, + { + "epoch": 0.0944741532976827, + "grad_norm": 0.8212975267023064, + "learning_rate": 1.9986084794636022e-05, + "loss": 0.9891, + "num_tokens": 3322696447.0, + "step": 795 + }, + { + "epoch": 0.09459298871063578, + "grad_norm": 0.8305018646586093, + "learning_rate": 1.9985988354067875e-05, + "loss": 0.9232, + "num_tokens": 3326886492.0, + "step": 796 + }, + { + "epoch": 0.09471182412358883, + "grad_norm": 0.8686056577042471, + "learning_rate": 1.99858915807182e-05, + "loss": 0.9545, + "num_tokens": 3331076615.0, + "step": 797 + }, + { + "epoch": 0.09483065953654189, + "grad_norm": 0.7594593185325395, + "learning_rate": 1.9985794474590573e-05, + "loss": 0.9761, + "num_tokens": 3335221384.0, + "step": 798 + }, + { + "epoch": 0.09494949494949495, + "grad_norm": 0.8447679882775643, + "learning_rate": 1.9985697035688597e-05, + "loss": 1.0011, + "num_tokens": 3339409708.0, + "step": 799 + }, + { + "epoch": 0.09506833036244801, + "grad_norm": 0.7790406034479882, + "learning_rate": 1.9985599264015878e-05, + "loss": 1.0156, + "num_tokens": 3343561199.0, + "step": 800 + }, + { + "epoch": 0.09518716577540107, + "grad_norm": 0.9605907711339373, + "learning_rate": 1.9985501159576037e-05, + "loss": 0.993, + "num_tokens": 3347733868.0, + "step": 801 + }, + { + "epoch": 0.09530600118835413, + "grad_norm": 0.8226913121843273, + "learning_rate": 1.9985402722372705e-05, + "loss": 1.0005, + "num_tokens": 3351919328.0, + "step": 802 + }, + { + "epoch": 0.09542483660130718, + "grad_norm": 0.8178179471428424, + "learning_rate": 1.9985303952409533e-05, + "loss": 0.9701, + "num_tokens": 3356104043.0, + "step": 803 + }, + { + "epoch": 0.09554367201426026, + "grad_norm": 0.7861183749602786, + "learning_rate": 1.9985204849690175e-05, + "loss": 0.9715, + "num_tokens": 3360292570.0, + "step": 804 + }, + { + "epoch": 0.09566250742721331, + "grad_norm": 1.0120908217867022, + "learning_rate": 1.99851054142183e-05, + "loss": 1.0097, + "num_tokens": 3364481225.0, + "step": 805 + }, + { + "epoch": 0.09578134284016637, + "grad_norm": 0.7563283805123195, + "learning_rate": 1.9985005645997595e-05, + "loss": 1.0051, + "num_tokens": 3368639121.0, + "step": 806 + }, + { + "epoch": 0.09590017825311943, + "grad_norm": 0.843188966493132, + "learning_rate": 1.998490554503175e-05, + "loss": 0.9592, + "num_tokens": 3372820879.0, + "step": 807 + }, + { + "epoch": 0.09601901366607249, + "grad_norm": 0.7717362001154708, + "learning_rate": 1.9984805111324476e-05, + "loss": 1.0016, + "num_tokens": 3376978417.0, + "step": 808 + }, + { + "epoch": 0.09613784907902555, + "grad_norm": 0.8840572554063915, + "learning_rate": 1.9984704344879486e-05, + "loss": 1.0101, + "num_tokens": 3381167434.0, + "step": 809 + }, + { + "epoch": 0.0962566844919786, + "grad_norm": 0.678266344874382, + "learning_rate": 1.998460324570052e-05, + "loss": 0.9881, + "num_tokens": 3385355913.0, + "step": 810 + }, + { + "epoch": 0.09637551990493166, + "grad_norm": 0.8681001484675436, + "learning_rate": 1.9984501813791316e-05, + "loss": 0.9911, + "num_tokens": 3389533934.0, + "step": 811 + }, + { + "epoch": 0.09649435531788474, + "grad_norm": 0.9781244640598047, + "learning_rate": 1.9984400049155634e-05, + "loss": 0.9916, + "num_tokens": 3393724739.0, + "step": 812 + }, + { + "epoch": 0.0966131907308378, + "grad_norm": 0.8002396512146893, + "learning_rate": 1.998429795179724e-05, + "loss": 0.9435, + "num_tokens": 3397892442.0, + "step": 813 + }, + { + "epoch": 0.09673202614379085, + "grad_norm": 0.785108681087386, + "learning_rate": 1.998419552171992e-05, + "loss": 0.981, + "num_tokens": 3402057848.0, + "step": 814 + }, + { + "epoch": 0.09685086155674391, + "grad_norm": 0.8220882605951773, + "learning_rate": 1.998409275892746e-05, + "loss": 0.9611, + "num_tokens": 3406232094.0, + "step": 815 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 0.8412575128374873, + "learning_rate": 1.9983989663423672e-05, + "loss": 0.9601, + "num_tokens": 3410420330.0, + "step": 816 + }, + { + "epoch": 0.09708853238265003, + "grad_norm": 0.7697415932268472, + "learning_rate": 1.9983886235212368e-05, + "loss": 0.9813, + "num_tokens": 3414581033.0, + "step": 817 + }, + { + "epoch": 0.09720736779560309, + "grad_norm": 0.9323226486833662, + "learning_rate": 1.998378247429738e-05, + "loss": 0.9417, + "num_tokens": 3418770037.0, + "step": 818 + }, + { + "epoch": 0.09732620320855614, + "grad_norm": 0.8786328195544582, + "learning_rate": 1.9983678380682556e-05, + "loss": 0.9674, + "num_tokens": 3422939857.0, + "step": 819 + }, + { + "epoch": 0.0974450386215092, + "grad_norm": 0.7116494452883586, + "learning_rate": 1.9983573954371746e-05, + "loss": 0.9993, + "num_tokens": 3427099533.0, + "step": 820 + }, + { + "epoch": 0.09756387403446228, + "grad_norm": 0.8317094985938485, + "learning_rate": 1.9983469195368818e-05, + "loss": 0.9603, + "num_tokens": 3431283020.0, + "step": 821 + }, + { + "epoch": 0.09768270944741533, + "grad_norm": 0.8869650523389624, + "learning_rate": 1.998336410367765e-05, + "loss": 0.9802, + "num_tokens": 3435450091.0, + "step": 822 + }, + { + "epoch": 0.09780154486036839, + "grad_norm": 0.7032346654128587, + "learning_rate": 1.998325867930214e-05, + "loss": 0.9984, + "num_tokens": 3439638320.0, + "step": 823 + }, + { + "epoch": 0.09792038027332145, + "grad_norm": 0.7757778543765145, + "learning_rate": 1.9983152922246184e-05, + "loss": 0.9862, + "num_tokens": 3443827032.0, + "step": 824 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.01345433702807, + "learning_rate": 1.9983046832513706e-05, + "loss": 0.9205, + "num_tokens": 3448016551.0, + "step": 825 + }, + { + "epoch": 0.09815805109922757, + "grad_norm": 0.7573779640355437, + "learning_rate": 1.9982940410108627e-05, + "loss": 1.0339, + "num_tokens": 3452147502.0, + "step": 826 + }, + { + "epoch": 0.09827688651218063, + "grad_norm": 0.9610861065915691, + "learning_rate": 1.9982833655034896e-05, + "loss": 0.971, + "num_tokens": 3456332935.0, + "step": 827 + }, + { + "epoch": 0.09839572192513368, + "grad_norm": 0.7238334699581038, + "learning_rate": 1.9982726567296463e-05, + "loss": 0.997, + "num_tokens": 3460518454.0, + "step": 828 + }, + { + "epoch": 0.09851455733808676, + "grad_norm": 0.85557740917451, + "learning_rate": 1.998261914689729e-05, + "loss": 1.0147, + "num_tokens": 3464708067.0, + "step": 829 + }, + { + "epoch": 0.09863339275103981, + "grad_norm": 0.6971372234241576, + "learning_rate": 1.9982511393841366e-05, + "loss": 0.9987, + "num_tokens": 3468898020.0, + "step": 830 + }, + { + "epoch": 0.09875222816399287, + "grad_norm": 0.9730495586987788, + "learning_rate": 1.9982403308132668e-05, + "loss": 0.9726, + "num_tokens": 3473087736.0, + "step": 831 + }, + { + "epoch": 0.09887106357694593, + "grad_norm": 0.8990991429077071, + "learning_rate": 1.998229488977521e-05, + "loss": 0.9722, + "num_tokens": 3477251788.0, + "step": 832 + }, + { + "epoch": 0.09898989898989899, + "grad_norm": 0.7595892620169997, + "learning_rate": 1.9982186138772997e-05, + "loss": 0.9793, + "num_tokens": 3481426178.0, + "step": 833 + }, + { + "epoch": 0.09910873440285205, + "grad_norm": 0.8952586454157471, + "learning_rate": 1.9982077055130068e-05, + "loss": 0.9933, + "num_tokens": 3485616274.0, + "step": 834 + }, + { + "epoch": 0.0992275698158051, + "grad_norm": 0.9153471425002477, + "learning_rate": 1.9981967638850452e-05, + "loss": 0.9931, + "num_tokens": 3489777779.0, + "step": 835 + }, + { + "epoch": 0.09934640522875816, + "grad_norm": 0.7265025292276248, + "learning_rate": 1.9981857889938207e-05, + "loss": 0.9299, + "num_tokens": 3493966994.0, + "step": 836 + }, + { + "epoch": 0.09946524064171124, + "grad_norm": 0.8125107389241452, + "learning_rate": 1.9981747808397393e-05, + "loss": 0.9568, + "num_tokens": 3498155239.0, + "step": 837 + }, + { + "epoch": 0.0995840760546643, + "grad_norm": 0.7850794398837204, + "learning_rate": 1.9981637394232095e-05, + "loss": 0.9634, + "num_tokens": 3502343525.0, + "step": 838 + }, + { + "epoch": 0.09970291146761735, + "grad_norm": 0.8267178684555512, + "learning_rate": 1.9981526647446394e-05, + "loss": 0.9763, + "num_tokens": 3506533238.0, + "step": 839 + }, + { + "epoch": 0.09982174688057041, + "grad_norm": 0.8818570478959105, + "learning_rate": 1.9981415568044397e-05, + "loss": 0.9784, + "num_tokens": 3510722442.0, + "step": 840 + }, + { + "epoch": 0.09994058229352347, + "grad_norm": 0.7272707090215114, + "learning_rate": 1.998130415603021e-05, + "loss": 0.9931, + "num_tokens": 3514893633.0, + "step": 841 + }, + { + "epoch": 0.10005941770647653, + "grad_norm": 0.981251849240853, + "learning_rate": 1.9981192411407967e-05, + "loss": 0.9946, + "num_tokens": 3519081166.0, + "step": 842 + }, + { + "epoch": 0.10017825311942959, + "grad_norm": 0.600962431380547, + "learning_rate": 1.99810803341818e-05, + "loss": 0.9547, + "num_tokens": 3523261824.0, + "step": 843 + }, + { + "epoch": 0.10029708853238264, + "grad_norm": 1.0304880697124563, + "learning_rate": 1.9980967924355865e-05, + "loss": 0.9841, + "num_tokens": 3527437136.0, + "step": 844 + }, + { + "epoch": 0.10041592394533572, + "grad_norm": 0.7767174046609356, + "learning_rate": 1.998085518193432e-05, + "loss": 0.9801, + "num_tokens": 3531625841.0, + "step": 845 + }, + { + "epoch": 0.10053475935828877, + "grad_norm": 1.020914027204615, + "learning_rate": 1.9980742106921343e-05, + "loss": 0.9677, + "num_tokens": 3535815073.0, + "step": 846 + }, + { + "epoch": 0.10065359477124183, + "grad_norm": 0.7891093530759892, + "learning_rate": 1.9980628699321124e-05, + "loss": 0.9582, + "num_tokens": 3540005013.0, + "step": 847 + }, + { + "epoch": 0.10077243018419489, + "grad_norm": 0.7631099367875016, + "learning_rate": 1.9980514959137857e-05, + "loss": 0.9458, + "num_tokens": 3544195814.0, + "step": 848 + }, + { + "epoch": 0.10089126559714795, + "grad_norm": 0.7170256594292476, + "learning_rate": 1.9980400886375757e-05, + "loss": 0.9854, + "num_tokens": 3548368508.0, + "step": 849 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 1.0369861085608094, + "learning_rate": 1.998028648103905e-05, + "loss": 0.9974, + "num_tokens": 3552556308.0, + "step": 850 + }, + { + "epoch": 0.10112893642305407, + "grad_norm": 0.7771102064083919, + "learning_rate": 1.9980171743131972e-05, + "loss": 0.9869, + "num_tokens": 3556738613.0, + "step": 851 + }, + { + "epoch": 0.10124777183600712, + "grad_norm": 0.6820350082223943, + "learning_rate": 1.9980056672658772e-05, + "loss": 1.0062, + "num_tokens": 3560925694.0, + "step": 852 + }, + { + "epoch": 0.10136660724896018, + "grad_norm": 0.8717715919440441, + "learning_rate": 1.997994126962371e-05, + "loss": 0.9443, + "num_tokens": 3565092709.0, + "step": 853 + }, + { + "epoch": 0.10148544266191326, + "grad_norm": 0.770914740860196, + "learning_rate": 1.9979825534031064e-05, + "loss": 0.9903, + "num_tokens": 3569281634.0, + "step": 854 + }, + { + "epoch": 0.10160427807486631, + "grad_norm": 0.8958723897247893, + "learning_rate": 1.997970946588511e-05, + "loss": 0.9658, + "num_tokens": 3573442590.0, + "step": 855 + }, + { + "epoch": 0.10172311348781937, + "grad_norm": 0.8807113050587959, + "learning_rate": 1.9979593065190158e-05, + "loss": 0.9835, + "num_tokens": 3577631157.0, + "step": 856 + }, + { + "epoch": 0.10184194890077243, + "grad_norm": 0.6521756816870385, + "learning_rate": 1.9979476331950516e-05, + "loss": 0.9564, + "num_tokens": 3581799246.0, + "step": 857 + }, + { + "epoch": 0.10196078431372549, + "grad_norm": 1.111538476060259, + "learning_rate": 1.99793592661705e-05, + "loss": 0.9858, + "num_tokens": 3585988989.0, + "step": 858 + }, + { + "epoch": 0.10207961972667855, + "grad_norm": 0.7040036626657048, + "learning_rate": 1.9979241867854456e-05, + "loss": 0.9884, + "num_tokens": 3590105859.0, + "step": 859 + }, + { + "epoch": 0.1021984551396316, + "grad_norm": 0.7040864627524575, + "learning_rate": 1.997912413700672e-05, + "loss": 0.9831, + "num_tokens": 3594279194.0, + "step": 860 + }, + { + "epoch": 0.10231729055258466, + "grad_norm": 0.8306407204905412, + "learning_rate": 1.9979006073631665e-05, + "loss": 0.9384, + "num_tokens": 3598426743.0, + "step": 861 + }, + { + "epoch": 0.10243612596553774, + "grad_norm": 0.7930424269709083, + "learning_rate": 1.997888767773365e-05, + "loss": 0.9731, + "num_tokens": 3602587079.0, + "step": 862 + }, + { + "epoch": 0.1025549613784908, + "grad_norm": 0.9729015732159827, + "learning_rate": 1.997876894931707e-05, + "loss": 0.9723, + "num_tokens": 3606776733.0, + "step": 863 + }, + { + "epoch": 0.10267379679144385, + "grad_norm": 0.7092011018162888, + "learning_rate": 1.9978649888386315e-05, + "loss": 0.958, + "num_tokens": 3610928170.0, + "step": 864 + }, + { + "epoch": 0.10279263220439691, + "grad_norm": 1.0008082023600569, + "learning_rate": 1.99785304949458e-05, + "loss": 0.9819, + "num_tokens": 3615091249.0, + "step": 865 + }, + { + "epoch": 0.10291146761734997, + "grad_norm": 0.795639632553202, + "learning_rate": 1.997841076899994e-05, + "loss": 0.9248, + "num_tokens": 3619272686.0, + "step": 866 + }, + { + "epoch": 0.10303030303030303, + "grad_norm": 0.8311306071808098, + "learning_rate": 1.9978290710553175e-05, + "loss": 0.9617, + "num_tokens": 3623462455.0, + "step": 867 + }, + { + "epoch": 0.10314913844325609, + "grad_norm": 0.7357672975941393, + "learning_rate": 1.997817031960995e-05, + "loss": 0.9944, + "num_tokens": 3627651504.0, + "step": 868 + }, + { + "epoch": 0.10326797385620914, + "grad_norm": 0.7122473829526649, + "learning_rate": 1.997804959617472e-05, + "loss": 0.9869, + "num_tokens": 3631840860.0, + "step": 869 + }, + { + "epoch": 0.10338680926916222, + "grad_norm": 0.8059718251608216, + "learning_rate": 1.997792854025196e-05, + "loss": 0.9715, + "num_tokens": 3636030422.0, + "step": 870 + }, + { + "epoch": 0.10350564468211527, + "grad_norm": 0.7823862549517516, + "learning_rate": 1.997780715184615e-05, + "loss": 1.0114, + "num_tokens": 3640195379.0, + "step": 871 + }, + { + "epoch": 0.10362448009506833, + "grad_norm": 0.9246022815808393, + "learning_rate": 1.9977685430961784e-05, + "loss": 0.9904, + "num_tokens": 3644376205.0, + "step": 872 + }, + { + "epoch": 0.10374331550802139, + "grad_norm": 0.6329869285397705, + "learning_rate": 1.9977563377603377e-05, + "loss": 1.0105, + "num_tokens": 3648565605.0, + "step": 873 + }, + { + "epoch": 0.10386215092097445, + "grad_norm": 0.8948470186717428, + "learning_rate": 1.9977440991775437e-05, + "loss": 0.967, + "num_tokens": 3652756001.0, + "step": 874 + }, + { + "epoch": 0.10398098633392751, + "grad_norm": 0.8320274057289039, + "learning_rate": 1.9977318273482512e-05, + "loss": 0.987, + "num_tokens": 3656898210.0, + "step": 875 + }, + { + "epoch": 0.10409982174688057, + "grad_norm": 0.7441123779657239, + "learning_rate": 1.997719522272913e-05, + "loss": 0.9796, + "num_tokens": 3661066867.0, + "step": 876 + }, + { + "epoch": 0.10421865715983362, + "grad_norm": 1.0045803262471382, + "learning_rate": 1.9977071839519863e-05, + "loss": 0.9497, + "num_tokens": 3665257547.0, + "step": 877 + }, + { + "epoch": 0.1043374925727867, + "grad_norm": 0.6531357094652689, + "learning_rate": 1.9976948123859273e-05, + "loss": 0.964, + "num_tokens": 3669444216.0, + "step": 878 + }, + { + "epoch": 0.10445632798573976, + "grad_norm": 0.7245539904297431, + "learning_rate": 1.997682407575194e-05, + "loss": 0.9776, + "num_tokens": 3673628489.0, + "step": 879 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.8443661040761561, + "learning_rate": 1.9976699695202463e-05, + "loss": 0.9511, + "num_tokens": 3677818138.0, + "step": 880 + }, + { + "epoch": 0.10469399881164587, + "grad_norm": 0.7846376111901664, + "learning_rate": 1.9976574982215438e-05, + "loss": 0.9789, + "num_tokens": 3682006238.0, + "step": 881 + }, + { + "epoch": 0.10481283422459893, + "grad_norm": 0.8850506361424345, + "learning_rate": 1.9976449936795497e-05, + "loss": 1.0158, + "num_tokens": 3686195920.0, + "step": 882 + }, + { + "epoch": 0.10493166963755199, + "grad_norm": 0.9405393942764589, + "learning_rate": 1.9976324558947267e-05, + "loss": 0.988, + "num_tokens": 3690385455.0, + "step": 883 + }, + { + "epoch": 0.10505050505050505, + "grad_norm": 0.6376448747367858, + "learning_rate": 1.9976198848675383e-05, + "loss": 0.9425, + "num_tokens": 3694544714.0, + "step": 884 + }, + { + "epoch": 0.1051693404634581, + "grad_norm": 0.9928386493669294, + "learning_rate": 1.997607280598451e-05, + "loss": 0.9439, + "num_tokens": 3698707790.0, + "step": 885 + }, + { + "epoch": 0.10528817587641116, + "grad_norm": 0.5990494010750665, + "learning_rate": 1.9975946430879313e-05, + "loss": 0.9965, + "num_tokens": 3702879764.0, + "step": 886 + }, + { + "epoch": 0.10540701128936424, + "grad_norm": 1.1232885789850042, + "learning_rate": 1.9975819723364468e-05, + "loss": 0.9643, + "num_tokens": 3707068357.0, + "step": 887 + }, + { + "epoch": 0.1055258467023173, + "grad_norm": 0.6856682518648873, + "learning_rate": 1.997569268344467e-05, + "loss": 0.9753, + "num_tokens": 3711257393.0, + "step": 888 + }, + { + "epoch": 0.10564468211527035, + "grad_norm": 0.9882611061518417, + "learning_rate": 1.9975565311124627e-05, + "loss": 0.965, + "num_tokens": 3715445845.0, + "step": 889 + }, + { + "epoch": 0.10576351752822341, + "grad_norm": 0.6777737261881038, + "learning_rate": 1.9975437606409054e-05, + "loss": 0.9592, + "num_tokens": 3719636247.0, + "step": 890 + }, + { + "epoch": 0.10588235294117647, + "grad_norm": 0.8607237823431101, + "learning_rate": 1.9975309569302678e-05, + "loss": 0.9407, + "num_tokens": 3723802522.0, + "step": 891 + }, + { + "epoch": 0.10600118835412953, + "grad_norm": 0.7714434339739894, + "learning_rate": 1.9975181199810244e-05, + "loss": 0.9981, + "num_tokens": 3727991112.0, + "step": 892 + }, + { + "epoch": 0.10612002376708259, + "grad_norm": 0.7922825396490928, + "learning_rate": 1.9975052497936504e-05, + "loss": 0.9558, + "num_tokens": 3732149787.0, + "step": 893 + }, + { + "epoch": 0.10623885918003564, + "grad_norm": 0.7866059158880009, + "learning_rate": 1.9974923463686222e-05, + "loss": 0.9659, + "num_tokens": 3736339758.0, + "step": 894 + }, + { + "epoch": 0.10635769459298872, + "grad_norm": 0.8907016519860648, + "learning_rate": 1.997479409706418e-05, + "loss": 0.9484, + "num_tokens": 3740514703.0, + "step": 895 + }, + { + "epoch": 0.10647653000594177, + "grad_norm": 0.8496368510734497, + "learning_rate": 1.9974664398075174e-05, + "loss": 0.9426, + "num_tokens": 3744703100.0, + "step": 896 + }, + { + "epoch": 0.10659536541889483, + "grad_norm": 0.821555745538996, + "learning_rate": 1.997453436672399e-05, + "loss": 0.9763, + "num_tokens": 3748867066.0, + "step": 897 + }, + { + "epoch": 0.10671420083184789, + "grad_norm": 0.6573573676992018, + "learning_rate": 1.9974404003015465e-05, + "loss": 0.9697, + "num_tokens": 3753024644.0, + "step": 898 + }, + { + "epoch": 0.10683303624480095, + "grad_norm": 0.7712597051011916, + "learning_rate": 1.9974273306954412e-05, + "loss": 1.0097, + "num_tokens": 3757213937.0, + "step": 899 + }, + { + "epoch": 0.10695187165775401, + "grad_norm": 0.9811791690720862, + "learning_rate": 1.9974142278545677e-05, + "loss": 0.9789, + "num_tokens": 3761361844.0, + "step": 900 + }, + { + "epoch": 0.10707070707070707, + "grad_norm": 0.7636357913311981, + "learning_rate": 1.997401091779411e-05, + "loss": 0.9947, + "num_tokens": 3765550146.0, + "step": 901 + }, + { + "epoch": 0.10718954248366012, + "grad_norm": 0.68473263750844, + "learning_rate": 1.9973879224704578e-05, + "loss": 1.0121, + "num_tokens": 3769740097.0, + "step": 902 + }, + { + "epoch": 0.1073083778966132, + "grad_norm": 0.9149391618736658, + "learning_rate": 1.9973747199281957e-05, + "loss": 0.9552, + "num_tokens": 3773930749.0, + "step": 903 + }, + { + "epoch": 0.10742721330956626, + "grad_norm": 0.7624528507417961, + "learning_rate": 1.9973614841531136e-05, + "loss": 0.9428, + "num_tokens": 3778120493.0, + "step": 904 + }, + { + "epoch": 0.10754604872251931, + "grad_norm": 0.7341903596461895, + "learning_rate": 1.9973482151457015e-05, + "loss": 0.9766, + "num_tokens": 3782308612.0, + "step": 905 + }, + { + "epoch": 0.10766488413547237, + "grad_norm": 0.8377639853027621, + "learning_rate": 1.9973349129064512e-05, + "loss": 0.9743, + "num_tokens": 3786487673.0, + "step": 906 + }, + { + "epoch": 0.10778371954842543, + "grad_norm": 0.7649815751417852, + "learning_rate": 1.9973215774358555e-05, + "loss": 0.9838, + "num_tokens": 3790663484.0, + "step": 907 + }, + { + "epoch": 0.10790255496137849, + "grad_norm": 0.7343574698797452, + "learning_rate": 1.997308208734407e-05, + "loss": 0.9544, + "num_tokens": 3794852143.0, + "step": 908 + }, + { + "epoch": 0.10802139037433155, + "grad_norm": 0.9441068812048246, + "learning_rate": 1.9972948068026024e-05, + "loss": 0.9736, + "num_tokens": 3799041628.0, + "step": 909 + }, + { + "epoch": 0.1081402257872846, + "grad_norm": 0.6397081110236696, + "learning_rate": 1.9972813716409366e-05, + "loss": 0.9977, + "num_tokens": 3803212841.0, + "step": 910 + }, + { + "epoch": 0.10825906120023768, + "grad_norm": 0.9264015772351539, + "learning_rate": 1.9972679032499084e-05, + "loss": 0.9501, + "num_tokens": 3807378225.0, + "step": 911 + }, + { + "epoch": 0.10837789661319074, + "grad_norm": 0.8028388827011983, + "learning_rate": 1.9972544016300156e-05, + "loss": 0.9643, + "num_tokens": 3811566657.0, + "step": 912 + }, + { + "epoch": 0.1084967320261438, + "grad_norm": 0.8030199355911952, + "learning_rate": 1.9972408667817595e-05, + "loss": 0.9266, + "num_tokens": 3815749302.0, + "step": 913 + }, + { + "epoch": 0.10861556743909685, + "grad_norm": 0.9691375795713126, + "learning_rate": 1.9972272987056393e-05, + "loss": 0.9879, + "num_tokens": 3819939554.0, + "step": 914 + }, + { + "epoch": 0.10873440285204991, + "grad_norm": 0.6285297870805705, + "learning_rate": 1.9972136974021594e-05, + "loss": 0.9638, + "num_tokens": 3824099778.0, + "step": 915 + }, + { + "epoch": 0.10885323826500297, + "grad_norm": 0.8847843097365771, + "learning_rate": 1.9972000628718226e-05, + "loss": 0.9908, + "num_tokens": 3828264242.0, + "step": 916 + }, + { + "epoch": 0.10897207367795603, + "grad_norm": 0.7616915489047769, + "learning_rate": 1.9971863951151335e-05, + "loss": 0.9794, + "num_tokens": 3832446513.0, + "step": 917 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 0.8534606491344399, + "learning_rate": 1.997172694132599e-05, + "loss": 0.9555, + "num_tokens": 3836627956.0, + "step": 918 + }, + { + "epoch": 0.10920974450386216, + "grad_norm": 0.7263546469671229, + "learning_rate": 1.9971589599247265e-05, + "loss": 0.9932, + "num_tokens": 3840817671.0, + "step": 919 + }, + { + "epoch": 0.10932857991681522, + "grad_norm": 0.78321222502943, + "learning_rate": 1.997145192492024e-05, + "loss": 0.9799, + "num_tokens": 3844971728.0, + "step": 920 + }, + { + "epoch": 0.10944741532976827, + "grad_norm": 0.8414530482305401, + "learning_rate": 1.9971313918350016e-05, + "loss": 0.951, + "num_tokens": 3849160063.0, + "step": 921 + }, + { + "epoch": 0.10956625074272133, + "grad_norm": 0.7829564120819275, + "learning_rate": 1.997117557954171e-05, + "loss": 0.9355, + "num_tokens": 3853341339.0, + "step": 922 + }, + { + "epoch": 0.10968508615567439, + "grad_norm": 0.8886783108278432, + "learning_rate": 1.9971036908500435e-05, + "loss": 0.9436, + "num_tokens": 3857531557.0, + "step": 923 + }, + { + "epoch": 0.10980392156862745, + "grad_norm": 0.7198803717804935, + "learning_rate": 1.9970897905231338e-05, + "loss": 0.9383, + "num_tokens": 3861694362.0, + "step": 924 + }, + { + "epoch": 0.10992275698158051, + "grad_norm": 0.6806407385140952, + "learning_rate": 1.997075856973955e-05, + "loss": 0.984, + "num_tokens": 3865883073.0, + "step": 925 + }, + { + "epoch": 0.11004159239453357, + "grad_norm": 0.9617221374441172, + "learning_rate": 1.997061890203025e-05, + "loss": 0.9442, + "num_tokens": 3870072271.0, + "step": 926 + }, + { + "epoch": 0.11016042780748662, + "grad_norm": 0.6562584103120765, + "learning_rate": 1.99704789021086e-05, + "loss": 0.9802, + "num_tokens": 3874249290.0, + "step": 927 + }, + { + "epoch": 0.1102792632204397, + "grad_norm": 0.8197125675432281, + "learning_rate": 1.9970338569979785e-05, + "loss": 0.9708, + "num_tokens": 3878439026.0, + "step": 928 + }, + { + "epoch": 0.11039809863339275, + "grad_norm": 0.7456803896718919, + "learning_rate": 1.9970197905649002e-05, + "loss": 0.9677, + "num_tokens": 3882625850.0, + "step": 929 + }, + { + "epoch": 0.11051693404634581, + "grad_norm": 0.9574191047821067, + "learning_rate": 1.9970056909121465e-05, + "loss": 0.9588, + "num_tokens": 3886805241.0, + "step": 930 + }, + { + "epoch": 0.11063576945929887, + "grad_norm": 0.7329682122657992, + "learning_rate": 1.996991558040239e-05, + "loss": 0.9616, + "num_tokens": 3890945268.0, + "step": 931 + }, + { + "epoch": 0.11075460487225193, + "grad_norm": 0.7469920964640284, + "learning_rate": 1.9969773919497015e-05, + "loss": 0.9861, + "num_tokens": 3895123473.0, + "step": 932 + }, + { + "epoch": 0.11087344028520499, + "grad_norm": 0.8015970125373503, + "learning_rate": 1.996963192641058e-05, + "loss": 0.9929, + "num_tokens": 3899286250.0, + "step": 933 + }, + { + "epoch": 0.11099227569815805, + "grad_norm": 0.6973295230962487, + "learning_rate": 1.9969489601148352e-05, + "loss": 0.9411, + "num_tokens": 3903476459.0, + "step": 934 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.8309077632697411, + "learning_rate": 1.9969346943715596e-05, + "loss": 0.9701, + "num_tokens": 3907645985.0, + "step": 935 + }, + { + "epoch": 0.11122994652406418, + "grad_norm": 0.7458705656542112, + "learning_rate": 1.9969203954117597e-05, + "loss": 0.9593, + "num_tokens": 3911831930.0, + "step": 936 + }, + { + "epoch": 0.11134878193701724, + "grad_norm": 0.8327030427557065, + "learning_rate": 1.9969060632359645e-05, + "loss": 0.9656, + "num_tokens": 3916020915.0, + "step": 937 + }, + { + "epoch": 0.1114676173499703, + "grad_norm": 0.8085546355222615, + "learning_rate": 1.9968916978447058e-05, + "loss": 0.9633, + "num_tokens": 3920210064.0, + "step": 938 + }, + { + "epoch": 0.11158645276292335, + "grad_norm": 0.9391904172034269, + "learning_rate": 1.9968772992385154e-05, + "loss": 0.9613, + "num_tokens": 3924391975.0, + "step": 939 + }, + { + "epoch": 0.11170528817587641, + "grad_norm": 0.7200108157867211, + "learning_rate": 1.9968628674179256e-05, + "loss": 0.9685, + "num_tokens": 3928567618.0, + "step": 940 + }, + { + "epoch": 0.11182412358882947, + "grad_norm": 0.8127068277980082, + "learning_rate": 1.9968484023834714e-05, + "loss": 0.9924, + "num_tokens": 3932756216.0, + "step": 941 + }, + { + "epoch": 0.11194295900178253, + "grad_norm": 0.7703767068077036, + "learning_rate": 1.996833904135689e-05, + "loss": 0.9666, + "num_tokens": 3936937310.0, + "step": 942 + }, + { + "epoch": 0.11206179441473559, + "grad_norm": 0.8052375918649717, + "learning_rate": 1.9968193726751145e-05, + "loss": 0.9805, + "num_tokens": 3941087103.0, + "step": 943 + }, + { + "epoch": 0.11218062982768866, + "grad_norm": 0.8271367022665789, + "learning_rate": 1.9968048080022866e-05, + "loss": 0.9587, + "num_tokens": 3945276757.0, + "step": 944 + }, + { + "epoch": 0.11229946524064172, + "grad_norm": 0.8318835540876806, + "learning_rate": 1.9967902101177443e-05, + "loss": 0.977, + "num_tokens": 3949467351.0, + "step": 945 + }, + { + "epoch": 0.11241830065359477, + "grad_norm": 0.7462639606705672, + "learning_rate": 1.9967755790220288e-05, + "loss": 1.002, + "num_tokens": 3953654886.0, + "step": 946 + }, + { + "epoch": 0.11253713606654783, + "grad_norm": 0.7296435013589743, + "learning_rate": 1.9967609147156812e-05, + "loss": 0.9648, + "num_tokens": 3957843252.0, + "step": 947 + }, + { + "epoch": 0.11265597147950089, + "grad_norm": 0.7667204900839878, + "learning_rate": 1.996746217199245e-05, + "loss": 0.9595, + "num_tokens": 3962018726.0, + "step": 948 + }, + { + "epoch": 0.11277480689245395, + "grad_norm": 0.7814814196052406, + "learning_rate": 1.9967314864732642e-05, + "loss": 0.9295, + "num_tokens": 3966183131.0, + "step": 949 + }, + { + "epoch": 0.11289364230540701, + "grad_norm": 0.9672426689256659, + "learning_rate": 1.9967167225382847e-05, + "loss": 0.9749, + "num_tokens": 3970359878.0, + "step": 950 + }, + { + "epoch": 0.11301247771836007, + "grad_norm": 0.6849657170882447, + "learning_rate": 1.996701925394853e-05, + "loss": 0.925, + "num_tokens": 3974550367.0, + "step": 951 + }, + { + "epoch": 0.11313131313131314, + "grad_norm": 0.8036915320035362, + "learning_rate": 1.9966870950435177e-05, + "loss": 0.9854, + "num_tokens": 3978739786.0, + "step": 952 + }, + { + "epoch": 0.1132501485442662, + "grad_norm": 0.8347651671025945, + "learning_rate": 1.996672231484827e-05, + "loss": 0.9662, + "num_tokens": 3982928725.0, + "step": 953 + }, + { + "epoch": 0.11336898395721925, + "grad_norm": 0.8298126429850338, + "learning_rate": 1.996657334719332e-05, + "loss": 0.9842, + "num_tokens": 3987118377.0, + "step": 954 + }, + { + "epoch": 0.11348781937017231, + "grad_norm": 0.7499711220542805, + "learning_rate": 1.9966424047475842e-05, + "loss": 0.9719, + "num_tokens": 3991308220.0, + "step": 955 + }, + { + "epoch": 0.11360665478312537, + "grad_norm": 0.875355691300183, + "learning_rate": 1.9966274415701366e-05, + "loss": 0.9744, + "num_tokens": 3995495956.0, + "step": 956 + }, + { + "epoch": 0.11372549019607843, + "grad_norm": 0.874251070008103, + "learning_rate": 1.996612445187543e-05, + "loss": 0.9539, + "num_tokens": 3999685154.0, + "step": 957 + }, + { + "epoch": 0.11384432560903149, + "grad_norm": 0.7500934341886796, + "learning_rate": 1.996597415600359e-05, + "loss": 0.9599, + "num_tokens": 4003855357.0, + "step": 958 + }, + { + "epoch": 0.11396316102198455, + "grad_norm": 0.7134074890601336, + "learning_rate": 1.9965823528091416e-05, + "loss": 0.9845, + "num_tokens": 4008044916.0, + "step": 959 + }, + { + "epoch": 0.1140819964349376, + "grad_norm": 0.9104476955760792, + "learning_rate": 1.996567256814448e-05, + "loss": 0.9188, + "num_tokens": 4012233578.0, + "step": 960 + }, + { + "epoch": 0.11420083184789068, + "grad_norm": 0.712892076946917, + "learning_rate": 1.996552127616838e-05, + "loss": 0.964, + "num_tokens": 4016416099.0, + "step": 961 + }, + { + "epoch": 0.11431966726084374, + "grad_norm": 0.8670668710240336, + "learning_rate": 1.9965369652168707e-05, + "loss": 0.9816, + "num_tokens": 4020604402.0, + "step": 962 + }, + { + "epoch": 0.1144385026737968, + "grad_norm": 0.8781829784049378, + "learning_rate": 1.9965217696151087e-05, + "loss": 0.9897, + "num_tokens": 4024793137.0, + "step": 963 + }, + { + "epoch": 0.11455733808674985, + "grad_norm": 0.7893211751950855, + "learning_rate": 1.9965065408121142e-05, + "loss": 0.9658, + "num_tokens": 4028960087.0, + "step": 964 + }, + { + "epoch": 0.11467617349970291, + "grad_norm": 0.7138215730480706, + "learning_rate": 1.9964912788084515e-05, + "loss": 0.9836, + "num_tokens": 4033130827.0, + "step": 965 + }, + { + "epoch": 0.11479500891265597, + "grad_norm": 0.7590034935951293, + "learning_rate": 1.9964759836046856e-05, + "loss": 0.9448, + "num_tokens": 4037318857.0, + "step": 966 + }, + { + "epoch": 0.11491384432560903, + "grad_norm": 0.8442999638237617, + "learning_rate": 1.996460655201383e-05, + "loss": 0.9388, + "num_tokens": 4041479339.0, + "step": 967 + }, + { + "epoch": 0.11503267973856209, + "grad_norm": 0.7046005166764598, + "learning_rate": 1.996445293599111e-05, + "loss": 0.9246, + "num_tokens": 4045642079.0, + "step": 968 + }, + { + "epoch": 0.11515151515151516, + "grad_norm": 0.760946848677576, + "learning_rate": 1.9964298987984392e-05, + "loss": 0.9638, + "num_tokens": 4049830915.0, + "step": 969 + }, + { + "epoch": 0.11527035056446822, + "grad_norm": 0.817003084429784, + "learning_rate": 1.9964144707999375e-05, + "loss": 0.9767, + "num_tokens": 4054020714.0, + "step": 970 + }, + { + "epoch": 0.11538918597742127, + "grad_norm": 0.7786689903977406, + "learning_rate": 1.9963990096041762e-05, + "loss": 0.9883, + "num_tokens": 4058210143.0, + "step": 971 + }, + { + "epoch": 0.11550802139037433, + "grad_norm": 0.7873097987922436, + "learning_rate": 1.9963835152117298e-05, + "loss": 1.0115, + "num_tokens": 4062377727.0, + "step": 972 + }, + { + "epoch": 0.11562685680332739, + "grad_norm": 0.7220714896858094, + "learning_rate": 1.9963679876231707e-05, + "loss": 0.9502, + "num_tokens": 4066566219.0, + "step": 973 + }, + { + "epoch": 0.11574569221628045, + "grad_norm": 0.8791044631525857, + "learning_rate": 1.9963524268390743e-05, + "loss": 0.9996, + "num_tokens": 4070756208.0, + "step": 974 + }, + { + "epoch": 0.11586452762923351, + "grad_norm": 0.8121957057047376, + "learning_rate": 1.996336832860017e-05, + "loss": 0.9911, + "num_tokens": 4074944621.0, + "step": 975 + }, + { + "epoch": 0.11598336304218657, + "grad_norm": 0.7330528524209493, + "learning_rate": 1.996321205686576e-05, + "loss": 0.9736, + "num_tokens": 4079116667.0, + "step": 976 + }, + { + "epoch": 0.11610219845513964, + "grad_norm": 0.9730650640362494, + "learning_rate": 1.9963055453193305e-05, + "loss": 0.9494, + "num_tokens": 4083299885.0, + "step": 977 + }, + { + "epoch": 0.1162210338680927, + "grad_norm": 0.594289533577467, + "learning_rate": 1.9962898517588596e-05, + "loss": 0.9229, + "num_tokens": 4087441246.0, + "step": 978 + }, + { + "epoch": 0.11633986928104575, + "grad_norm": 1.0365218795557039, + "learning_rate": 1.9962741250057455e-05, + "loss": 0.9625, + "num_tokens": 4091603171.0, + "step": 979 + }, + { + "epoch": 0.11645870469399881, + "grad_norm": 0.7107202720300704, + "learning_rate": 1.9962583650605703e-05, + "loss": 0.9205, + "num_tokens": 4095792771.0, + "step": 980 + }, + { + "epoch": 0.11657754010695187, + "grad_norm": 0.977045830931995, + "learning_rate": 1.9962425719239174e-05, + "loss": 0.9688, + "num_tokens": 4099981739.0, + "step": 981 + }, + { + "epoch": 0.11669637551990493, + "grad_norm": 0.5994828720338742, + "learning_rate": 1.9962267455963717e-05, + "loss": 0.9167, + "num_tokens": 4104142018.0, + "step": 982 + }, + { + "epoch": 0.11681521093285799, + "grad_norm": 0.8568453605459725, + "learning_rate": 1.9962108860785193e-05, + "loss": 0.9923, + "num_tokens": 4108331181.0, + "step": 983 + }, + { + "epoch": 0.11693404634581105, + "grad_norm": 0.6910300722580484, + "learning_rate": 1.996194993370948e-05, + "loss": 0.9678, + "num_tokens": 4112519871.0, + "step": 984 + }, + { + "epoch": 0.11705288175876412, + "grad_norm": 0.8808892825565989, + "learning_rate": 1.9961790674742456e-05, + "loss": 0.9278, + "num_tokens": 4116710903.0, + "step": 985 + }, + { + "epoch": 0.11717171717171718, + "grad_norm": 0.7479469296928194, + "learning_rate": 1.9961631083890025e-05, + "loss": 0.9888, + "num_tokens": 4120875440.0, + "step": 986 + }, + { + "epoch": 0.11729055258467024, + "grad_norm": 0.8847736709766341, + "learning_rate": 1.9961471161158094e-05, + "loss": 0.9605, + "num_tokens": 4125065120.0, + "step": 987 + }, + { + "epoch": 0.1174093879976233, + "grad_norm": 0.6968248884869398, + "learning_rate": 1.996131090655259e-05, + "loss": 0.9621, + "num_tokens": 4129256140.0, + "step": 988 + }, + { + "epoch": 0.11752822341057635, + "grad_norm": 0.7409325227566183, + "learning_rate": 1.996115032007944e-05, + "loss": 0.9815, + "num_tokens": 4133424455.0, + "step": 989 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.8656555488167591, + "learning_rate": 1.9960989401744597e-05, + "loss": 0.9484, + "num_tokens": 4137592519.0, + "step": 990 + }, + { + "epoch": 0.11776589423648247, + "grad_norm": 0.7238342430479998, + "learning_rate": 1.996082815155402e-05, + "loss": 0.9619, + "num_tokens": 4141779897.0, + "step": 991 + }, + { + "epoch": 0.11788472964943553, + "grad_norm": 0.6589688817930367, + "learning_rate": 1.9960666569513675e-05, + "loss": 0.9883, + "num_tokens": 4145946173.0, + "step": 992 + }, + { + "epoch": 0.11800356506238859, + "grad_norm": 0.7843063689513585, + "learning_rate": 1.996050465562955e-05, + "loss": 0.9907, + "num_tokens": 4150127054.0, + "step": 993 + }, + { + "epoch": 0.11812240047534166, + "grad_norm": 0.8317500314047154, + "learning_rate": 1.9960342409907643e-05, + "loss": 0.9608, + "num_tokens": 4154315149.0, + "step": 994 + }, + { + "epoch": 0.11824123588829472, + "grad_norm": 0.7056526229500479, + "learning_rate": 1.9960179832353963e-05, + "loss": 0.9607, + "num_tokens": 4158504591.0, + "step": 995 + }, + { + "epoch": 0.11836007130124777, + "grad_norm": 0.8601992796759685, + "learning_rate": 1.996001692297453e-05, + "loss": 0.9507, + "num_tokens": 4162692931.0, + "step": 996 + }, + { + "epoch": 0.11847890671420083, + "grad_norm": 0.6899559509305775, + "learning_rate": 1.9959853681775372e-05, + "loss": 0.9526, + "num_tokens": 4166859374.0, + "step": 997 + }, + { + "epoch": 0.11859774212715389, + "grad_norm": 0.7755880936388657, + "learning_rate": 1.9959690108762538e-05, + "loss": 0.9402, + "num_tokens": 4171048176.0, + "step": 998 + }, + { + "epoch": 0.11871657754010695, + "grad_norm": 0.7343683644168912, + "learning_rate": 1.9959526203942087e-05, + "loss": 0.9564, + "num_tokens": 4175237873.0, + "step": 999 + }, + { + "epoch": 0.11883541295306001, + "grad_norm": 0.749992692097283, + "learning_rate": 1.995936196732009e-05, + "loss": 0.9768, + "num_tokens": 4179408949.0, + "step": 1000 + }, + { + "epoch": 0.11895424836601307, + "grad_norm": 0.8543481072165449, + "learning_rate": 1.9959197398902627e-05, + "loss": 0.9417, + "num_tokens": 4183598452.0, + "step": 1001 + }, + { + "epoch": 0.11907308377896614, + "grad_norm": 0.7606471722116287, + "learning_rate": 1.9959032498695788e-05, + "loss": 1.0122, + "num_tokens": 4187787029.0, + "step": 1002 + }, + { + "epoch": 0.1191919191919192, + "grad_norm": 0.691017935997518, + "learning_rate": 1.9958867266705687e-05, + "loss": 0.9781, + "num_tokens": 4191977141.0, + "step": 1003 + }, + { + "epoch": 0.11931075460487225, + "grad_norm": 0.8414353030647412, + "learning_rate": 1.9958701702938442e-05, + "loss": 0.9668, + "num_tokens": 4196167057.0, + "step": 1004 + }, + { + "epoch": 0.11942959001782531, + "grad_norm": 0.852582111249189, + "learning_rate": 1.9958535807400182e-05, + "loss": 1.0117, + "num_tokens": 4200340316.0, + "step": 1005 + }, + { + "epoch": 0.11954842543077837, + "grad_norm": 0.7758480202367903, + "learning_rate": 1.9958369580097047e-05, + "loss": 0.9564, + "num_tokens": 4204509750.0, + "step": 1006 + }, + { + "epoch": 0.11966726084373143, + "grad_norm": 0.8403504639107541, + "learning_rate": 1.9958203021035202e-05, + "loss": 0.9734, + "num_tokens": 4208699258.0, + "step": 1007 + }, + { + "epoch": 0.11978609625668449, + "grad_norm": 0.685699668384415, + "learning_rate": 1.9958036130220812e-05, + "loss": 0.9603, + "num_tokens": 4212889614.0, + "step": 1008 + }, + { + "epoch": 0.11990493166963755, + "grad_norm": 0.7988890327800512, + "learning_rate": 1.9957868907660055e-05, + "loss": 0.9655, + "num_tokens": 4217078709.0, + "step": 1009 + }, + { + "epoch": 0.12002376708259062, + "grad_norm": 0.7149188824239007, + "learning_rate": 1.9957701353359123e-05, + "loss": 0.9486, + "num_tokens": 4221266729.0, + "step": 1010 + }, + { + "epoch": 0.12014260249554368, + "grad_norm": 0.734270739977134, + "learning_rate": 1.9957533467324225e-05, + "loss": 0.928, + "num_tokens": 4225457465.0, + "step": 1011 + }, + { + "epoch": 0.12026143790849673, + "grad_norm": 0.8494191092641942, + "learning_rate": 1.9957365249561575e-05, + "loss": 0.9466, + "num_tokens": 4229645338.0, + "step": 1012 + }, + { + "epoch": 0.1203802733214498, + "grad_norm": 0.7812816355711585, + "learning_rate": 1.9957196700077405e-05, + "loss": 0.9326, + "num_tokens": 4233834712.0, + "step": 1013 + }, + { + "epoch": 0.12049910873440285, + "grad_norm": 0.764763827036271, + "learning_rate": 1.9957027818877958e-05, + "loss": 0.9438, + "num_tokens": 4238024607.0, + "step": 1014 + }, + { + "epoch": 0.12061794414735591, + "grad_norm": 0.7705853196046942, + "learning_rate": 1.9956858605969485e-05, + "loss": 0.9691, + "num_tokens": 4242197140.0, + "step": 1015 + }, + { + "epoch": 0.12073677956030897, + "grad_norm": 0.8653582779835822, + "learning_rate": 1.9956689061358255e-05, + "loss": 0.9646, + "num_tokens": 4246356947.0, + "step": 1016 + }, + { + "epoch": 0.12085561497326203, + "grad_norm": 0.6828327188563826, + "learning_rate": 1.9956519185050546e-05, + "loss": 0.9523, + "num_tokens": 4250546039.0, + "step": 1017 + }, + { + "epoch": 0.1209744503862151, + "grad_norm": 0.9224872205221977, + "learning_rate": 1.9956348977052646e-05, + "loss": 0.9682, + "num_tokens": 4254731811.0, + "step": 1018 + }, + { + "epoch": 0.12109328579916816, + "grad_norm": 0.6603068969995866, + "learning_rate": 1.9956178437370863e-05, + "loss": 0.9813, + "num_tokens": 4258904792.0, + "step": 1019 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 0.7674739907129744, + "learning_rate": 1.995600756601151e-05, + "loss": 0.9395, + "num_tokens": 4263094845.0, + "step": 1020 + }, + { + "epoch": 0.12133095662507427, + "grad_norm": 0.7282617018212485, + "learning_rate": 1.9955836362980918e-05, + "loss": 0.9839, + "num_tokens": 4267272331.0, + "step": 1021 + }, + { + "epoch": 0.12144979203802733, + "grad_norm": 0.7419242856076329, + "learning_rate": 1.9955664828285422e-05, + "loss": 0.9904, + "num_tokens": 4271458213.0, + "step": 1022 + }, + { + "epoch": 0.12156862745098039, + "grad_norm": 0.6780789196554964, + "learning_rate": 1.995549296193138e-05, + "loss": 0.977, + "num_tokens": 4275647431.0, + "step": 1023 + }, + { + "epoch": 0.12168746286393345, + "grad_norm": 0.9432238085775989, + "learning_rate": 1.9955320763925158e-05, + "loss": 0.9409, + "num_tokens": 4279836785.0, + "step": 1024 + }, + { + "epoch": 0.12180629827688651, + "grad_norm": 0.7565437974989146, + "learning_rate": 1.9955148234273124e-05, + "loss": 0.97, + "num_tokens": 4284003225.0, + "step": 1025 + }, + { + "epoch": 0.12192513368983957, + "grad_norm": 0.8045069811260114, + "learning_rate": 1.995497537298167e-05, + "loss": 0.9647, + "num_tokens": 4288094173.0, + "step": 1026 + }, + { + "epoch": 0.12204396910279264, + "grad_norm": 0.7351784894932966, + "learning_rate": 1.995480218005721e-05, + "loss": 0.9659, + "num_tokens": 4292282938.0, + "step": 1027 + }, + { + "epoch": 0.1221628045157457, + "grad_norm": 0.7492735282254978, + "learning_rate": 1.995462865550614e-05, + "loss": 0.9662, + "num_tokens": 4296472581.0, + "step": 1028 + }, + { + "epoch": 0.12228163992869875, + "grad_norm": 0.7081859879327967, + "learning_rate": 1.9954454799334897e-05, + "loss": 0.9492, + "num_tokens": 4300663795.0, + "step": 1029 + }, + { + "epoch": 0.12240047534165181, + "grad_norm": 0.7931794918044576, + "learning_rate": 1.9954280611549916e-05, + "loss": 0.9691, + "num_tokens": 4304832918.0, + "step": 1030 + }, + { + "epoch": 0.12251931075460487, + "grad_norm": 0.787066628878619, + "learning_rate": 1.995410609215765e-05, + "loss": 0.9611, + "num_tokens": 4309022472.0, + "step": 1031 + }, + { + "epoch": 0.12263814616755793, + "grad_norm": 0.7872317750230585, + "learning_rate": 1.9953931241164565e-05, + "loss": 0.9905, + "num_tokens": 4313180023.0, + "step": 1032 + }, + { + "epoch": 0.12275698158051099, + "grad_norm": 0.7597513836198712, + "learning_rate": 1.9953756058577128e-05, + "loss": 0.9799, + "num_tokens": 4317370248.0, + "step": 1033 + }, + { + "epoch": 0.12287581699346405, + "grad_norm": 0.7391355081657808, + "learning_rate": 1.9953580544401828e-05, + "loss": 0.947, + "num_tokens": 4321558702.0, + "step": 1034 + }, + { + "epoch": 0.12299465240641712, + "grad_norm": 0.845057434098796, + "learning_rate": 1.9953404698645174e-05, + "loss": 0.9725, + "num_tokens": 4325738318.0, + "step": 1035 + }, + { + "epoch": 0.12311348781937018, + "grad_norm": 0.67549279861666, + "learning_rate": 1.9953228521313665e-05, + "loss": 0.959, + "num_tokens": 4329892618.0, + "step": 1036 + }, + { + "epoch": 0.12323232323232323, + "grad_norm": 0.8099772711352445, + "learning_rate": 1.995305201241384e-05, + "loss": 0.9726, + "num_tokens": 4334048308.0, + "step": 1037 + }, + { + "epoch": 0.1233511586452763, + "grad_norm": 0.7489101834018272, + "learning_rate": 1.9952875171952223e-05, + "loss": 0.9379, + "num_tokens": 4338209816.0, + "step": 1038 + }, + { + "epoch": 0.12346999405822935, + "grad_norm": 0.7074285360092183, + "learning_rate": 1.9952697999935365e-05, + "loss": 0.9938, + "num_tokens": 4342400310.0, + "step": 1039 + }, + { + "epoch": 0.12358882947118241, + "grad_norm": 0.821727552886169, + "learning_rate": 1.9952520496369836e-05, + "loss": 0.9568, + "num_tokens": 4346559693.0, + "step": 1040 + }, + { + "epoch": 0.12370766488413547, + "grad_norm": 0.6419833429321541, + "learning_rate": 1.99523426612622e-05, + "loss": 0.9926, + "num_tokens": 4350744544.0, + "step": 1041 + }, + { + "epoch": 0.12382650029708853, + "grad_norm": 0.9198353096851581, + "learning_rate": 1.9952164494619052e-05, + "loss": 0.9422, + "num_tokens": 4354910028.0, + "step": 1042 + }, + { + "epoch": 0.1239453357100416, + "grad_norm": 0.7769017061101192, + "learning_rate": 1.9951985996446978e-05, + "loss": 0.9857, + "num_tokens": 4359098069.0, + "step": 1043 + }, + { + "epoch": 0.12406417112299466, + "grad_norm": 0.8473973469158077, + "learning_rate": 1.99518071667526e-05, + "loss": 1.0113, + "num_tokens": 4363280595.0, + "step": 1044 + }, + { + "epoch": 0.12418300653594772, + "grad_norm": 0.6419151539872928, + "learning_rate": 1.9951628005542537e-05, + "loss": 0.9618, + "num_tokens": 4367466823.0, + "step": 1045 + }, + { + "epoch": 0.12430184194890077, + "grad_norm": 0.8084078609681029, + "learning_rate": 1.9951448512823418e-05, + "loss": 0.9694, + "num_tokens": 4371620778.0, + "step": 1046 + }, + { + "epoch": 0.12442067736185383, + "grad_norm": 0.733803605059491, + "learning_rate": 1.9951268688601898e-05, + "loss": 0.9552, + "num_tokens": 4375810150.0, + "step": 1047 + }, + { + "epoch": 0.12453951277480689, + "grad_norm": 0.767701189220976, + "learning_rate": 1.9951088532884633e-05, + "loss": 0.9519, + "num_tokens": 4380000176.0, + "step": 1048 + }, + { + "epoch": 0.12465834818775995, + "grad_norm": 0.7138845769727306, + "learning_rate": 1.9950908045678296e-05, + "loss": 0.9759, + "num_tokens": 4384188564.0, + "step": 1049 + }, + { + "epoch": 0.12477718360071301, + "grad_norm": 0.8228538938691916, + "learning_rate": 1.995072722698957e-05, + "loss": 0.9633, + "num_tokens": 4388378583.0, + "step": 1050 + }, + { + "epoch": 0.12489601901366608, + "grad_norm": 0.6109850839266098, + "learning_rate": 1.9950546076825147e-05, + "loss": 0.9516, + "num_tokens": 4392567380.0, + "step": 1051 + }, + { + "epoch": 0.12501485442661914, + "grad_norm": 0.9273278384744392, + "learning_rate": 1.9950364595191745e-05, + "loss": 0.9927, + "num_tokens": 4396756418.0, + "step": 1052 + }, + { + "epoch": 0.12513368983957218, + "grad_norm": 0.7549372481015637, + "learning_rate": 1.9950182782096083e-05, + "loss": 1.0022, + "num_tokens": 4400886228.0, + "step": 1053 + }, + { + "epoch": 0.12525252525252525, + "grad_norm": 0.7742673661122809, + "learning_rate": 1.9950000637544885e-05, + "loss": 0.9723, + "num_tokens": 4405076476.0, + "step": 1054 + }, + { + "epoch": 0.12537136066547833, + "grad_norm": 0.7282866698539435, + "learning_rate": 1.9949818161544907e-05, + "loss": 1.0004, + "num_tokens": 4409242207.0, + "step": 1055 + }, + { + "epoch": 0.12549019607843137, + "grad_norm": 0.7775765960974138, + "learning_rate": 1.9949635354102898e-05, + "loss": 0.9515, + "num_tokens": 4413430931.0, + "step": 1056 + }, + { + "epoch": 0.12560903149138444, + "grad_norm": 0.661340116163063, + "learning_rate": 1.994945221522564e-05, + "loss": 0.9703, + "num_tokens": 4417612903.0, + "step": 1057 + }, + { + "epoch": 0.1257278669043375, + "grad_norm": 0.827087358286616, + "learning_rate": 1.9949268744919898e-05, + "loss": 0.9432, + "num_tokens": 4421790498.0, + "step": 1058 + }, + { + "epoch": 0.12584670231729056, + "grad_norm": 0.6490525085064067, + "learning_rate": 1.9949084943192483e-05, + "loss": 0.9592, + "num_tokens": 4425979726.0, + "step": 1059 + }, + { + "epoch": 0.1259655377302436, + "grad_norm": 0.8784405363560835, + "learning_rate": 1.9948900810050193e-05, + "loss": 0.9669, + "num_tokens": 4430151801.0, + "step": 1060 + }, + { + "epoch": 0.12608437314319668, + "grad_norm": 0.6705542167702402, + "learning_rate": 1.9948716345499848e-05, + "loss": 0.9501, + "num_tokens": 4434328345.0, + "step": 1061 + }, + { + "epoch": 0.12620320855614972, + "grad_norm": 0.9073897457578592, + "learning_rate": 1.9948531549548283e-05, + "loss": 0.9856, + "num_tokens": 4438517636.0, + "step": 1062 + }, + { + "epoch": 0.1263220439691028, + "grad_norm": 0.6313103611886339, + "learning_rate": 1.9948346422202335e-05, + "loss": 0.9667, + "num_tokens": 4442707368.0, + "step": 1063 + }, + { + "epoch": 0.12644087938205587, + "grad_norm": 0.8063923130520053, + "learning_rate": 1.9948160963468865e-05, + "loss": 0.9657, + "num_tokens": 4446896175.0, + "step": 1064 + }, + { + "epoch": 0.1265597147950089, + "grad_norm": 0.7649811003603758, + "learning_rate": 1.9947975173354747e-05, + "loss": 0.9467, + "num_tokens": 4451052637.0, + "step": 1065 + }, + { + "epoch": 0.12667855020796198, + "grad_norm": 0.7604609744898873, + "learning_rate": 1.9947789051866845e-05, + "loss": 0.9476, + "num_tokens": 4455241371.0, + "step": 1066 + }, + { + "epoch": 0.12679738562091503, + "grad_norm": 0.8073750736662679, + "learning_rate": 1.9947602599012065e-05, + "loss": 0.9276, + "num_tokens": 4459403005.0, + "step": 1067 + }, + { + "epoch": 0.1269162210338681, + "grad_norm": 0.7131400697577688, + "learning_rate": 1.994741581479731e-05, + "loss": 0.9685, + "num_tokens": 4463584125.0, + "step": 1068 + }, + { + "epoch": 0.12703505644682114, + "grad_norm": 0.6799155690714408, + "learning_rate": 1.9947228699229494e-05, + "loss": 0.9723, + "num_tokens": 4467774274.0, + "step": 1069 + }, + { + "epoch": 0.12715389185977422, + "grad_norm": 0.8180131895505602, + "learning_rate": 1.9947041252315546e-05, + "loss": 0.9604, + "num_tokens": 4471963224.0, + "step": 1070 + }, + { + "epoch": 0.12727272727272726, + "grad_norm": 0.612647616515707, + "learning_rate": 1.9946853474062414e-05, + "loss": 0.9633, + "num_tokens": 4476151944.0, + "step": 1071 + }, + { + "epoch": 0.12739156268568033, + "grad_norm": 0.8443090082060254, + "learning_rate": 1.994666536447704e-05, + "loss": 0.989, + "num_tokens": 4480341872.0, + "step": 1072 + }, + { + "epoch": 0.1275103980986334, + "grad_norm": 0.6118838400469271, + "learning_rate": 1.9946476923566406e-05, + "loss": 0.9702, + "num_tokens": 4484530268.0, + "step": 1073 + }, + { + "epoch": 0.12762923351158645, + "grad_norm": 0.8245492053497347, + "learning_rate": 1.9946288151337482e-05, + "loss": 0.9931, + "num_tokens": 4488718975.0, + "step": 1074 + }, + { + "epoch": 0.12774806892453952, + "grad_norm": 0.7595105188762813, + "learning_rate": 1.994609904779726e-05, + "loss": 0.9363, + "num_tokens": 4492867869.0, + "step": 1075 + }, + { + "epoch": 0.12786690433749257, + "grad_norm": 0.7059872902636963, + "learning_rate": 1.994590961295274e-05, + "loss": 0.9511, + "num_tokens": 4497058980.0, + "step": 1076 + }, + { + "epoch": 0.12798573975044564, + "grad_norm": 0.7264999072256835, + "learning_rate": 1.994571984681094e-05, + "loss": 0.9465, + "num_tokens": 4501219709.0, + "step": 1077 + }, + { + "epoch": 0.12810457516339868, + "grad_norm": 0.7061127300634025, + "learning_rate": 1.9945529749378888e-05, + "loss": 0.9482, + "num_tokens": 4505401209.0, + "step": 1078 + }, + { + "epoch": 0.12822341057635175, + "grad_norm": 0.9387712163925503, + "learning_rate": 1.9945339320663625e-05, + "loss": 0.9439, + "num_tokens": 4509563511.0, + "step": 1079 + }, + { + "epoch": 0.12834224598930483, + "grad_norm": 0.7295186426205216, + "learning_rate": 1.99451485606722e-05, + "loss": 0.9734, + "num_tokens": 4513722969.0, + "step": 1080 + }, + { + "epoch": 0.12846108140225787, + "grad_norm": 0.8388793985656948, + "learning_rate": 1.9944957469411682e-05, + "loss": 0.9464, + "num_tokens": 4517904844.0, + "step": 1081 + }, + { + "epoch": 0.12857991681521094, + "grad_norm": 0.7966113550376187, + "learning_rate": 1.9944766046889145e-05, + "loss": 0.9789, + "num_tokens": 4522093243.0, + "step": 1082 + }, + { + "epoch": 0.128698752228164, + "grad_norm": 0.8165878304357957, + "learning_rate": 1.9944574293111676e-05, + "loss": 0.9526, + "num_tokens": 4526267740.0, + "step": 1083 + }, + { + "epoch": 0.12881758764111706, + "grad_norm": 0.6808636944736596, + "learning_rate": 1.9944382208086382e-05, + "loss": 0.9875, + "num_tokens": 4530457273.0, + "step": 1084 + }, + { + "epoch": 0.1289364230540701, + "grad_norm": 0.8947938896179456, + "learning_rate": 1.9944189791820373e-05, + "loss": 0.9807, + "num_tokens": 4534631446.0, + "step": 1085 + }, + { + "epoch": 0.12905525846702318, + "grad_norm": 0.6648592496333745, + "learning_rate": 1.9943997044320775e-05, + "loss": 0.9762, + "num_tokens": 4538798204.0, + "step": 1086 + }, + { + "epoch": 0.12917409387997622, + "grad_norm": 0.7824855375601201, + "learning_rate": 1.9943803965594722e-05, + "loss": 0.9339, + "num_tokens": 4542963903.0, + "step": 1087 + }, + { + "epoch": 0.1292929292929293, + "grad_norm": 0.693871192466379, + "learning_rate": 1.9943610555649372e-05, + "loss": 0.9777, + "num_tokens": 4547153718.0, + "step": 1088 + }, + { + "epoch": 0.12941176470588237, + "grad_norm": 0.7154002432026141, + "learning_rate": 1.994341681449188e-05, + "loss": 0.9414, + "num_tokens": 4551340677.0, + "step": 1089 + }, + { + "epoch": 0.1295306001188354, + "grad_norm": 0.7352130816014336, + "learning_rate": 1.994322274212943e-05, + "loss": 0.9901, + "num_tokens": 4555530783.0, + "step": 1090 + }, + { + "epoch": 0.12964943553178848, + "grad_norm": 0.7575742611890377, + "learning_rate": 1.9943028338569205e-05, + "loss": 0.9506, + "num_tokens": 4559653367.0, + "step": 1091 + }, + { + "epoch": 0.12976827094474153, + "grad_norm": 0.7347701166568241, + "learning_rate": 1.99428336038184e-05, + "loss": 0.9664, + "num_tokens": 4563842394.0, + "step": 1092 + }, + { + "epoch": 0.1298871063576946, + "grad_norm": 0.7337568818838438, + "learning_rate": 1.994263853788423e-05, + "loss": 0.9895, + "num_tokens": 4568031370.0, + "step": 1093 + }, + { + "epoch": 0.13000594177064764, + "grad_norm": 0.7302569403582692, + "learning_rate": 1.9942443140773923e-05, + "loss": 0.9087, + "num_tokens": 4572195612.0, + "step": 1094 + }, + { + "epoch": 0.13012477718360071, + "grad_norm": 0.6983488478600475, + "learning_rate": 1.9942247412494704e-05, + "loss": 0.9609, + "num_tokens": 4576384189.0, + "step": 1095 + }, + { + "epoch": 0.13024361259655376, + "grad_norm": 0.722105956151292, + "learning_rate": 1.9942051353053835e-05, + "loss": 0.9754, + "num_tokens": 4580571329.0, + "step": 1096 + }, + { + "epoch": 0.13036244800950683, + "grad_norm": 0.8783579618081627, + "learning_rate": 1.9941854962458568e-05, + "loss": 0.989, + "num_tokens": 4584760211.0, + "step": 1097 + }, + { + "epoch": 0.1304812834224599, + "grad_norm": 0.7233461022032468, + "learning_rate": 1.994165824071618e-05, + "loss": 0.9793, + "num_tokens": 4588925034.0, + "step": 1098 + }, + { + "epoch": 0.13060011883541295, + "grad_norm": 0.7387529316257951, + "learning_rate": 1.9941461187833956e-05, + "loss": 0.9644, + "num_tokens": 4593112701.0, + "step": 1099 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 0.7325302036185863, + "learning_rate": 1.994126380381919e-05, + "loss": 0.9643, + "num_tokens": 4597280028.0, + "step": 1100 + }, + { + "epoch": 0.13083778966131906, + "grad_norm": 1.013338455692407, + "learning_rate": 1.9941066088679194e-05, + "loss": 0.9924, + "num_tokens": 4601468991.0, + "step": 1101 + }, + { + "epoch": 0.13095662507427214, + "grad_norm": 0.6758853979852851, + "learning_rate": 1.994086804242129e-05, + "loss": 0.9529, + "num_tokens": 4605657070.0, + "step": 1102 + }, + { + "epoch": 0.13107546048722518, + "grad_norm": 0.8426127074802555, + "learning_rate": 1.9940669665052813e-05, + "loss": 0.966, + "num_tokens": 4609846905.0, + "step": 1103 + }, + { + "epoch": 0.13119429590017825, + "grad_norm": 0.6429356216563714, + "learning_rate": 1.9940470956581112e-05, + "loss": 1.0155, + "num_tokens": 4614036667.0, + "step": 1104 + }, + { + "epoch": 0.13131313131313133, + "grad_norm": 0.9206248370509453, + "learning_rate": 1.9940271917013542e-05, + "loss": 0.9731, + "num_tokens": 4618212956.0, + "step": 1105 + }, + { + "epoch": 0.13143196672608437, + "grad_norm": 0.8094952883193199, + "learning_rate": 1.9940072546357473e-05, + "loss": 0.9607, + "num_tokens": 4622401969.0, + "step": 1106 + }, + { + "epoch": 0.13155080213903744, + "grad_norm": 0.7704574343198645, + "learning_rate": 1.9939872844620295e-05, + "loss": 0.967, + "num_tokens": 4626544320.0, + "step": 1107 + }, + { + "epoch": 0.1316696375519905, + "grad_norm": 0.8340311151050624, + "learning_rate": 1.9939672811809395e-05, + "loss": 0.9995, + "num_tokens": 4630683244.0, + "step": 1108 + }, + { + "epoch": 0.13178847296494356, + "grad_norm": 0.6952856884057941, + "learning_rate": 1.9939472447932186e-05, + "loss": 0.9397, + "num_tokens": 4634873245.0, + "step": 1109 + }, + { + "epoch": 0.1319073083778966, + "grad_norm": 0.8182519737238035, + "learning_rate": 1.993927175299609e-05, + "loss": 0.942, + "num_tokens": 4639057419.0, + "step": 1110 + }, + { + "epoch": 0.13202614379084968, + "grad_norm": 0.6976864623624446, + "learning_rate": 1.9939070727008533e-05, + "loss": 0.9635, + "num_tokens": 4643245363.0, + "step": 1111 + }, + { + "epoch": 0.13214497920380272, + "grad_norm": 0.9356493383912144, + "learning_rate": 1.993886936997696e-05, + "loss": 0.9498, + "num_tokens": 4647404037.0, + "step": 1112 + }, + { + "epoch": 0.1322638146167558, + "grad_norm": 0.6538339342063842, + "learning_rate": 1.9938667681908837e-05, + "loss": 0.9671, + "num_tokens": 4651525551.0, + "step": 1113 + }, + { + "epoch": 0.13238265002970886, + "grad_norm": 0.9038587617141345, + "learning_rate": 1.9938465662811626e-05, + "loss": 0.9933, + "num_tokens": 4655714074.0, + "step": 1114 + }, + { + "epoch": 0.1325014854426619, + "grad_norm": 0.7120064579657113, + "learning_rate": 1.9938263312692808e-05, + "loss": 0.9507, + "num_tokens": 4659903042.0, + "step": 1115 + }, + { + "epoch": 0.13262032085561498, + "grad_norm": 0.681824854391346, + "learning_rate": 1.993806063155988e-05, + "loss": 0.9635, + "num_tokens": 4664092872.0, + "step": 1116 + }, + { + "epoch": 0.13273915626856803, + "grad_norm": 0.9006650206372103, + "learning_rate": 1.9937857619420348e-05, + "loss": 0.952, + "num_tokens": 4668250227.0, + "step": 1117 + }, + { + "epoch": 0.1328579916815211, + "grad_norm": 0.8283919901979867, + "learning_rate": 1.9937654276281725e-05, + "loss": 0.99, + "num_tokens": 4672439753.0, + "step": 1118 + }, + { + "epoch": 0.13297682709447414, + "grad_norm": 0.7049291001888223, + "learning_rate": 1.9937450602151548e-05, + "loss": 0.9742, + "num_tokens": 4676620926.0, + "step": 1119 + }, + { + "epoch": 0.13309566250742721, + "grad_norm": 0.9099391587545077, + "learning_rate": 1.9937246597037358e-05, + "loss": 0.9493, + "num_tokens": 4680810610.0, + "step": 1120 + }, + { + "epoch": 0.1332144979203803, + "grad_norm": 0.5593171730608899, + "learning_rate": 1.9937042260946708e-05, + "loss": 0.953, + "num_tokens": 4684999118.0, + "step": 1121 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.8253593023882928, + "learning_rate": 1.9936837593887163e-05, + "loss": 0.9447, + "num_tokens": 4689186626.0, + "step": 1122 + }, + { + "epoch": 0.1334521687462864, + "grad_norm": 0.7097927165793295, + "learning_rate": 1.993663259586631e-05, + "loss": 0.9508, + "num_tokens": 4693377162.0, + "step": 1123 + }, + { + "epoch": 0.13357100415923945, + "grad_norm": 0.7259861482282806, + "learning_rate": 1.9936427266891735e-05, + "loss": 0.9911, + "num_tokens": 4697550796.0, + "step": 1124 + }, + { + "epoch": 0.13368983957219252, + "grad_norm": 0.7935846554192104, + "learning_rate": 1.9936221606971045e-05, + "loss": 0.9681, + "num_tokens": 4701710611.0, + "step": 1125 + }, + { + "epoch": 0.13380867498514556, + "grad_norm": 0.7491126865427866, + "learning_rate": 1.993601561611185e-05, + "loss": 0.9642, + "num_tokens": 4705874662.0, + "step": 1126 + }, + { + "epoch": 0.13392751039809864, + "grad_norm": 0.6804448751750652, + "learning_rate": 1.993580929432179e-05, + "loss": 0.9437, + "num_tokens": 4710063857.0, + "step": 1127 + }, + { + "epoch": 0.13404634581105168, + "grad_norm": 0.8348552857908935, + "learning_rate": 1.9935602641608492e-05, + "loss": 0.9557, + "num_tokens": 4714228850.0, + "step": 1128 + }, + { + "epoch": 0.13416518122400475, + "grad_norm": 0.6429952286682424, + "learning_rate": 1.993539565797962e-05, + "loss": 1.0067, + "num_tokens": 4718418639.0, + "step": 1129 + }, + { + "epoch": 0.13428401663695783, + "grad_norm": 0.8067211199671555, + "learning_rate": 1.9935188343442836e-05, + "loss": 1.0091, + "num_tokens": 4722607354.0, + "step": 1130 + }, + { + "epoch": 0.13440285204991087, + "grad_norm": 0.777602839540239, + "learning_rate": 1.9934980698005818e-05, + "loss": 0.9712, + "num_tokens": 4726795632.0, + "step": 1131 + }, + { + "epoch": 0.13452168746286394, + "grad_norm": 0.774248459222677, + "learning_rate": 1.9934772721676253e-05, + "loss": 0.966, + "num_tokens": 4730986609.0, + "step": 1132 + }, + { + "epoch": 0.134640522875817, + "grad_norm": 0.6651821218828583, + "learning_rate": 1.9934564414461846e-05, + "loss": 0.9251, + "num_tokens": 4735176731.0, + "step": 1133 + }, + { + "epoch": 0.13475935828877006, + "grad_norm": 0.8523807071507856, + "learning_rate": 1.9934355776370308e-05, + "loss": 0.9439, + "num_tokens": 4739366067.0, + "step": 1134 + }, + { + "epoch": 0.1348781937017231, + "grad_norm": 0.5767325051984443, + "learning_rate": 1.9934146807409368e-05, + "loss": 0.9628, + "num_tokens": 4743556235.0, + "step": 1135 + }, + { + "epoch": 0.13499702911467618, + "grad_norm": 0.8764046241052897, + "learning_rate": 1.993393750758677e-05, + "loss": 0.9458, + "num_tokens": 4747744955.0, + "step": 1136 + }, + { + "epoch": 0.13511586452762922, + "grad_norm": 0.6655797433140193, + "learning_rate": 1.9933727876910255e-05, + "loss": 0.9556, + "num_tokens": 4751932278.0, + "step": 1137 + }, + { + "epoch": 0.1352346999405823, + "grad_norm": 0.7862719477809412, + "learning_rate": 1.993351791538759e-05, + "loss": 0.9478, + "num_tokens": 4756122045.0, + "step": 1138 + }, + { + "epoch": 0.13535353535353536, + "grad_norm": 0.6524600009873732, + "learning_rate": 1.9933307623026553e-05, + "loss": 0.9205, + "num_tokens": 4760290833.0, + "step": 1139 + }, + { + "epoch": 0.1354723707664884, + "grad_norm": 0.7677171895463177, + "learning_rate": 1.9933096999834933e-05, + "loss": 0.9833, + "num_tokens": 4764480200.0, + "step": 1140 + }, + { + "epoch": 0.13559120617944148, + "grad_norm": 0.7542685886469763, + "learning_rate": 1.9932886045820524e-05, + "loss": 0.9859, + "num_tokens": 4768669368.0, + "step": 1141 + }, + { + "epoch": 0.13571004159239453, + "grad_norm": 0.7402217660314752, + "learning_rate": 1.9932674760991143e-05, + "loss": 0.9412, + "num_tokens": 4772817153.0, + "step": 1142 + }, + { + "epoch": 0.1358288770053476, + "grad_norm": 0.6068101193846492, + "learning_rate": 1.9932463145354613e-05, + "loss": 0.9838, + "num_tokens": 4776986640.0, + "step": 1143 + }, + { + "epoch": 0.13594771241830064, + "grad_norm": 0.7547044555175931, + "learning_rate": 1.9932251198918774e-05, + "loss": 0.9514, + "num_tokens": 4781172625.0, + "step": 1144 + }, + { + "epoch": 0.13606654783125371, + "grad_norm": 0.7234868923301616, + "learning_rate": 1.993203892169147e-05, + "loss": 0.988, + "num_tokens": 4785362218.0, + "step": 1145 + }, + { + "epoch": 0.1361853832442068, + "grad_norm": 0.8262796718239588, + "learning_rate": 1.9931826313680565e-05, + "loss": 0.9634, + "num_tokens": 4789549883.0, + "step": 1146 + }, + { + "epoch": 0.13630421865715983, + "grad_norm": 0.6502917817014159, + "learning_rate": 1.9931613374893936e-05, + "loss": 0.9548, + "num_tokens": 4793723550.0, + "step": 1147 + }, + { + "epoch": 0.1364230540701129, + "grad_norm": 0.8877251451449237, + "learning_rate": 1.9931400105339457e-05, + "loss": 0.9639, + "num_tokens": 4797912794.0, + "step": 1148 + }, + { + "epoch": 0.13654188948306595, + "grad_norm": 0.6502057382920534, + "learning_rate": 1.9931186505025043e-05, + "loss": 1.001, + "num_tokens": 4802078048.0, + "step": 1149 + }, + { + "epoch": 0.13666072489601902, + "grad_norm": 0.6660763736273236, + "learning_rate": 1.993097257395859e-05, + "loss": 0.9976, + "num_tokens": 4806266956.0, + "step": 1150 + }, + { + "epoch": 0.13677956030897206, + "grad_norm": 0.8406251870151005, + "learning_rate": 1.993075831214803e-05, + "loss": 0.9677, + "num_tokens": 4810438980.0, + "step": 1151 + }, + { + "epoch": 0.13689839572192514, + "grad_norm": 0.7979867266448336, + "learning_rate": 1.9930543719601295e-05, + "loss": 0.967, + "num_tokens": 4814608603.0, + "step": 1152 + }, + { + "epoch": 0.13701723113487818, + "grad_norm": 0.6767667035705949, + "learning_rate": 1.9930328796326324e-05, + "loss": 0.9435, + "num_tokens": 4818797638.0, + "step": 1153 + }, + { + "epoch": 0.13713606654783125, + "grad_norm": 0.696559212756926, + "learning_rate": 1.993011354233109e-05, + "loss": 0.9521, + "num_tokens": 4822980939.0, + "step": 1154 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 0.721783307878327, + "learning_rate": 1.9929897957623557e-05, + "loss": 0.9462, + "num_tokens": 4827109624.0, + "step": 1155 + }, + { + "epoch": 0.13737373737373737, + "grad_norm": 0.7740422011196754, + "learning_rate": 1.9929682042211708e-05, + "loss": 0.9679, + "num_tokens": 4831277807.0, + "step": 1156 + }, + { + "epoch": 0.13749257278669044, + "grad_norm": 0.7053907851819737, + "learning_rate": 1.9929465796103543e-05, + "loss": 0.9474, + "num_tokens": 4835457706.0, + "step": 1157 + }, + { + "epoch": 0.1376114081996435, + "grad_norm": 0.6523617061480056, + "learning_rate": 1.992924921930707e-05, + "loss": 0.9887, + "num_tokens": 4839646158.0, + "step": 1158 + }, + { + "epoch": 0.13773024361259656, + "grad_norm": 0.8152752889003824, + "learning_rate": 1.9929032311830303e-05, + "loss": 0.9254, + "num_tokens": 4843814829.0, + "step": 1159 + }, + { + "epoch": 0.1378490790255496, + "grad_norm": 0.7577313664112107, + "learning_rate": 1.992881507368128e-05, + "loss": 1.0012, + "num_tokens": 4847991049.0, + "step": 1160 + }, + { + "epoch": 0.13796791443850268, + "grad_norm": 0.6450442712228174, + "learning_rate": 1.9928597504868048e-05, + "loss": 0.9406, + "num_tokens": 4852153298.0, + "step": 1161 + }, + { + "epoch": 0.13808674985145572, + "grad_norm": 0.8799879977173114, + "learning_rate": 1.9928379605398665e-05, + "loss": 0.9568, + "num_tokens": 4856341652.0, + "step": 1162 + }, + { + "epoch": 0.1382055852644088, + "grad_norm": 0.7302760991066107, + "learning_rate": 1.9928161375281193e-05, + "loss": 0.958, + "num_tokens": 4860530426.0, + "step": 1163 + }, + { + "epoch": 0.13832442067736186, + "grad_norm": 0.7036233281773469, + "learning_rate": 1.992794281452372e-05, + "loss": 0.931, + "num_tokens": 4864679271.0, + "step": 1164 + }, + { + "epoch": 0.1384432560903149, + "grad_norm": 0.7130704530638414, + "learning_rate": 1.9927723923134336e-05, + "loss": 0.9503, + "num_tokens": 4868807253.0, + "step": 1165 + }, + { + "epoch": 0.13856209150326798, + "grad_norm": 0.6327660626790802, + "learning_rate": 1.9927504701121154e-05, + "loss": 0.9417, + "num_tokens": 4872988217.0, + "step": 1166 + }, + { + "epoch": 0.13868092691622103, + "grad_norm": 0.8447164313890676, + "learning_rate": 1.992728514849229e-05, + "loss": 0.9793, + "num_tokens": 4877176670.0, + "step": 1167 + }, + { + "epoch": 0.1387997623291741, + "grad_norm": 0.6864831888569667, + "learning_rate": 1.9927065265255867e-05, + "loss": 0.9692, + "num_tokens": 4881324163.0, + "step": 1168 + }, + { + "epoch": 0.13891859774212714, + "grad_norm": 0.7068105056509745, + "learning_rate": 1.9926845051420036e-05, + "loss": 0.9523, + "num_tokens": 4885497996.0, + "step": 1169 + }, + { + "epoch": 0.13903743315508021, + "grad_norm": 0.7707055259601188, + "learning_rate": 1.9926624506992952e-05, + "loss": 0.9341, + "num_tokens": 4889686829.0, + "step": 1170 + }, + { + "epoch": 0.1391562685680333, + "grad_norm": 0.7665184152859191, + "learning_rate": 1.9926403631982782e-05, + "loss": 0.943, + "num_tokens": 4893876097.0, + "step": 1171 + }, + { + "epoch": 0.13927510398098633, + "grad_norm": 0.6643782273627882, + "learning_rate": 1.99261824263977e-05, + "loss": 0.9674, + "num_tokens": 4898041740.0, + "step": 1172 + }, + { + "epoch": 0.1393939393939394, + "grad_norm": 0.7362849839765234, + "learning_rate": 1.9925960890245904e-05, + "loss": 0.9214, + "num_tokens": 4902214212.0, + "step": 1173 + }, + { + "epoch": 0.13951277480689245, + "grad_norm": 0.6810655368390064, + "learning_rate": 1.99257390235356e-05, + "loss": 0.9549, + "num_tokens": 4906396577.0, + "step": 1174 + }, + { + "epoch": 0.13963161021984552, + "grad_norm": 0.8080166733898028, + "learning_rate": 1.9925516826275e-05, + "loss": 0.9754, + "num_tokens": 4910583550.0, + "step": 1175 + }, + { + "epoch": 0.13975044563279856, + "grad_norm": 0.7052885451963351, + "learning_rate": 1.9925294298472334e-05, + "loss": 0.9643, + "num_tokens": 4914745887.0, + "step": 1176 + }, + { + "epoch": 0.13986928104575164, + "grad_norm": 0.755197871925635, + "learning_rate": 1.992507144013584e-05, + "loss": 0.9355, + "num_tokens": 4918934205.0, + "step": 1177 + }, + { + "epoch": 0.13998811645870468, + "grad_norm": 0.7179843281018528, + "learning_rate": 1.9924848251273778e-05, + "loss": 0.9623, + "num_tokens": 4923123289.0, + "step": 1178 + }, + { + "epoch": 0.14010695187165775, + "grad_norm": 0.5906312257374838, + "learning_rate": 1.9924624731894408e-05, + "loss": 0.9456, + "num_tokens": 4927310854.0, + "step": 1179 + }, + { + "epoch": 0.14022578728461083, + "grad_norm": 0.8721153469366153, + "learning_rate": 1.992440088200601e-05, + "loss": 0.9658, + "num_tokens": 4931498968.0, + "step": 1180 + }, + { + "epoch": 0.14034462269756387, + "grad_norm": 0.7789941730627563, + "learning_rate": 1.9924176701616874e-05, + "loss": 0.9705, + "num_tokens": 4935679925.0, + "step": 1181 + }, + { + "epoch": 0.14046345811051694, + "grad_norm": 0.6877991296136948, + "learning_rate": 1.9923952190735302e-05, + "loss": 0.9586, + "num_tokens": 4939849446.0, + "step": 1182 + }, + { + "epoch": 0.14058229352347, + "grad_norm": 0.6679286813545187, + "learning_rate": 1.9923727349369602e-05, + "loss": 0.942, + "num_tokens": 4944038284.0, + "step": 1183 + }, + { + "epoch": 0.14070112893642306, + "grad_norm": 0.9062722388897982, + "learning_rate": 1.9923502177528113e-05, + "loss": 0.9649, + "num_tokens": 4948210594.0, + "step": 1184 + }, + { + "epoch": 0.1408199643493761, + "grad_norm": 0.5718722195322196, + "learning_rate": 1.9923276675219162e-05, + "loss": 0.9706, + "num_tokens": 4952380463.0, + "step": 1185 + }, + { + "epoch": 0.14093879976232918, + "grad_norm": 0.8534520262650257, + "learning_rate": 1.992305084245111e-05, + "loss": 0.9676, + "num_tokens": 4956569496.0, + "step": 1186 + }, + { + "epoch": 0.14105763517528225, + "grad_norm": 0.7303145627656389, + "learning_rate": 1.9922824679232313e-05, + "loss": 0.9897, + "num_tokens": 4960759369.0, + "step": 1187 + }, + { + "epoch": 0.1411764705882353, + "grad_norm": 0.7612479907501364, + "learning_rate": 1.9922598185571152e-05, + "loss": 0.9448, + "num_tokens": 4964947770.0, + "step": 1188 + }, + { + "epoch": 0.14129530600118836, + "grad_norm": 0.6169908252988667, + "learning_rate": 1.992237136147601e-05, + "loss": 0.9915, + "num_tokens": 4969135747.0, + "step": 1189 + }, + { + "epoch": 0.1414141414141414, + "grad_norm": 0.9492675608365553, + "learning_rate": 1.992214420695529e-05, + "loss": 0.9597, + "num_tokens": 4973323897.0, + "step": 1190 + }, + { + "epoch": 0.14153297682709448, + "grad_norm": 0.6118219267892957, + "learning_rate": 1.9921916722017403e-05, + "loss": 0.9506, + "num_tokens": 4977513401.0, + "step": 1191 + }, + { + "epoch": 0.14165181224004753, + "grad_norm": 0.7869080203868816, + "learning_rate": 1.9921688906670772e-05, + "loss": 0.9931, + "num_tokens": 4981700101.0, + "step": 1192 + }, + { + "epoch": 0.1417706476530006, + "grad_norm": 0.6988484819961142, + "learning_rate": 1.9921460760923838e-05, + "loss": 0.9387, + "num_tokens": 4985890297.0, + "step": 1193 + }, + { + "epoch": 0.14188948306595364, + "grad_norm": 0.689570487021352, + "learning_rate": 1.992123228478505e-05, + "loss": 0.9384, + "num_tokens": 4990052861.0, + "step": 1194 + }, + { + "epoch": 0.14200831847890671, + "grad_norm": 0.7393505851894069, + "learning_rate": 1.9921003478262865e-05, + "loss": 0.9183, + "num_tokens": 4994242069.0, + "step": 1195 + }, + { + "epoch": 0.1421271538918598, + "grad_norm": 0.7452090744528073, + "learning_rate": 1.9920774341365756e-05, + "loss": 0.9859, + "num_tokens": 4998433139.0, + "step": 1196 + }, + { + "epoch": 0.14224598930481283, + "grad_norm": 0.5808503458687709, + "learning_rate": 1.9920544874102217e-05, + "loss": 0.9552, + "num_tokens": 5002620781.0, + "step": 1197 + }, + { + "epoch": 0.1423648247177659, + "grad_norm": 0.7844083644794124, + "learning_rate": 1.9920315076480737e-05, + "loss": 0.927, + "num_tokens": 5006810211.0, + "step": 1198 + }, + { + "epoch": 0.14248366013071895, + "grad_norm": 0.7308568513808285, + "learning_rate": 1.9920084948509828e-05, + "loss": 0.9346, + "num_tokens": 5010998574.0, + "step": 1199 + }, + { + "epoch": 0.14260249554367202, + "grad_norm": 0.6898846045601839, + "learning_rate": 1.9919854490198017e-05, + "loss": 0.9925, + "num_tokens": 5015186942.0, + "step": 1200 + }, + { + "epoch": 0.14272133095662506, + "grad_norm": 0.8527612613884503, + "learning_rate": 1.9919623701553833e-05, + "loss": 0.9455, + "num_tokens": 5019374823.0, + "step": 1201 + }, + { + "epoch": 0.14284016636957814, + "grad_norm": 0.6897125607582677, + "learning_rate": 1.9919392582585825e-05, + "loss": 0.9727, + "num_tokens": 5023565409.0, + "step": 1202 + }, + { + "epoch": 0.14295900178253118, + "grad_norm": 0.8039654279915485, + "learning_rate": 1.9919161133302556e-05, + "loss": 0.92, + "num_tokens": 5027753498.0, + "step": 1203 + }, + { + "epoch": 0.14307783719548425, + "grad_norm": 0.7803706193420032, + "learning_rate": 1.991892935371259e-05, + "loss": 0.95, + "num_tokens": 5031942016.0, + "step": 1204 + }, + { + "epoch": 0.14319667260843733, + "grad_norm": 0.6579359076861981, + "learning_rate": 1.9918697243824517e-05, + "loss": 0.9295, + "num_tokens": 5036133066.0, + "step": 1205 + }, + { + "epoch": 0.14331550802139037, + "grad_norm": 0.7300324865857003, + "learning_rate": 1.9918464803646928e-05, + "loss": 0.9835, + "num_tokens": 5040314756.0, + "step": 1206 + }, + { + "epoch": 0.14343434343434344, + "grad_norm": 0.7208332975437884, + "learning_rate": 1.9918232033188438e-05, + "loss": 0.9286, + "num_tokens": 5044500528.0, + "step": 1207 + }, + { + "epoch": 0.1435531788472965, + "grad_norm": 0.6970900757003892, + "learning_rate": 1.991799893245766e-05, + "loss": 0.9611, + "num_tokens": 5048670954.0, + "step": 1208 + }, + { + "epoch": 0.14367201426024956, + "grad_norm": 0.6835912282377331, + "learning_rate": 1.9917765501463226e-05, + "loss": 0.9371, + "num_tokens": 5052859941.0, + "step": 1209 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 0.7447276086605391, + "learning_rate": 1.9917531740213783e-05, + "loss": 0.9571, + "num_tokens": 5057013247.0, + "step": 1210 + }, + { + "epoch": 0.14390968508615568, + "grad_norm": 0.6238183529281592, + "learning_rate": 1.9917297648717996e-05, + "loss": 0.9478, + "num_tokens": 5061201240.0, + "step": 1211 + }, + { + "epoch": 0.14402852049910875, + "grad_norm": 0.8004201500907543, + "learning_rate": 1.991706322698452e-05, + "loss": 0.9529, + "num_tokens": 5065389338.0, + "step": 1212 + }, + { + "epoch": 0.1441473559120618, + "grad_norm": 0.6693893778870469, + "learning_rate": 1.9916828475022045e-05, + "loss": 0.9252, + "num_tokens": 5069552809.0, + "step": 1213 + }, + { + "epoch": 0.14426619132501486, + "grad_norm": 0.8673349195025629, + "learning_rate": 1.9916593392839264e-05, + "loss": 0.9223, + "num_tokens": 5073742348.0, + "step": 1214 + }, + { + "epoch": 0.1443850267379679, + "grad_norm": 1.6302415102982852, + "learning_rate": 1.9916357980444882e-05, + "loss": 0.9341, + "num_tokens": 5077909953.0, + "step": 1215 + }, + { + "epoch": 0.14450386215092098, + "grad_norm": 0.9517161355264133, + "learning_rate": 1.9916122237847618e-05, + "loss": 0.9391, + "num_tokens": 5082072012.0, + "step": 1216 + }, + { + "epoch": 0.14462269756387403, + "grad_norm": 1.1716230346612546, + "learning_rate": 1.99158861650562e-05, + "loss": 0.9501, + "num_tokens": 5086261883.0, + "step": 1217 + }, + { + "epoch": 0.1447415329768271, + "grad_norm": 0.6727005159525272, + "learning_rate": 1.9915649762079375e-05, + "loss": 0.9591, + "num_tokens": 5090434659.0, + "step": 1218 + }, + { + "epoch": 0.14486036838978014, + "grad_norm": 1.74704456829988, + "learning_rate": 1.991541302892589e-05, + "loss": 0.9385, + "num_tokens": 5094604359.0, + "step": 1219 + }, + { + "epoch": 0.14497920380273321, + "grad_norm": 1.4119870258753726, + "learning_rate": 1.991517596560452e-05, + "loss": 0.959, + "num_tokens": 5098791703.0, + "step": 1220 + }, + { + "epoch": 0.1450980392156863, + "grad_norm": 1.2705281215351067, + "learning_rate": 1.991493857212404e-05, + "loss": 0.9549, + "num_tokens": 5102965666.0, + "step": 1221 + }, + { + "epoch": 0.14521687462863933, + "grad_norm": 1.1367247729370633, + "learning_rate": 1.9914700848493243e-05, + "loss": 0.9691, + "num_tokens": 5107155394.0, + "step": 1222 + }, + { + "epoch": 0.1453357100415924, + "grad_norm": 1.1202644138579976, + "learning_rate": 1.991446279472093e-05, + "loss": 0.9775, + "num_tokens": 5111344599.0, + "step": 1223 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 0.9239942233975728, + "learning_rate": 1.991422441081592e-05, + "loss": 0.967, + "num_tokens": 5115534525.0, + "step": 1224 + }, + { + "epoch": 0.14557338086749852, + "grad_norm": 0.9846743170115031, + "learning_rate": 1.9913985696787044e-05, + "loss": 0.9523, + "num_tokens": 5119689072.0, + "step": 1225 + }, + { + "epoch": 0.14569221628045156, + "grad_norm": 0.9203973482384977, + "learning_rate": 1.9913746652643136e-05, + "loss": 0.9858, + "num_tokens": 5123876954.0, + "step": 1226 + }, + { + "epoch": 0.14581105169340464, + "grad_norm": 0.9378418954057743, + "learning_rate": 1.9913507278393052e-05, + "loss": 0.9245, + "num_tokens": 5128014731.0, + "step": 1227 + }, + { + "epoch": 0.14592988710635768, + "grad_norm": 0.8352644956983016, + "learning_rate": 1.9913267574045658e-05, + "loss": 0.9357, + "num_tokens": 5132178249.0, + "step": 1228 + }, + { + "epoch": 0.14604872251931075, + "grad_norm": 0.8369396930094217, + "learning_rate": 1.9913027539609825e-05, + "loss": 0.9547, + "num_tokens": 5136360539.0, + "step": 1229 + }, + { + "epoch": 0.14616755793226383, + "grad_norm": 0.7679600396292392, + "learning_rate": 1.9912787175094447e-05, + "loss": 0.9835, + "num_tokens": 5140528591.0, + "step": 1230 + }, + { + "epoch": 0.14628639334521687, + "grad_norm": 0.8673561702434729, + "learning_rate": 1.9912546480508427e-05, + "loss": 0.9844, + "num_tokens": 5144717018.0, + "step": 1231 + }, + { + "epoch": 0.14640522875816994, + "grad_norm": 0.7907193589901438, + "learning_rate": 1.991230545586068e-05, + "loss": 0.9494, + "num_tokens": 5148907167.0, + "step": 1232 + }, + { + "epoch": 0.146524064171123, + "grad_norm": 0.6977861801244029, + "learning_rate": 1.9912064101160122e-05, + "loss": 0.9206, + "num_tokens": 5153095999.0, + "step": 1233 + }, + { + "epoch": 0.14664289958407606, + "grad_norm": 0.8030871234734892, + "learning_rate": 1.99118224164157e-05, + "loss": 0.952, + "num_tokens": 5157285752.0, + "step": 1234 + }, + { + "epoch": 0.1467617349970291, + "grad_norm": 0.7144631678774542, + "learning_rate": 1.9911580401636365e-05, + "loss": 0.9521, + "num_tokens": 5161475134.0, + "step": 1235 + }, + { + "epoch": 0.14688057040998218, + "grad_norm": 0.7529897279073902, + "learning_rate": 1.9911338056831077e-05, + "loss": 0.9578, + "num_tokens": 5165665023.0, + "step": 1236 + }, + { + "epoch": 0.14699940582293525, + "grad_norm": 0.5276145615541703, + "learning_rate": 1.9911095382008806e-05, + "loss": 0.934, + "num_tokens": 5169833210.0, + "step": 1237 + }, + { + "epoch": 0.1471182412358883, + "grad_norm": 0.9479803681893515, + "learning_rate": 1.991085237717855e-05, + "loss": 0.9574, + "num_tokens": 5174022329.0, + "step": 1238 + }, + { + "epoch": 0.14723707664884136, + "grad_norm": 0.6791170091998747, + "learning_rate": 1.9910609042349296e-05, + "loss": 0.9596, + "num_tokens": 5178184692.0, + "step": 1239 + }, + { + "epoch": 0.1473559120617944, + "grad_norm": 0.8912556592329429, + "learning_rate": 1.9910365377530068e-05, + "loss": 0.943, + "num_tokens": 5182374097.0, + "step": 1240 + }, + { + "epoch": 0.14747474747474748, + "grad_norm": 0.6065242642802171, + "learning_rate": 1.9910121382729877e-05, + "loss": 0.8923, + "num_tokens": 5186563824.0, + "step": 1241 + }, + { + "epoch": 0.14759358288770053, + "grad_norm": 0.8976932396591107, + "learning_rate": 1.9909877057957773e-05, + "loss": 0.9478, + "num_tokens": 5190729762.0, + "step": 1242 + }, + { + "epoch": 0.1477124183006536, + "grad_norm": 0.6556000116246871, + "learning_rate": 1.9909632403222788e-05, + "loss": 0.9468, + "num_tokens": 5194912906.0, + "step": 1243 + }, + { + "epoch": 0.14783125371360664, + "grad_norm": 0.846818437469568, + "learning_rate": 1.9909387418534e-05, + "loss": 0.9479, + "num_tokens": 5199101166.0, + "step": 1244 + }, + { + "epoch": 0.14795008912655971, + "grad_norm": 0.684540092727246, + "learning_rate": 1.9909142103900463e-05, + "loss": 0.9206, + "num_tokens": 5203252478.0, + "step": 1245 + }, + { + "epoch": 0.1480689245395128, + "grad_norm": 0.7919047178000013, + "learning_rate": 1.9908896459331277e-05, + "loss": 0.9602, + "num_tokens": 5207440867.0, + "step": 1246 + }, + { + "epoch": 0.14818775995246583, + "grad_norm": 0.7544365893838914, + "learning_rate": 1.9908650484835534e-05, + "loss": 0.9713, + "num_tokens": 5211603251.0, + "step": 1247 + }, + { + "epoch": 0.1483065953654189, + "grad_norm": 0.650746184585067, + "learning_rate": 1.990840418042234e-05, + "loss": 0.9413, + "num_tokens": 5215773819.0, + "step": 1248 + }, + { + "epoch": 0.14842543077837195, + "grad_norm": 0.8067557984286841, + "learning_rate": 1.9908157546100823e-05, + "loss": 0.9977, + "num_tokens": 5219956204.0, + "step": 1249 + }, + { + "epoch": 0.14854426619132502, + "grad_norm": 0.6747562065375045, + "learning_rate": 1.990791058188011e-05, + "loss": 0.9668, + "num_tokens": 5224143981.0, + "step": 1250 + }, + { + "epoch": 0.14866310160427806, + "grad_norm": 0.7346491840820215, + "learning_rate": 1.990766328776935e-05, + "loss": 0.9684, + "num_tokens": 5228305310.0, + "step": 1251 + }, + { + "epoch": 0.14878193701723114, + "grad_norm": 0.6403839817568258, + "learning_rate": 1.99074156637777e-05, + "loss": 0.925, + "num_tokens": 5232487968.0, + "step": 1252 + }, + { + "epoch": 0.1489007724301842, + "grad_norm": 0.7869606194809406, + "learning_rate": 1.9907167709914333e-05, + "loss": 0.9551, + "num_tokens": 5236677845.0, + "step": 1253 + }, + { + "epoch": 0.14901960784313725, + "grad_norm": 0.7541383337733444, + "learning_rate": 1.9906919426188428e-05, + "loss": 0.93, + "num_tokens": 5240860156.0, + "step": 1254 + }, + { + "epoch": 0.14913844325609033, + "grad_norm": 0.5922562784379818, + "learning_rate": 1.9906670812609184e-05, + "loss": 0.9295, + "num_tokens": 5245046947.0, + "step": 1255 + }, + { + "epoch": 0.14925727866904337, + "grad_norm": 0.7503675512826634, + "learning_rate": 1.99064218691858e-05, + "loss": 0.9345, + "num_tokens": 5249208003.0, + "step": 1256 + }, + { + "epoch": 0.14937611408199644, + "grad_norm": 0.7467728708480578, + "learning_rate": 1.990617259592751e-05, + "loss": 0.9618, + "num_tokens": 5253392927.0, + "step": 1257 + }, + { + "epoch": 0.1494949494949495, + "grad_norm": 0.6945720162106186, + "learning_rate": 1.990592299284353e-05, + "loss": 0.9237, + "num_tokens": 5257555487.0, + "step": 1258 + }, + { + "epoch": 0.14961378490790256, + "grad_norm": 0.6501020197581509, + "learning_rate": 1.990567305994311e-05, + "loss": 0.9987, + "num_tokens": 5261744470.0, + "step": 1259 + }, + { + "epoch": 0.1497326203208556, + "grad_norm": 0.6986293458675834, + "learning_rate": 1.990542279723551e-05, + "loss": 0.9532, + "num_tokens": 5265927105.0, + "step": 1260 + }, + { + "epoch": 0.14985145573380868, + "grad_norm": 0.771745935895565, + "learning_rate": 1.990517220472999e-05, + "loss": 0.9899, + "num_tokens": 5270112666.0, + "step": 1261 + }, + { + "epoch": 0.14997029114676175, + "grad_norm": 0.7192976595202728, + "learning_rate": 1.9904921282435835e-05, + "loss": 0.9595, + "num_tokens": 5274301937.0, + "step": 1262 + }, + { + "epoch": 0.1500891265597148, + "grad_norm": 0.6967684203887646, + "learning_rate": 1.9904670030362337e-05, + "loss": 0.9609, + "num_tokens": 5278471057.0, + "step": 1263 + }, + { + "epoch": 0.15020796197266786, + "grad_norm": 0.6451419126201817, + "learning_rate": 1.99044184485188e-05, + "loss": 0.9457, + "num_tokens": 5282655746.0, + "step": 1264 + }, + { + "epoch": 0.1503267973856209, + "grad_norm": 0.7501427672530548, + "learning_rate": 1.9904166536914543e-05, + "loss": 0.9667, + "num_tokens": 5286817285.0, + "step": 1265 + }, + { + "epoch": 0.15044563279857398, + "grad_norm": 0.8922483418902709, + "learning_rate": 1.9903914295558892e-05, + "loss": 0.9299, + "num_tokens": 5291006381.0, + "step": 1266 + }, + { + "epoch": 0.15056446821152702, + "grad_norm": 0.6226714373270811, + "learning_rate": 1.990366172446119e-05, + "loss": 0.9512, + "num_tokens": 5295185970.0, + "step": 1267 + }, + { + "epoch": 0.1506833036244801, + "grad_norm": 0.6112028213773857, + "learning_rate": 1.990340882363079e-05, + "loss": 0.9777, + "num_tokens": 5299345109.0, + "step": 1268 + }, + { + "epoch": 0.15080213903743314, + "grad_norm": 0.8979182993541209, + "learning_rate": 1.9903155593077057e-05, + "loss": 0.9605, + "num_tokens": 5303533345.0, + "step": 1269 + }, + { + "epoch": 0.1509209744503862, + "grad_norm": 0.5516348534054559, + "learning_rate": 1.9902902032809375e-05, + "loss": 0.9177, + "num_tokens": 5307721569.0, + "step": 1270 + }, + { + "epoch": 0.15103980986333929, + "grad_norm": 0.8352225156731109, + "learning_rate": 1.9902648142837124e-05, + "loss": 0.9542, + "num_tokens": 5311908565.0, + "step": 1271 + }, + { + "epoch": 0.15115864527629233, + "grad_norm": 0.618767492454716, + "learning_rate": 1.9902393923169712e-05, + "loss": 0.9275, + "num_tokens": 5316098057.0, + "step": 1272 + }, + { + "epoch": 0.1512774806892454, + "grad_norm": 0.7033619730523532, + "learning_rate": 1.9902139373816555e-05, + "loss": 0.9654, + "num_tokens": 5320287061.0, + "step": 1273 + }, + { + "epoch": 0.15139631610219845, + "grad_norm": 0.7834459652066604, + "learning_rate": 1.990188449478708e-05, + "loss": 0.9255, + "num_tokens": 5324476311.0, + "step": 1274 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.6506872869519394, + "learning_rate": 1.990162928609072e-05, + "loss": 0.9626, + "num_tokens": 5328660097.0, + "step": 1275 + }, + { + "epoch": 0.15163398692810456, + "grad_norm": 0.7404305162873548, + "learning_rate": 1.9901373747736933e-05, + "loss": 0.9378, + "num_tokens": 5332850196.0, + "step": 1276 + }, + { + "epoch": 0.15175282234105764, + "grad_norm": 0.7074968421588921, + "learning_rate": 1.990111787973518e-05, + "loss": 0.9357, + "num_tokens": 5337040435.0, + "step": 1277 + }, + { + "epoch": 0.1518716577540107, + "grad_norm": 0.8392754551079492, + "learning_rate": 1.9900861682094935e-05, + "loss": 0.9428, + "num_tokens": 5341201124.0, + "step": 1278 + }, + { + "epoch": 0.15199049316696375, + "grad_norm": 0.676384958816264, + "learning_rate": 1.9900605154825687e-05, + "loss": 0.9723, + "num_tokens": 5345391838.0, + "step": 1279 + }, + { + "epoch": 0.15210932857991682, + "grad_norm": 0.8338614174230209, + "learning_rate": 1.990034829793694e-05, + "loss": 0.9668, + "num_tokens": 5349580430.0, + "step": 1280 + }, + { + "epoch": 0.15222816399286987, + "grad_norm": 0.570214232066067, + "learning_rate": 1.99000911114382e-05, + "loss": 0.913, + "num_tokens": 5353769768.0, + "step": 1281 + }, + { + "epoch": 0.15234699940582294, + "grad_norm": 0.9530113752326732, + "learning_rate": 1.9899833595338997e-05, + "loss": 0.9457, + "num_tokens": 5357958981.0, + "step": 1282 + }, + { + "epoch": 0.15246583481877599, + "grad_norm": 0.5574670248622807, + "learning_rate": 1.989957574964886e-05, + "loss": 0.953, + "num_tokens": 5362105283.0, + "step": 1283 + }, + { + "epoch": 0.15258467023172906, + "grad_norm": 0.9679871082915698, + "learning_rate": 1.9899317574377345e-05, + "loss": 0.9749, + "num_tokens": 5366294385.0, + "step": 1284 + }, + { + "epoch": 0.1527035056446821, + "grad_norm": 0.6969674157922682, + "learning_rate": 1.9899059069534014e-05, + "loss": 0.9531, + "num_tokens": 5370483413.0, + "step": 1285 + }, + { + "epoch": 0.15282234105763517, + "grad_norm": 0.8168774274149517, + "learning_rate": 1.989880023512843e-05, + "loss": 0.9355, + "num_tokens": 5374672932.0, + "step": 1286 + }, + { + "epoch": 0.15294117647058825, + "grad_norm": 0.7437183725585524, + "learning_rate": 1.9898541071170193e-05, + "loss": 0.9601, + "num_tokens": 5378836184.0, + "step": 1287 + }, + { + "epoch": 0.1530600118835413, + "grad_norm": 0.6497891896077213, + "learning_rate": 1.989828157766889e-05, + "loss": 0.9558, + "num_tokens": 5383020208.0, + "step": 1288 + }, + { + "epoch": 0.15317884729649436, + "grad_norm": 0.6424565908560183, + "learning_rate": 1.9898021754634137e-05, + "loss": 0.9508, + "num_tokens": 5387209710.0, + "step": 1289 + }, + { + "epoch": 0.1532976827094474, + "grad_norm": 0.8607287228913603, + "learning_rate": 1.989776160207555e-05, + "loss": 0.9748, + "num_tokens": 5391399840.0, + "step": 1290 + }, + { + "epoch": 0.15341651812240048, + "grad_norm": 0.5793328298516703, + "learning_rate": 1.989750112000277e-05, + "loss": 0.9641, + "num_tokens": 5395588383.0, + "step": 1291 + }, + { + "epoch": 0.15353535353535352, + "grad_norm": 0.8115037887720857, + "learning_rate": 1.9897240308425437e-05, + "loss": 0.9278, + "num_tokens": 5399777917.0, + "step": 1292 + }, + { + "epoch": 0.1536541889483066, + "grad_norm": 0.5956311802948843, + "learning_rate": 1.9896979167353214e-05, + "loss": 0.9687, + "num_tokens": 5403968884.0, + "step": 1293 + }, + { + "epoch": 0.15377302436125967, + "grad_norm": 0.778140679739045, + "learning_rate": 1.9896717696795772e-05, + "loss": 0.9184, + "num_tokens": 5408112377.0, + "step": 1294 + }, + { + "epoch": 0.1538918597742127, + "grad_norm": 0.7811785034374199, + "learning_rate": 1.9896455896762796e-05, + "loss": 0.9886, + "num_tokens": 5412256977.0, + "step": 1295 + }, + { + "epoch": 0.15401069518716579, + "grad_norm": 0.6553878566210208, + "learning_rate": 1.9896193767263974e-05, + "loss": 0.9565, + "num_tokens": 5416445121.0, + "step": 1296 + }, + { + "epoch": 0.15412953060011883, + "grad_norm": 0.7215073748161966, + "learning_rate": 1.989593130830902e-05, + "loss": 0.9652, + "num_tokens": 5420621283.0, + "step": 1297 + }, + { + "epoch": 0.1542483660130719, + "grad_norm": 0.6681379091855651, + "learning_rate": 1.9895668519907653e-05, + "loss": 0.9308, + "num_tokens": 5424789518.0, + "step": 1298 + }, + { + "epoch": 0.15436720142602495, + "grad_norm": 0.7445382271160393, + "learning_rate": 1.98954054020696e-05, + "loss": 0.9275, + "num_tokens": 5428949294.0, + "step": 1299 + }, + { + "epoch": 0.15448603683897802, + "grad_norm": 0.6477716991875969, + "learning_rate": 1.9895141954804613e-05, + "loss": 0.9634, + "num_tokens": 5433093107.0, + "step": 1300 + }, + { + "epoch": 0.15460487225193106, + "grad_norm": 0.6883750409830114, + "learning_rate": 1.9894878178122438e-05, + "loss": 0.9694, + "num_tokens": 5437280604.0, + "step": 1301 + }, + { + "epoch": 0.15472370766488414, + "grad_norm": 0.8053314270805619, + "learning_rate": 1.989461407203286e-05, + "loss": 0.9323, + "num_tokens": 5441468934.0, + "step": 1302 + }, + { + "epoch": 0.1548425430778372, + "grad_norm": 0.5892370671479631, + "learning_rate": 1.989434963654564e-05, + "loss": 0.9749, + "num_tokens": 5445658888.0, + "step": 1303 + }, + { + "epoch": 0.15496137849079025, + "grad_norm": 0.7930992348049283, + "learning_rate": 1.9894084871670584e-05, + "loss": 0.9776, + "num_tokens": 5449817389.0, + "step": 1304 + }, + { + "epoch": 0.15508021390374332, + "grad_norm": 0.6945264161609481, + "learning_rate": 1.9893819777417492e-05, + "loss": 0.9464, + "num_tokens": 5454006684.0, + "step": 1305 + }, + { + "epoch": 0.15519904931669637, + "grad_norm": 0.7294381060608739, + "learning_rate": 1.989355435379619e-05, + "loss": 0.9757, + "num_tokens": 5458181647.0, + "step": 1306 + }, + { + "epoch": 0.15531788472964944, + "grad_norm": 0.7958105578208631, + "learning_rate": 1.9893288600816494e-05, + "loss": 0.9178, + "num_tokens": 5462370302.0, + "step": 1307 + }, + { + "epoch": 0.15543672014260249, + "grad_norm": 0.6669442005056503, + "learning_rate": 1.9893022518488248e-05, + "loss": 0.8919, + "num_tokens": 5466560198.0, + "step": 1308 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.8217164985393092, + "learning_rate": 1.9892756106821317e-05, + "loss": 0.8984, + "num_tokens": 5470750327.0, + "step": 1309 + }, + { + "epoch": 0.1556743909685086, + "grad_norm": 0.6953466974280383, + "learning_rate": 1.989248936582556e-05, + "loss": 0.9597, + "num_tokens": 5474940812.0, + "step": 1310 + }, + { + "epoch": 0.15579322638146167, + "grad_norm": 0.7533760988804669, + "learning_rate": 1.9892222295510855e-05, + "loss": 0.9872, + "num_tokens": 5479130532.0, + "step": 1311 + }, + { + "epoch": 0.15591206179441475, + "grad_norm": 0.5630944693684838, + "learning_rate": 1.989195489588709e-05, + "loss": 0.9842, + "num_tokens": 5483318864.0, + "step": 1312 + }, + { + "epoch": 0.1560308972073678, + "grad_norm": 0.726589364113958, + "learning_rate": 1.9891687166964172e-05, + "loss": 0.908, + "num_tokens": 5487509461.0, + "step": 1313 + }, + { + "epoch": 0.15614973262032086, + "grad_norm": 0.682086457804278, + "learning_rate": 1.989141910875202e-05, + "loss": 0.918, + "num_tokens": 5491698844.0, + "step": 1314 + }, + { + "epoch": 0.1562685680332739, + "grad_norm": 0.6168663209594061, + "learning_rate": 1.989115072126055e-05, + "loss": 0.969, + "num_tokens": 5495886189.0, + "step": 1315 + }, + { + "epoch": 0.15638740344622698, + "grad_norm": 0.6863132997871506, + "learning_rate": 1.9890882004499707e-05, + "loss": 0.9709, + "num_tokens": 5500051559.0, + "step": 1316 + }, + { + "epoch": 0.15650623885918002, + "grad_norm": 0.794135170459319, + "learning_rate": 1.9890612958479446e-05, + "loss": 0.9941, + "num_tokens": 5504218642.0, + "step": 1317 + }, + { + "epoch": 0.1566250742721331, + "grad_norm": 0.7005550564487227, + "learning_rate": 1.9890343583209722e-05, + "loss": 0.9524, + "num_tokens": 5508386173.0, + "step": 1318 + }, + { + "epoch": 0.15674390968508617, + "grad_norm": 0.725262472205454, + "learning_rate": 1.989007387870052e-05, + "loss": 0.9813, + "num_tokens": 5512576853.0, + "step": 1319 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.6173837457751932, + "learning_rate": 1.9889803844961825e-05, + "loss": 0.978, + "num_tokens": 5516766325.0, + "step": 1320 + }, + { + "epoch": 0.15698158051099229, + "grad_norm": 0.6638700920422046, + "learning_rate": 1.9889533482003635e-05, + "loss": 1.0006, + "num_tokens": 5520953748.0, + "step": 1321 + }, + { + "epoch": 0.15710041592394533, + "grad_norm": 0.6926980701401081, + "learning_rate": 1.9889262789835966e-05, + "loss": 0.9365, + "num_tokens": 5525133177.0, + "step": 1322 + }, + { + "epoch": 0.1572192513368984, + "grad_norm": 0.6990802003007857, + "learning_rate": 1.9888991768468836e-05, + "loss": 0.9418, + "num_tokens": 5529289357.0, + "step": 1323 + }, + { + "epoch": 0.15733808674985145, + "grad_norm": 0.6981190722916311, + "learning_rate": 1.9888720417912293e-05, + "loss": 0.914, + "num_tokens": 5533479459.0, + "step": 1324 + }, + { + "epoch": 0.15745692216280452, + "grad_norm": 0.7531428359587513, + "learning_rate": 1.9888448738176375e-05, + "loss": 0.9396, + "num_tokens": 5537630304.0, + "step": 1325 + }, + { + "epoch": 0.15757575757575756, + "grad_norm": 0.7163728770164078, + "learning_rate": 1.988817672927115e-05, + "loss": 0.9085, + "num_tokens": 5541793995.0, + "step": 1326 + }, + { + "epoch": 0.15769459298871064, + "grad_norm": 0.6924014053507208, + "learning_rate": 1.9887904391206684e-05, + "loss": 0.9447, + "num_tokens": 5545982246.0, + "step": 1327 + }, + { + "epoch": 0.1578134284016637, + "grad_norm": 0.6162963441384326, + "learning_rate": 1.9887631723993074e-05, + "loss": 0.9175, + "num_tokens": 5550172188.0, + "step": 1328 + }, + { + "epoch": 0.15793226381461675, + "grad_norm": 0.7630017010198878, + "learning_rate": 1.9887358727640408e-05, + "loss": 0.984, + "num_tokens": 5554361828.0, + "step": 1329 + }, + { + "epoch": 0.15805109922756982, + "grad_norm": 0.7216002465543913, + "learning_rate": 1.98870854021588e-05, + "loss": 1.0011, + "num_tokens": 5558540951.0, + "step": 1330 + }, + { + "epoch": 0.15816993464052287, + "grad_norm": 0.6952760690764321, + "learning_rate": 1.988681174755837e-05, + "loss": 0.9216, + "num_tokens": 5562729791.0, + "step": 1331 + }, + { + "epoch": 0.15828877005347594, + "grad_norm": 0.7749966649994999, + "learning_rate": 1.9886537763849255e-05, + "loss": 0.9858, + "num_tokens": 5566908254.0, + "step": 1332 + }, + { + "epoch": 0.15840760546642899, + "grad_norm": 0.6157235380818543, + "learning_rate": 1.9886263451041604e-05, + "loss": 0.9546, + "num_tokens": 5571096364.0, + "step": 1333 + }, + { + "epoch": 0.15852644087938206, + "grad_norm": 0.676660420905748, + "learning_rate": 1.9885988809145566e-05, + "loss": 0.9376, + "num_tokens": 5575255661.0, + "step": 1334 + }, + { + "epoch": 0.1586452762923351, + "grad_norm": 0.7268018620380446, + "learning_rate": 1.9885713838171323e-05, + "loss": 0.9413, + "num_tokens": 5579445152.0, + "step": 1335 + }, + { + "epoch": 0.15876411170528817, + "grad_norm": 0.6008999036212207, + "learning_rate": 1.9885438538129052e-05, + "loss": 0.9564, + "num_tokens": 5583633871.0, + "step": 1336 + }, + { + "epoch": 0.15888294711824125, + "grad_norm": 0.8554764983113571, + "learning_rate": 1.988516290902895e-05, + "loss": 0.9539, + "num_tokens": 5587822116.0, + "step": 1337 + }, + { + "epoch": 0.1590017825311943, + "grad_norm": 0.6759350147800242, + "learning_rate": 1.988488695088122e-05, + "loss": 0.9601, + "num_tokens": 5592011705.0, + "step": 1338 + }, + { + "epoch": 0.15912061794414736, + "grad_norm": 0.7780592118689657, + "learning_rate": 1.988461066369609e-05, + "loss": 0.9729, + "num_tokens": 5596200759.0, + "step": 1339 + }, + { + "epoch": 0.1592394533571004, + "grad_norm": 0.5654948962306733, + "learning_rate": 1.9884334047483788e-05, + "loss": 0.9822, + "num_tokens": 5600383608.0, + "step": 1340 + }, + { + "epoch": 0.15935828877005348, + "grad_norm": 0.7160860203030123, + "learning_rate": 1.9884057102254557e-05, + "loss": 0.9509, + "num_tokens": 5604572724.0, + "step": 1341 + }, + { + "epoch": 0.15947712418300652, + "grad_norm": 0.7290760530362107, + "learning_rate": 1.9883779828018654e-05, + "loss": 0.9542, + "num_tokens": 5608762440.0, + "step": 1342 + }, + { + "epoch": 0.1595959595959596, + "grad_norm": 0.695756122239012, + "learning_rate": 1.9883502224786345e-05, + "loss": 0.9546, + "num_tokens": 5612948400.0, + "step": 1343 + }, + { + "epoch": 0.15971479500891267, + "grad_norm": 0.7145505295554827, + "learning_rate": 1.9883224292567913e-05, + "loss": 0.9186, + "num_tokens": 5617137875.0, + "step": 1344 + }, + { + "epoch": 0.1598336304218657, + "grad_norm": 0.5917226695145252, + "learning_rate": 1.9882946031373655e-05, + "loss": 0.9488, + "num_tokens": 5621298917.0, + "step": 1345 + }, + { + "epoch": 0.15995246583481879, + "grad_norm": 0.6453687272476925, + "learning_rate": 1.9882667441213867e-05, + "loss": 0.962, + "num_tokens": 5625474867.0, + "step": 1346 + }, + { + "epoch": 0.16007130124777183, + "grad_norm": 0.7580707357563882, + "learning_rate": 1.988238852209887e-05, + "loss": 0.9343, + "num_tokens": 5629662840.0, + "step": 1347 + }, + { + "epoch": 0.1601901366607249, + "grad_norm": 0.6790271180588777, + "learning_rate": 1.9882109274038996e-05, + "loss": 0.9648, + "num_tokens": 5633850813.0, + "step": 1348 + }, + { + "epoch": 0.16030897207367795, + "grad_norm": 0.7422856575149268, + "learning_rate": 1.9881829697044584e-05, + "loss": 0.9842, + "num_tokens": 5638011227.0, + "step": 1349 + }, + { + "epoch": 0.16042780748663102, + "grad_norm": 0.7159206838692007, + "learning_rate": 1.9881549791125988e-05, + "loss": 0.9896, + "num_tokens": 5642184363.0, + "step": 1350 + }, + { + "epoch": 0.16054664289958406, + "grad_norm": 0.6938882782383566, + "learning_rate": 1.9881269556293575e-05, + "loss": 0.9507, + "num_tokens": 5646344176.0, + "step": 1351 + }, + { + "epoch": 0.16066547831253714, + "grad_norm": 0.7615989111767872, + "learning_rate": 1.988098899255772e-05, + "loss": 0.9342, + "num_tokens": 5650534799.0, + "step": 1352 + }, + { + "epoch": 0.1607843137254902, + "grad_norm": 0.6748555684098909, + "learning_rate": 1.9880708099928816e-05, + "loss": 0.9273, + "num_tokens": 5654707346.0, + "step": 1353 + }, + { + "epoch": 0.16090314913844325, + "grad_norm": 0.7101851148646104, + "learning_rate": 1.9880426878417264e-05, + "loss": 0.9106, + "num_tokens": 5658873318.0, + "step": 1354 + }, + { + "epoch": 0.16102198455139632, + "grad_norm": 0.6822766180834201, + "learning_rate": 1.988014532803348e-05, + "loss": 0.9197, + "num_tokens": 5663049067.0, + "step": 1355 + }, + { + "epoch": 0.16114081996434937, + "grad_norm": 0.6288284231435359, + "learning_rate": 1.9879863448787893e-05, + "loss": 0.9549, + "num_tokens": 5667238261.0, + "step": 1356 + }, + { + "epoch": 0.16125965537730244, + "grad_norm": 0.7540690836011492, + "learning_rate": 1.9879581240690934e-05, + "loss": 0.9051, + "num_tokens": 5671428258.0, + "step": 1357 + }, + { + "epoch": 0.16137849079025549, + "grad_norm": 0.6574380484627305, + "learning_rate": 1.9879298703753064e-05, + "loss": 0.9524, + "num_tokens": 5675617962.0, + "step": 1358 + }, + { + "epoch": 0.16149732620320856, + "grad_norm": 0.7682967796255087, + "learning_rate": 1.987901583798474e-05, + "loss": 0.9131, + "num_tokens": 5679746499.0, + "step": 1359 + }, + { + "epoch": 0.16161616161616163, + "grad_norm": 0.5944097952250088, + "learning_rate": 1.9878732643396436e-05, + "loss": 0.8964, + "num_tokens": 5683909193.0, + "step": 1360 + }, + { + "epoch": 0.16173499702911467, + "grad_norm": 0.6977378425999072, + "learning_rate": 1.9878449119998643e-05, + "loss": 0.9562, + "num_tokens": 5688098581.0, + "step": 1361 + }, + { + "epoch": 0.16185383244206775, + "grad_norm": 0.7431042009790748, + "learning_rate": 1.9878165267801863e-05, + "loss": 0.9967, + "num_tokens": 5692288401.0, + "step": 1362 + }, + { + "epoch": 0.1619726678550208, + "grad_norm": 0.5892098519624547, + "learning_rate": 1.9877881086816608e-05, + "loss": 0.9698, + "num_tokens": 5696476520.0, + "step": 1363 + }, + { + "epoch": 0.16209150326797386, + "grad_norm": 0.7910276420035659, + "learning_rate": 1.9877596577053393e-05, + "loss": 0.8945, + "num_tokens": 5700665668.0, + "step": 1364 + }, + { + "epoch": 0.1622103386809269, + "grad_norm": 0.5693921832100389, + "learning_rate": 1.9877311738522762e-05, + "loss": 0.9406, + "num_tokens": 5704853452.0, + "step": 1365 + }, + { + "epoch": 0.16232917409387998, + "grad_norm": 0.8886355938786527, + "learning_rate": 1.9877026571235263e-05, + "loss": 0.9505, + "num_tokens": 5709041726.0, + "step": 1366 + }, + { + "epoch": 0.16244800950683302, + "grad_norm": 0.6134100683331729, + "learning_rate": 1.9876741075201456e-05, + "loss": 0.9312, + "num_tokens": 5713206409.0, + "step": 1367 + }, + { + "epoch": 0.1625668449197861, + "grad_norm": 0.8440360554122291, + "learning_rate": 1.9876455250431917e-05, + "loss": 0.981, + "num_tokens": 5717395897.0, + "step": 1368 + }, + { + "epoch": 0.16268568033273917, + "grad_norm": 0.6572836785079795, + "learning_rate": 1.9876169096937225e-05, + "loss": 0.9743, + "num_tokens": 5721584646.0, + "step": 1369 + }, + { + "epoch": 0.1628045157456922, + "grad_norm": 0.6682186458606586, + "learning_rate": 1.9875882614727983e-05, + "loss": 0.9593, + "num_tokens": 5725767208.0, + "step": 1370 + }, + { + "epoch": 0.16292335115864529, + "grad_norm": 0.671059326779075, + "learning_rate": 1.9875595803814795e-05, + "loss": 0.9356, + "num_tokens": 5729958680.0, + "step": 1371 + }, + { + "epoch": 0.16304218657159833, + "grad_norm": 0.7862916556903364, + "learning_rate": 1.9875308664208286e-05, + "loss": 0.9296, + "num_tokens": 5734148261.0, + "step": 1372 + }, + { + "epoch": 0.1631610219845514, + "grad_norm": 0.6894847757086932, + "learning_rate": 1.987502119591909e-05, + "loss": 0.9528, + "num_tokens": 5738314793.0, + "step": 1373 + }, + { + "epoch": 0.16327985739750445, + "grad_norm": 0.6630882600417503, + "learning_rate": 1.987473339895785e-05, + "loss": 0.9834, + "num_tokens": 5742478674.0, + "step": 1374 + }, + { + "epoch": 0.16339869281045752, + "grad_norm": 0.6691943082208418, + "learning_rate": 1.987444527333523e-05, + "loss": 0.9518, + "num_tokens": 5746614982.0, + "step": 1375 + }, + { + "epoch": 0.16351752822341056, + "grad_norm": 0.7352662938865977, + "learning_rate": 1.9874156819061893e-05, + "loss": 0.9845, + "num_tokens": 5750803442.0, + "step": 1376 + }, + { + "epoch": 0.16363636363636364, + "grad_norm": 0.6099870768436008, + "learning_rate": 1.9873868036148526e-05, + "loss": 0.9443, + "num_tokens": 5754988781.0, + "step": 1377 + }, + { + "epoch": 0.1637551990493167, + "grad_norm": 0.8465131312230335, + "learning_rate": 1.987357892460582e-05, + "loss": 0.9632, + "num_tokens": 5759150458.0, + "step": 1378 + }, + { + "epoch": 0.16387403446226975, + "grad_norm": 0.5613274043412613, + "learning_rate": 1.9873289484444483e-05, + "loss": 0.9403, + "num_tokens": 5763340661.0, + "step": 1379 + }, + { + "epoch": 0.16399286987522282, + "grad_norm": 0.7269287580178749, + "learning_rate": 1.9872999715675242e-05, + "loss": 0.9775, + "num_tokens": 5767506248.0, + "step": 1380 + }, + { + "epoch": 0.16411170528817587, + "grad_norm": 0.7034282238026422, + "learning_rate": 1.9872709618308815e-05, + "loss": 0.9444, + "num_tokens": 5771693993.0, + "step": 1381 + }, + { + "epoch": 0.16423054070112894, + "grad_norm": 0.7282608366774027, + "learning_rate": 1.9872419192355953e-05, + "loss": 0.9188, + "num_tokens": 5775855565.0, + "step": 1382 + }, + { + "epoch": 0.16434937611408199, + "grad_norm": 0.625109235799692, + "learning_rate": 1.9872128437827413e-05, + "loss": 0.9282, + "num_tokens": 5780044961.0, + "step": 1383 + }, + { + "epoch": 0.16446821152703506, + "grad_norm": 0.7174980931381676, + "learning_rate": 1.9871837354733956e-05, + "loss": 0.9635, + "num_tokens": 5784234422.0, + "step": 1384 + }, + { + "epoch": 0.16458704693998813, + "grad_norm": 0.7738402757533563, + "learning_rate": 1.9871545943086366e-05, + "loss": 0.9093, + "num_tokens": 5788423566.0, + "step": 1385 + }, + { + "epoch": 0.16470588235294117, + "grad_norm": 0.565326203493776, + "learning_rate": 1.9871254202895438e-05, + "loss": 0.9469, + "num_tokens": 5792612565.0, + "step": 1386 + }, + { + "epoch": 0.16482471776589425, + "grad_norm": 0.7163608563039383, + "learning_rate": 1.9870962134171966e-05, + "loss": 0.9378, + "num_tokens": 5796802150.0, + "step": 1387 + }, + { + "epoch": 0.1649435531788473, + "grad_norm": 0.7084256142929434, + "learning_rate": 1.9870669736926777e-05, + "loss": 0.9745, + "num_tokens": 5800990003.0, + "step": 1388 + }, + { + "epoch": 0.16506238859180036, + "grad_norm": 0.6875999853186051, + "learning_rate": 1.9870377011170697e-05, + "loss": 0.9836, + "num_tokens": 5805178858.0, + "step": 1389 + }, + { + "epoch": 0.1651812240047534, + "grad_norm": 0.7949111228278142, + "learning_rate": 1.987008395691456e-05, + "loss": 0.9498, + "num_tokens": 5809368418.0, + "step": 1390 + }, + { + "epoch": 0.16530005941770648, + "grad_norm": 0.6520134364665127, + "learning_rate": 1.9869790574169222e-05, + "loss": 0.9665, + "num_tokens": 5813557798.0, + "step": 1391 + }, + { + "epoch": 0.16541889483065952, + "grad_norm": 0.7707584028863232, + "learning_rate": 1.9869496862945558e-05, + "loss": 0.9485, + "num_tokens": 5817747189.0, + "step": 1392 + }, + { + "epoch": 0.1655377302436126, + "grad_norm": 0.6259540332641057, + "learning_rate": 1.986920282325443e-05, + "loss": 0.9857, + "num_tokens": 5821936198.0, + "step": 1393 + }, + { + "epoch": 0.16565656565656567, + "grad_norm": 0.7260718485077354, + "learning_rate": 1.9868908455106737e-05, + "loss": 0.9128, + "num_tokens": 5826124733.0, + "step": 1394 + }, + { + "epoch": 0.1657754010695187, + "grad_norm": 0.7393271102592079, + "learning_rate": 1.9868613758513376e-05, + "loss": 0.9437, + "num_tokens": 5830314388.0, + "step": 1395 + }, + { + "epoch": 0.16589423648247179, + "grad_norm": 0.5756867843341175, + "learning_rate": 1.986831873348526e-05, + "loss": 0.9114, + "num_tokens": 5834505488.0, + "step": 1396 + }, + { + "epoch": 0.16601307189542483, + "grad_norm": 0.83989971378104, + "learning_rate": 1.9868023380033322e-05, + "loss": 0.9378, + "num_tokens": 5838661888.0, + "step": 1397 + }, + { + "epoch": 0.1661319073083779, + "grad_norm": 0.66706689325593, + "learning_rate": 1.986772769816849e-05, + "loss": 0.9297, + "num_tokens": 5842850775.0, + "step": 1398 + }, + { + "epoch": 0.16625074272133095, + "grad_norm": 0.6803962863870514, + "learning_rate": 1.9867431687901725e-05, + "loss": 0.9948, + "num_tokens": 5847040341.0, + "step": 1399 + }, + { + "epoch": 0.16636957813428402, + "grad_norm": 0.5952761623546051, + "learning_rate": 1.9867135349243977e-05, + "loss": 0.9605, + "num_tokens": 5851218866.0, + "step": 1400 + }, + { + "epoch": 0.16648841354723706, + "grad_norm": 0.7315537200398524, + "learning_rate": 1.9866838682206226e-05, + "loss": 0.9592, + "num_tokens": 5855407023.0, + "step": 1401 + }, + { + "epoch": 0.16660724896019014, + "grad_norm": 0.6676511702854946, + "learning_rate": 1.9866541686799463e-05, + "loss": 0.9725, + "num_tokens": 5859595650.0, + "step": 1402 + }, + { + "epoch": 0.1667260843731432, + "grad_norm": 0.5844346778199322, + "learning_rate": 1.9866244363034682e-05, + "loss": 0.9045, + "num_tokens": 5863785786.0, + "step": 1403 + }, + { + "epoch": 0.16684491978609625, + "grad_norm": 0.7404202256466521, + "learning_rate": 1.9865946710922892e-05, + "loss": 0.9481, + "num_tokens": 5867935465.0, + "step": 1404 + }, + { + "epoch": 0.16696375519904932, + "grad_norm": 0.7632479554293653, + "learning_rate": 1.986564873047512e-05, + "loss": 0.9397, + "num_tokens": 5872125170.0, + "step": 1405 + }, + { + "epoch": 0.16708259061200237, + "grad_norm": 0.5818684709471496, + "learning_rate": 1.98653504217024e-05, + "loss": 0.9344, + "num_tokens": 5876315108.0, + "step": 1406 + }, + { + "epoch": 0.16720142602495544, + "grad_norm": 0.7886050625697892, + "learning_rate": 1.9865051784615784e-05, + "loss": 0.927, + "num_tokens": 5880503652.0, + "step": 1407 + }, + { + "epoch": 0.16732026143790849, + "grad_norm": 0.6295385204446025, + "learning_rate": 1.986475281922632e-05, + "loss": 0.9648, + "num_tokens": 5884679585.0, + "step": 1408 + }, + { + "epoch": 0.16743909685086156, + "grad_norm": 0.618715663188727, + "learning_rate": 1.9864453525545093e-05, + "loss": 0.9289, + "num_tokens": 5888848931.0, + "step": 1409 + }, + { + "epoch": 0.16755793226381463, + "grad_norm": 0.6798055658211626, + "learning_rate": 1.9864153903583178e-05, + "loss": 0.9311, + "num_tokens": 5893011635.0, + "step": 1410 + }, + { + "epoch": 0.16767676767676767, + "grad_norm": 0.6738031058408891, + "learning_rate": 1.9863853953351672e-05, + "loss": 0.976, + "num_tokens": 5897200598.0, + "step": 1411 + }, + { + "epoch": 0.16779560308972075, + "grad_norm": 0.7983542665741521, + "learning_rate": 1.9863553674861685e-05, + "loss": 0.9218, + "num_tokens": 5901360202.0, + "step": 1412 + }, + { + "epoch": 0.1679144385026738, + "grad_norm": 0.5822892244785325, + "learning_rate": 1.986325306812434e-05, + "loss": 0.9486, + "num_tokens": 5905548287.0, + "step": 1413 + }, + { + "epoch": 0.16803327391562686, + "grad_norm": 0.7790671656525237, + "learning_rate": 1.9862952133150765e-05, + "loss": 0.9528, + "num_tokens": 5909705490.0, + "step": 1414 + }, + { + "epoch": 0.1681521093285799, + "grad_norm": 0.5870501837421773, + "learning_rate": 1.9862650869952102e-05, + "loss": 0.9317, + "num_tokens": 5913854123.0, + "step": 1415 + }, + { + "epoch": 0.16827094474153298, + "grad_norm": 0.8720110176836353, + "learning_rate": 1.9862349278539516e-05, + "loss": 0.9485, + "num_tokens": 5918042037.0, + "step": 1416 + }, + { + "epoch": 0.16838978015448602, + "grad_norm": 0.5609254398094025, + "learning_rate": 1.986204735892417e-05, + "loss": 0.9414, + "num_tokens": 5922232354.0, + "step": 1417 + }, + { + "epoch": 0.1685086155674391, + "grad_norm": 0.8527077719534097, + "learning_rate": 1.986174511111725e-05, + "loss": 0.9484, + "num_tokens": 5926398999.0, + "step": 1418 + }, + { + "epoch": 0.16862745098039217, + "grad_norm": 0.6390814163269638, + "learning_rate": 1.9861442535129945e-05, + "loss": 0.9295, + "num_tokens": 5930551086.0, + "step": 1419 + }, + { + "epoch": 0.1687462863933452, + "grad_norm": 0.8925922501139014, + "learning_rate": 1.9861139630973462e-05, + "loss": 0.9466, + "num_tokens": 5934729020.0, + "step": 1420 + }, + { + "epoch": 0.16886512180629829, + "grad_norm": 0.6646049555394717, + "learning_rate": 1.9860836398659014e-05, + "loss": 0.9663, + "num_tokens": 5938918165.0, + "step": 1421 + }, + { + "epoch": 0.16898395721925133, + "grad_norm": 0.6715186303299795, + "learning_rate": 1.986053283819784e-05, + "loss": 0.9554, + "num_tokens": 5943106625.0, + "step": 1422 + }, + { + "epoch": 0.1691027926322044, + "grad_norm": 0.6262979842735876, + "learning_rate": 1.9860228949601176e-05, + "loss": 0.9552, + "num_tokens": 5947296750.0, + "step": 1423 + }, + { + "epoch": 0.16922162804515745, + "grad_norm": 0.8497268737985747, + "learning_rate": 1.9859924732880277e-05, + "loss": 0.9652, + "num_tokens": 5951444641.0, + "step": 1424 + }, + { + "epoch": 0.16934046345811052, + "grad_norm": 0.7474840552912279, + "learning_rate": 1.9859620188046407e-05, + "loss": 0.9751, + "num_tokens": 5955635128.0, + "step": 1425 + }, + { + "epoch": 0.1694592988710636, + "grad_norm": 0.5833579836716078, + "learning_rate": 1.9859315315110848e-05, + "loss": 0.9388, + "num_tokens": 5959825292.0, + "step": 1426 + }, + { + "epoch": 0.16957813428401664, + "grad_norm": 0.8313479599737544, + "learning_rate": 1.985901011408489e-05, + "loss": 0.9554, + "num_tokens": 5964015913.0, + "step": 1427 + }, + { + "epoch": 0.1696969696969697, + "grad_norm": 0.6180390513922343, + "learning_rate": 1.985870458497983e-05, + "loss": 0.9856, + "num_tokens": 5968186914.0, + "step": 1428 + }, + { + "epoch": 0.16981580510992275, + "grad_norm": 0.6965788691336946, + "learning_rate": 1.9858398727806992e-05, + "loss": 0.9532, + "num_tokens": 5972351112.0, + "step": 1429 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 0.7376694823240306, + "learning_rate": 1.9858092542577695e-05, + "loss": 0.9507, + "num_tokens": 5976541704.0, + "step": 1430 + }, + { + "epoch": 0.17005347593582887, + "grad_norm": 0.7606283095236284, + "learning_rate": 1.985778602930328e-05, + "loss": 0.9291, + "num_tokens": 5980730128.0, + "step": 1431 + }, + { + "epoch": 0.17017231134878194, + "grad_norm": 0.6904951085208683, + "learning_rate": 1.9857479187995104e-05, + "loss": 0.9186, + "num_tokens": 5984901515.0, + "step": 1432 + }, + { + "epoch": 0.17029114676173498, + "grad_norm": 0.6892959645277822, + "learning_rate": 1.9857172018664527e-05, + "loss": 0.9399, + "num_tokens": 5989062298.0, + "step": 1433 + }, + { + "epoch": 0.17040998217468806, + "grad_norm": 0.6965539802231385, + "learning_rate": 1.9856864521322915e-05, + "loss": 0.9512, + "num_tokens": 5993231290.0, + "step": 1434 + }, + { + "epoch": 0.17052881758764113, + "grad_norm": 0.6395460093132918, + "learning_rate": 1.985655669598167e-05, + "loss": 0.9117, + "num_tokens": 5997419944.0, + "step": 1435 + }, + { + "epoch": 0.17064765300059417, + "grad_norm": 0.6687546753315015, + "learning_rate": 1.9856248542652186e-05, + "loss": 0.9204, + "num_tokens": 6001567157.0, + "step": 1436 + }, + { + "epoch": 0.17076648841354725, + "grad_norm": 0.7781359301803977, + "learning_rate": 1.9855940061345873e-05, + "loss": 0.9572, + "num_tokens": 6005755737.0, + "step": 1437 + }, + { + "epoch": 0.1708853238265003, + "grad_norm": 0.6393715476272455, + "learning_rate": 1.985563125207416e-05, + "loss": 0.9525, + "num_tokens": 6009945931.0, + "step": 1438 + }, + { + "epoch": 0.17100415923945336, + "grad_norm": 0.8113157550296696, + "learning_rate": 1.9855322114848475e-05, + "loss": 0.9743, + "num_tokens": 6014135097.0, + "step": 1439 + }, + { + "epoch": 0.1711229946524064, + "grad_norm": 0.600392832841067, + "learning_rate": 1.9855012649680273e-05, + "loss": 0.9493, + "num_tokens": 6018324334.0, + "step": 1440 + }, + { + "epoch": 0.17124183006535948, + "grad_norm": 0.7865940229801933, + "learning_rate": 1.9854702856581015e-05, + "loss": 0.9473, + "num_tokens": 6022481782.0, + "step": 1441 + }, + { + "epoch": 0.17136066547831252, + "grad_norm": 0.7730848340658193, + "learning_rate": 1.985439273556217e-05, + "loss": 0.9586, + "num_tokens": 6026670681.0, + "step": 1442 + }, + { + "epoch": 0.1714795008912656, + "grad_norm": 0.6221136097374558, + "learning_rate": 1.985408228663523e-05, + "loss": 0.9533, + "num_tokens": 6030859155.0, + "step": 1443 + }, + { + "epoch": 0.17159833630421867, + "grad_norm": 0.7209745848817212, + "learning_rate": 1.9853771509811682e-05, + "loss": 0.9712, + "num_tokens": 6035047503.0, + "step": 1444 + }, + { + "epoch": 0.1717171717171717, + "grad_norm": 0.6311965487327031, + "learning_rate": 1.9853460405103037e-05, + "loss": 0.9646, + "num_tokens": 6039234290.0, + "step": 1445 + }, + { + "epoch": 0.17183600713012478, + "grad_norm": 0.743505690515662, + "learning_rate": 1.9853148972520824e-05, + "loss": 0.9565, + "num_tokens": 6043423029.0, + "step": 1446 + }, + { + "epoch": 0.17195484254307783, + "grad_norm": 0.6460554521607847, + "learning_rate": 1.9852837212076567e-05, + "loss": 0.929, + "num_tokens": 6047611962.0, + "step": 1447 + }, + { + "epoch": 0.1720736779560309, + "grad_norm": 0.7171987594210343, + "learning_rate": 1.985252512378182e-05, + "loss": 0.9942, + "num_tokens": 6051785019.0, + "step": 1448 + }, + { + "epoch": 0.17219251336898395, + "grad_norm": 0.7926211149565312, + "learning_rate": 1.9852212707648135e-05, + "loss": 0.9342, + "num_tokens": 6055975035.0, + "step": 1449 + }, + { + "epoch": 0.17231134878193702, + "grad_norm": 0.6615978398126675, + "learning_rate": 1.9851899963687083e-05, + "loss": 0.8863, + "num_tokens": 6060163355.0, + "step": 1450 + }, + { + "epoch": 0.1724301841948901, + "grad_norm": 0.6744678023821996, + "learning_rate": 1.9851586891910243e-05, + "loss": 0.9613, + "num_tokens": 6064351838.0, + "step": 1451 + }, + { + "epoch": 0.17254901960784313, + "grad_norm": 0.6615649641108615, + "learning_rate": 1.9851273492329217e-05, + "loss": 0.943, + "num_tokens": 6068516864.0, + "step": 1452 + }, + { + "epoch": 0.1726678550207962, + "grad_norm": 0.7278726985116454, + "learning_rate": 1.9850959764955605e-05, + "loss": 0.9351, + "num_tokens": 6072673708.0, + "step": 1453 + }, + { + "epoch": 0.17278669043374925, + "grad_norm": 0.6635364742469689, + "learning_rate": 1.9850645709801032e-05, + "loss": 0.9161, + "num_tokens": 6076846005.0, + "step": 1454 + }, + { + "epoch": 0.17290552584670232, + "grad_norm": 0.6332241854047937, + "learning_rate": 1.9850331326877116e-05, + "loss": 0.9534, + "num_tokens": 6081028015.0, + "step": 1455 + }, + { + "epoch": 0.17302436125965537, + "grad_norm": 0.7274092348280393, + "learning_rate": 1.985001661619551e-05, + "loss": 1.0107, + "num_tokens": 6085212328.0, + "step": 1456 + }, + { + "epoch": 0.17314319667260844, + "grad_norm": 0.6280268832955447, + "learning_rate": 1.984970157776787e-05, + "loss": 0.9829, + "num_tokens": 6089402076.0, + "step": 1457 + }, + { + "epoch": 0.17326203208556148, + "grad_norm": 0.6905281560924604, + "learning_rate": 1.984938621160585e-05, + "loss": 0.9929, + "num_tokens": 6093513326.0, + "step": 1458 + }, + { + "epoch": 0.17338086749851456, + "grad_norm": 0.6170226636582403, + "learning_rate": 1.9849070517721144e-05, + "loss": 0.9047, + "num_tokens": 6097700768.0, + "step": 1459 + }, + { + "epoch": 0.17349970291146763, + "grad_norm": 0.7728995876524793, + "learning_rate": 1.984875449612544e-05, + "loss": 0.9349, + "num_tokens": 6101882182.0, + "step": 1460 + }, + { + "epoch": 0.17361853832442067, + "grad_norm": 0.6450266642609324, + "learning_rate": 1.9848438146830432e-05, + "loss": 0.9315, + "num_tokens": 6106071094.0, + "step": 1461 + }, + { + "epoch": 0.17373737373737375, + "grad_norm": 0.7038070985235121, + "learning_rate": 1.984812146984785e-05, + "loss": 0.9548, + "num_tokens": 6110256076.0, + "step": 1462 + }, + { + "epoch": 0.1738562091503268, + "grad_norm": 0.6262209115112424, + "learning_rate": 1.984780446518941e-05, + "loss": 0.9217, + "num_tokens": 6114443700.0, + "step": 1463 + }, + { + "epoch": 0.17397504456327986, + "grad_norm": 0.6287358051646317, + "learning_rate": 1.9847487132866852e-05, + "loss": 0.9397, + "num_tokens": 6118633027.0, + "step": 1464 + }, + { + "epoch": 0.1740938799762329, + "grad_norm": 0.7287755579069671, + "learning_rate": 1.9847169472891936e-05, + "loss": 0.9963, + "num_tokens": 6122807647.0, + "step": 1465 + }, + { + "epoch": 0.17421271538918598, + "grad_norm": 0.7456660409158923, + "learning_rate": 1.9846851485276418e-05, + "loss": 0.9191, + "num_tokens": 6126996789.0, + "step": 1466 + }, + { + "epoch": 0.17433155080213902, + "grad_norm": 0.6796508651438932, + "learning_rate": 1.9846533170032083e-05, + "loss": 0.9128, + "num_tokens": 6131168684.0, + "step": 1467 + }, + { + "epoch": 0.1744503862150921, + "grad_norm": 0.6282174479821696, + "learning_rate": 1.984621452717071e-05, + "loss": 0.9703, + "num_tokens": 6135356529.0, + "step": 1468 + }, + { + "epoch": 0.17456922162804517, + "grad_norm": 0.7325658068595071, + "learning_rate": 1.9845895556704106e-05, + "loss": 0.9752, + "num_tokens": 6139546692.0, + "step": 1469 + }, + { + "epoch": 0.1746880570409982, + "grad_norm": 0.5683968057682863, + "learning_rate": 1.9845576258644078e-05, + "loss": 0.9556, + "num_tokens": 6143737262.0, + "step": 1470 + }, + { + "epoch": 0.17480689245395128, + "grad_norm": 0.7148508636274631, + "learning_rate": 1.984525663300245e-05, + "loss": 0.9597, + "num_tokens": 6147905547.0, + "step": 1471 + }, + { + "epoch": 0.17492572786690433, + "grad_norm": 0.7225224492098637, + "learning_rate": 1.9844936679791068e-05, + "loss": 0.916, + "num_tokens": 6152061881.0, + "step": 1472 + }, + { + "epoch": 0.1750445632798574, + "grad_norm": 0.6309555926708461, + "learning_rate": 1.9844616399021775e-05, + "loss": 0.9337, + "num_tokens": 6156250976.0, + "step": 1473 + }, + { + "epoch": 0.17516339869281045, + "grad_norm": 0.7538889500800845, + "learning_rate": 1.984429579070643e-05, + "loss": 0.9966, + "num_tokens": 6160439337.0, + "step": 1474 + }, + { + "epoch": 0.17528223410576352, + "grad_norm": 0.558645051880994, + "learning_rate": 1.984397485485691e-05, + "loss": 0.9372, + "num_tokens": 6164627695.0, + "step": 1475 + }, + { + "epoch": 0.1754010695187166, + "grad_norm": 0.7037111459849743, + "learning_rate": 1.98436535914851e-05, + "loss": 0.924, + "num_tokens": 6168816684.0, + "step": 1476 + }, + { + "epoch": 0.17551990493166963, + "grad_norm": 0.7201471807871871, + "learning_rate": 1.9843332000602892e-05, + "loss": 0.9631, + "num_tokens": 6173005869.0, + "step": 1477 + }, + { + "epoch": 0.1756387403446227, + "grad_norm": 0.5926833696891454, + "learning_rate": 1.9843010082222203e-05, + "loss": 0.9295, + "num_tokens": 6177195998.0, + "step": 1478 + }, + { + "epoch": 0.17575757575757575, + "grad_norm": 0.7683567195052441, + "learning_rate": 1.9842687836354955e-05, + "loss": 0.9418, + "num_tokens": 6181360018.0, + "step": 1479 + }, + { + "epoch": 0.17587641117052882, + "grad_norm": 0.6266340613512744, + "learning_rate": 1.9842365263013074e-05, + "loss": 0.9717, + "num_tokens": 6185544061.0, + "step": 1480 + }, + { + "epoch": 0.17599524658348187, + "grad_norm": 0.6414937931410898, + "learning_rate": 1.9842042362208512e-05, + "loss": 0.9392, + "num_tokens": 6189711637.0, + "step": 1481 + }, + { + "epoch": 0.17611408199643494, + "grad_norm": 0.5599631273700042, + "learning_rate": 1.9841719133953225e-05, + "loss": 0.9341, + "num_tokens": 6193901621.0, + "step": 1482 + }, + { + "epoch": 0.17623291740938798, + "grad_norm": 0.7113328090137737, + "learning_rate": 1.9841395578259187e-05, + "loss": 0.9479, + "num_tokens": 6198090037.0, + "step": 1483 + }, + { + "epoch": 0.17635175282234106, + "grad_norm": 0.6293355233731581, + "learning_rate": 1.9841071695138374e-05, + "loss": 0.9277, + "num_tokens": 6202254247.0, + "step": 1484 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.7390059332079, + "learning_rate": 1.9840747484602782e-05, + "loss": 0.9531, + "num_tokens": 6206442833.0, + "step": 1485 + }, + { + "epoch": 0.17658942364824717, + "grad_norm": 0.6742776327661208, + "learning_rate": 1.9840422946664423e-05, + "loss": 0.9646, + "num_tokens": 6210631134.0, + "step": 1486 + }, + { + "epoch": 0.17670825906120025, + "grad_norm": 0.6864140065333102, + "learning_rate": 1.9840098081335313e-05, + "loss": 0.9369, + "num_tokens": 6214819402.0, + "step": 1487 + }, + { + "epoch": 0.1768270944741533, + "grad_norm": 0.6436663659137144, + "learning_rate": 1.983977288862748e-05, + "loss": 0.9444, + "num_tokens": 6219007533.0, + "step": 1488 + }, + { + "epoch": 0.17694592988710636, + "grad_norm": 0.7199010138473665, + "learning_rate": 1.983944736855297e-05, + "loss": 0.9376, + "num_tokens": 6223198485.0, + "step": 1489 + }, + { + "epoch": 0.1770647653000594, + "grad_norm": 0.6095462933729564, + "learning_rate": 1.9839121521123833e-05, + "loss": 0.9516, + "num_tokens": 6227365098.0, + "step": 1490 + }, + { + "epoch": 0.17718360071301248, + "grad_norm": 0.6956823413387956, + "learning_rate": 1.9838795346352145e-05, + "loss": 0.9249, + "num_tokens": 6231554419.0, + "step": 1491 + }, + { + "epoch": 0.17730243612596555, + "grad_norm": 0.5964343970653806, + "learning_rate": 1.983846884424998e-05, + "loss": 0.9624, + "num_tokens": 6235742533.0, + "step": 1492 + }, + { + "epoch": 0.1774212715389186, + "grad_norm": 0.929421803257819, + "learning_rate": 1.9838142014829427e-05, + "loss": 0.9628, + "num_tokens": 6239932268.0, + "step": 1493 + }, + { + "epoch": 0.17754010695187167, + "grad_norm": 0.5074948814856413, + "learning_rate": 1.9837814858102594e-05, + "loss": 0.9307, + "num_tokens": 6244120959.0, + "step": 1494 + }, + { + "epoch": 0.1776589423648247, + "grad_norm": 0.9357679001129185, + "learning_rate": 1.9837487374081597e-05, + "loss": 0.9495, + "num_tokens": 6248300316.0, + "step": 1495 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.6532771700749945, + "learning_rate": 1.983715956277856e-05, + "loss": 0.9387, + "num_tokens": 6252471519.0, + "step": 1496 + }, + { + "epoch": 0.17789661319073083, + "grad_norm": 0.7966529560176747, + "learning_rate": 1.983683142420563e-05, + "loss": 0.9553, + "num_tokens": 6256635512.0, + "step": 1497 + }, + { + "epoch": 0.1780154486036839, + "grad_norm": 0.7979776881231047, + "learning_rate": 1.983650295837495e-05, + "loss": 0.8993, + "num_tokens": 6260825697.0, + "step": 1498 + }, + { + "epoch": 0.17813428401663695, + "grad_norm": 0.6155386602456341, + "learning_rate": 1.983617416529869e-05, + "loss": 0.9275, + "num_tokens": 6265014294.0, + "step": 1499 + }, + { + "epoch": 0.17825311942959002, + "grad_norm": 1.2001177495638045, + "learning_rate": 1.9835845044989024e-05, + "loss": 0.9046, + "num_tokens": 6269203255.0, + "step": 1500 + }, + { + "epoch": 0.1783719548425431, + "grad_norm": 0.7704761380005374, + "learning_rate": 1.9835515597458143e-05, + "loss": 0.9476, + "num_tokens": 6273392594.0, + "step": 1501 + }, + { + "epoch": 0.17849079025549613, + "grad_norm": 0.9565361297495103, + "learning_rate": 1.9835185822718245e-05, + "loss": 0.9648, + "num_tokens": 6277582351.0, + "step": 1502 + }, + { + "epoch": 0.1786096256684492, + "grad_norm": 0.7490107951485991, + "learning_rate": 1.9834855720781544e-05, + "loss": 0.9859, + "num_tokens": 6281770178.0, + "step": 1503 + }, + { + "epoch": 0.17872846108140225, + "grad_norm": 0.8231197537154106, + "learning_rate": 1.9834525291660264e-05, + "loss": 0.9729, + "num_tokens": 6285934238.0, + "step": 1504 + }, + { + "epoch": 0.17884729649435532, + "grad_norm": 0.7016950628897677, + "learning_rate": 1.983419453536664e-05, + "loss": 0.9579, + "num_tokens": 6290124560.0, + "step": 1505 + }, + { + "epoch": 0.17896613190730837, + "grad_norm": 0.7251103411580923, + "learning_rate": 1.9833863451912923e-05, + "loss": 0.9408, + "num_tokens": 6294314741.0, + "step": 1506 + }, + { + "epoch": 0.17908496732026144, + "grad_norm": 0.7488940968927701, + "learning_rate": 1.983353204131138e-05, + "loss": 0.9205, + "num_tokens": 6298504650.0, + "step": 1507 + }, + { + "epoch": 0.17920380273321448, + "grad_norm": 0.6161894039655998, + "learning_rate": 1.9833200303574272e-05, + "loss": 0.9435, + "num_tokens": 6302666121.0, + "step": 1508 + }, + { + "epoch": 0.17932263814616756, + "grad_norm": 0.806099277025628, + "learning_rate": 1.9832868238713895e-05, + "loss": 0.9413, + "num_tokens": 6306855260.0, + "step": 1509 + }, + { + "epoch": 0.17944147355912063, + "grad_norm": 0.6775720744449605, + "learning_rate": 1.9832535846742543e-05, + "loss": 0.9575, + "num_tokens": 6311039849.0, + "step": 1510 + }, + { + "epoch": 0.17956030897207367, + "grad_norm": 0.7598347171299322, + "learning_rate": 1.9832203127672522e-05, + "loss": 0.9114, + "num_tokens": 6315227132.0, + "step": 1511 + }, + { + "epoch": 0.17967914438502675, + "grad_norm": 0.6017170956982968, + "learning_rate": 1.9831870081516157e-05, + "loss": 0.9162, + "num_tokens": 6319392346.0, + "step": 1512 + }, + { + "epoch": 0.1797979797979798, + "grad_norm": 0.596618413472203, + "learning_rate": 1.9831536708285783e-05, + "loss": 0.9266, + "num_tokens": 6323582256.0, + "step": 1513 + }, + { + "epoch": 0.17991681521093286, + "grad_norm": 0.6599294159622429, + "learning_rate": 1.9831203007993744e-05, + "loss": 0.9409, + "num_tokens": 6327772025.0, + "step": 1514 + }, + { + "epoch": 0.1800356506238859, + "grad_norm": 0.6062257145583486, + "learning_rate": 1.98308689806524e-05, + "loss": 0.9307, + "num_tokens": 6331961955.0, + "step": 1515 + }, + { + "epoch": 0.18015448603683898, + "grad_norm": 0.7268029462050685, + "learning_rate": 1.9830534626274117e-05, + "loss": 0.932, + "num_tokens": 6336133026.0, + "step": 1516 + }, + { + "epoch": 0.18027332144979205, + "grad_norm": 0.6243754665962993, + "learning_rate": 1.9830199944871286e-05, + "loss": 0.9436, + "num_tokens": 6340321442.0, + "step": 1517 + }, + { + "epoch": 0.1803921568627451, + "grad_norm": 0.7767489164602174, + "learning_rate": 1.982986493645629e-05, + "loss": 0.9882, + "num_tokens": 6344509473.0, + "step": 1518 + }, + { + "epoch": 0.18051099227569817, + "grad_norm": 0.6802363989281534, + "learning_rate": 1.982952960104154e-05, + "loss": 0.9472, + "num_tokens": 6348681983.0, + "step": 1519 + }, + { + "epoch": 0.1806298276886512, + "grad_norm": 0.7317032390794136, + "learning_rate": 1.982919393863946e-05, + "loss": 0.9586, + "num_tokens": 6352849241.0, + "step": 1520 + }, + { + "epoch": 0.18074866310160428, + "grad_norm": 0.6746443156159927, + "learning_rate": 1.9828857949262473e-05, + "loss": 0.9759, + "num_tokens": 6357038529.0, + "step": 1521 + }, + { + "epoch": 0.18086749851455733, + "grad_norm": 0.6938746560418277, + "learning_rate": 1.9828521632923024e-05, + "loss": 0.9674, + "num_tokens": 6361227303.0, + "step": 1522 + }, + { + "epoch": 0.1809863339275104, + "grad_norm": 0.5704295237158271, + "learning_rate": 1.9828184989633574e-05, + "loss": 0.9236, + "num_tokens": 6365417401.0, + "step": 1523 + }, + { + "epoch": 0.18110516934046345, + "grad_norm": 0.7225153830697626, + "learning_rate": 1.982784801940658e-05, + "loss": 0.9196, + "num_tokens": 6369606977.0, + "step": 1524 + }, + { + "epoch": 0.18122400475341652, + "grad_norm": 0.6886804470180631, + "learning_rate": 1.9827510722254532e-05, + "loss": 0.936, + "num_tokens": 6373794075.0, + "step": 1525 + }, + { + "epoch": 0.1813428401663696, + "grad_norm": 0.6581506474735679, + "learning_rate": 1.9827173098189907e-05, + "loss": 0.9565, + "num_tokens": 6377970200.0, + "step": 1526 + }, + { + "epoch": 0.18146167557932263, + "grad_norm": 0.6374903907964832, + "learning_rate": 1.9826835147225226e-05, + "loss": 0.9536, + "num_tokens": 6382159636.0, + "step": 1527 + }, + { + "epoch": 0.1815805109922757, + "grad_norm": 0.7468107839371809, + "learning_rate": 1.982649686937299e-05, + "loss": 0.924, + "num_tokens": 6386347782.0, + "step": 1528 + }, + { + "epoch": 0.18169934640522875, + "grad_norm": 0.6414601541423622, + "learning_rate": 1.9826158264645732e-05, + "loss": 0.9563, + "num_tokens": 6390494447.0, + "step": 1529 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.7272749127810856, + "learning_rate": 1.9825819333055992e-05, + "loss": 0.935, + "num_tokens": 6394682424.0, + "step": 1530 + }, + { + "epoch": 0.18193701723113487, + "grad_norm": 0.6075988917859316, + "learning_rate": 1.9825480074616322e-05, + "loss": 0.9073, + "num_tokens": 6398871909.0, + "step": 1531 + }, + { + "epoch": 0.18205585264408794, + "grad_norm": 0.7789418798590823, + "learning_rate": 1.9825140489339286e-05, + "loss": 0.9409, + "num_tokens": 6403053786.0, + "step": 1532 + }, + { + "epoch": 0.182174688057041, + "grad_norm": 0.567499128827979, + "learning_rate": 1.9824800577237457e-05, + "loss": 0.954, + "num_tokens": 6407242213.0, + "step": 1533 + }, + { + "epoch": 0.18229352346999406, + "grad_norm": 0.7197764752966664, + "learning_rate": 1.9824460338323425e-05, + "loss": 0.9467, + "num_tokens": 6411399574.0, + "step": 1534 + }, + { + "epoch": 0.18241235888294713, + "grad_norm": 0.7016709862117497, + "learning_rate": 1.9824119772609787e-05, + "loss": 0.9214, + "num_tokens": 6415588816.0, + "step": 1535 + }, + { + "epoch": 0.18253119429590017, + "grad_norm": 0.515645668514504, + "learning_rate": 1.9823778880109165e-05, + "loss": 0.9342, + "num_tokens": 6419778269.0, + "step": 1536 + }, + { + "epoch": 0.18265002970885325, + "grad_norm": 0.584103233112964, + "learning_rate": 1.982343766083418e-05, + "loss": 0.9309, + "num_tokens": 6423966453.0, + "step": 1537 + }, + { + "epoch": 0.1827688651218063, + "grad_norm": 0.8362925007955969, + "learning_rate": 1.9823096114797456e-05, + "loss": 0.9292, + "num_tokens": 6428132734.0, + "step": 1538 + }, + { + "epoch": 0.18288770053475936, + "grad_norm": 0.6499868444694742, + "learning_rate": 1.982275424201166e-05, + "loss": 0.9195, + "num_tokens": 6432318229.0, + "step": 1539 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 0.7014134135328443, + "learning_rate": 1.9822412042489437e-05, + "loss": 0.9413, + "num_tokens": 6436488633.0, + "step": 1540 + }, + { + "epoch": 0.18312537136066548, + "grad_norm": 0.6160854474368138, + "learning_rate": 1.982206951624347e-05, + "loss": 0.9697, + "num_tokens": 6440676232.0, + "step": 1541 + }, + { + "epoch": 0.18324420677361855, + "grad_norm": 0.7543436506657011, + "learning_rate": 1.9821726663286444e-05, + "loss": 0.9001, + "num_tokens": 6444865962.0, + "step": 1542 + }, + { + "epoch": 0.1833630421865716, + "grad_norm": 0.5717991533512412, + "learning_rate": 1.982138348363105e-05, + "loss": 0.9366, + "num_tokens": 6449054874.0, + "step": 1543 + }, + { + "epoch": 0.18348187759952467, + "grad_norm": 0.8514884865118301, + "learning_rate": 1.9821039977289998e-05, + "loss": 0.9541, + "num_tokens": 6453220349.0, + "step": 1544 + }, + { + "epoch": 0.1836007130124777, + "grad_norm": 0.6167099404729571, + "learning_rate": 1.9820696144276014e-05, + "loss": 0.9226, + "num_tokens": 6457411102.0, + "step": 1545 + }, + { + "epoch": 0.18371954842543078, + "grad_norm": 0.7262672621354307, + "learning_rate": 1.9820351984601828e-05, + "loss": 0.9602, + "num_tokens": 6461585419.0, + "step": 1546 + }, + { + "epoch": 0.18383838383838383, + "grad_norm": 0.7437270088699819, + "learning_rate": 1.982000749828018e-05, + "loss": 0.933, + "num_tokens": 6465754847.0, + "step": 1547 + }, + { + "epoch": 0.1839572192513369, + "grad_norm": 0.6458214554744521, + "learning_rate": 1.981966268532384e-05, + "loss": 0.9749, + "num_tokens": 6469943886.0, + "step": 1548 + }, + { + "epoch": 0.18407605466428995, + "grad_norm": 0.7340856968061801, + "learning_rate": 1.9819317545745566e-05, + "loss": 0.9487, + "num_tokens": 6474133439.0, + "step": 1549 + }, + { + "epoch": 0.18419489007724302, + "grad_norm": 0.6274805451997244, + "learning_rate": 1.9818972079558147e-05, + "loss": 0.9579, + "num_tokens": 6478293469.0, + "step": 1550 + }, + { + "epoch": 0.1843137254901961, + "grad_norm": 0.7685192390321491, + "learning_rate": 1.9818626286774373e-05, + "loss": 0.9248, + "num_tokens": 6482481871.0, + "step": 1551 + }, + { + "epoch": 0.18443256090314913, + "grad_norm": 0.5731698040550309, + "learning_rate": 1.9818280167407055e-05, + "loss": 0.9449, + "num_tokens": 6486670528.0, + "step": 1552 + }, + { + "epoch": 0.1845513963161022, + "grad_norm": 0.7806416373171012, + "learning_rate": 1.9817933721469002e-05, + "loss": 0.9344, + "num_tokens": 6490859647.0, + "step": 1553 + }, + { + "epoch": 0.18467023172905525, + "grad_norm": 0.7540677182859024, + "learning_rate": 1.9817586948973054e-05, + "loss": 0.9282, + "num_tokens": 6494996528.0, + "step": 1554 + }, + { + "epoch": 0.18478906714200832, + "grad_norm": 0.7042035351457434, + "learning_rate": 1.9817239849932047e-05, + "loss": 0.9712, + "num_tokens": 6499185035.0, + "step": 1555 + }, + { + "epoch": 0.18490790255496137, + "grad_norm": 0.6559952826034806, + "learning_rate": 1.9816892424358832e-05, + "loss": 0.9401, + "num_tokens": 6503373562.0, + "step": 1556 + }, + { + "epoch": 0.18502673796791444, + "grad_norm": 0.7457790409325433, + "learning_rate": 1.9816544672266285e-05, + "loss": 0.9562, + "num_tokens": 6507558236.0, + "step": 1557 + }, + { + "epoch": 0.1851455733808675, + "grad_norm": 0.572735515598034, + "learning_rate": 1.9816196593667277e-05, + "loss": 0.953, + "num_tokens": 6511747352.0, + "step": 1558 + }, + { + "epoch": 0.18526440879382056, + "grad_norm": 0.7271066261309639, + "learning_rate": 1.98158481885747e-05, + "loss": 0.9565, + "num_tokens": 6515936481.0, + "step": 1559 + }, + { + "epoch": 0.18538324420677363, + "grad_norm": 0.6032371827136959, + "learning_rate": 1.9815499457001456e-05, + "loss": 0.9293, + "num_tokens": 6520105234.0, + "step": 1560 + }, + { + "epoch": 0.18550207961972667, + "grad_norm": 0.6765534628565635, + "learning_rate": 1.9815150398960466e-05, + "loss": 0.9082, + "num_tokens": 6524260303.0, + "step": 1561 + }, + { + "epoch": 0.18562091503267975, + "grad_norm": 0.7459078068114782, + "learning_rate": 1.981480101446465e-05, + "loss": 0.9283, + "num_tokens": 6528448276.0, + "step": 1562 + }, + { + "epoch": 0.1857397504456328, + "grad_norm": 0.6247146200388379, + "learning_rate": 1.981445130352695e-05, + "loss": 0.9642, + "num_tokens": 6532636147.0, + "step": 1563 + }, + { + "epoch": 0.18585858585858586, + "grad_norm": 0.802578043469922, + "learning_rate": 1.9814101266160314e-05, + "loss": 0.9293, + "num_tokens": 6536823531.0, + "step": 1564 + }, + { + "epoch": 0.1859774212715389, + "grad_norm": 0.573192001391761, + "learning_rate": 1.9813750902377707e-05, + "loss": 0.9466, + "num_tokens": 6541006267.0, + "step": 1565 + }, + { + "epoch": 0.18609625668449198, + "grad_norm": 0.7866647056724239, + "learning_rate": 1.9813400212192106e-05, + "loss": 0.9271, + "num_tokens": 6545164497.0, + "step": 1566 + }, + { + "epoch": 0.18621509209744505, + "grad_norm": 0.691859107477907, + "learning_rate": 1.9813049195616495e-05, + "loss": 0.9122, + "num_tokens": 6549355243.0, + "step": 1567 + }, + { + "epoch": 0.1863339275103981, + "grad_norm": 0.6975935783660783, + "learning_rate": 1.9812697852663873e-05, + "loss": 0.9307, + "num_tokens": 6553516588.0, + "step": 1568 + }, + { + "epoch": 0.18645276292335117, + "grad_norm": 0.6747477208537657, + "learning_rate": 1.9812346183347257e-05, + "loss": 0.9706, + "num_tokens": 6557703639.0, + "step": 1569 + }, + { + "epoch": 0.1865715983363042, + "grad_norm": 0.6384219038318513, + "learning_rate": 1.9811994187679665e-05, + "loss": 0.9479, + "num_tokens": 6561891155.0, + "step": 1570 + }, + { + "epoch": 0.18669043374925728, + "grad_norm": 0.6166007646438522, + "learning_rate": 1.9811641865674134e-05, + "loss": 0.9493, + "num_tokens": 6566081126.0, + "step": 1571 + }, + { + "epoch": 0.18680926916221033, + "grad_norm": 0.7263987079024334, + "learning_rate": 1.981128921734371e-05, + "loss": 0.9513, + "num_tokens": 6570270782.0, + "step": 1572 + }, + { + "epoch": 0.1869281045751634, + "grad_norm": 0.7037049975545608, + "learning_rate": 1.9810936242701457e-05, + "loss": 0.9243, + "num_tokens": 6574461112.0, + "step": 1573 + }, + { + "epoch": 0.18704693998811645, + "grad_norm": 0.674009758545363, + "learning_rate": 1.9810582941760444e-05, + "loss": 0.965, + "num_tokens": 6578648535.0, + "step": 1574 + }, + { + "epoch": 0.18716577540106952, + "grad_norm": 0.6412625947623342, + "learning_rate": 1.9810229314533753e-05, + "loss": 0.9692, + "num_tokens": 6582836316.0, + "step": 1575 + }, + { + "epoch": 0.1872846108140226, + "grad_norm": 0.7405862219116214, + "learning_rate": 1.980987536103448e-05, + "loss": 0.9053, + "num_tokens": 6587026533.0, + "step": 1576 + }, + { + "epoch": 0.18740344622697563, + "grad_norm": 0.6159719551845549, + "learning_rate": 1.9809521081275747e-05, + "loss": 0.8902, + "num_tokens": 6591215848.0, + "step": 1577 + }, + { + "epoch": 0.1875222816399287, + "grad_norm": 0.8309998838460696, + "learning_rate": 1.9809166475270655e-05, + "loss": 0.9069, + "num_tokens": 6595405943.0, + "step": 1578 + }, + { + "epoch": 0.18764111705288175, + "grad_norm": 0.7143139755173065, + "learning_rate": 1.9808811543032344e-05, + "loss": 0.9508, + "num_tokens": 6599595252.0, + "step": 1579 + }, + { + "epoch": 0.18775995246583482, + "grad_norm": 0.559631022480301, + "learning_rate": 1.9808456284573958e-05, + "loss": 0.9581, + "num_tokens": 6603766305.0, + "step": 1580 + }, + { + "epoch": 0.18787878787878787, + "grad_norm": 0.661708194384034, + "learning_rate": 1.9808100699908655e-05, + "loss": 0.9006, + "num_tokens": 6607954944.0, + "step": 1581 + }, + { + "epoch": 0.18799762329174094, + "grad_norm": 0.681288451190308, + "learning_rate": 1.98077447890496e-05, + "loss": 0.9332, + "num_tokens": 6612143443.0, + "step": 1582 + }, + { + "epoch": 0.188116458704694, + "grad_norm": 0.53985215845485, + "learning_rate": 1.9807388552009976e-05, + "loss": 0.9639, + "num_tokens": 6616318499.0, + "step": 1583 + }, + { + "epoch": 0.18823529411764706, + "grad_norm": 0.6306240707369902, + "learning_rate": 1.980703198880298e-05, + "loss": 0.937, + "num_tokens": 6620505449.0, + "step": 1584 + }, + { + "epoch": 0.18835412953060013, + "grad_norm": 0.712381753388363, + "learning_rate": 1.9806675099441806e-05, + "loss": 0.9478, + "num_tokens": 6624694876.0, + "step": 1585 + }, + { + "epoch": 0.18847296494355317, + "grad_norm": 0.6401772253127707, + "learning_rate": 1.980631788393968e-05, + "loss": 0.9208, + "num_tokens": 6628882968.0, + "step": 1586 + }, + { + "epoch": 0.18859180035650625, + "grad_norm": 0.6986776083774054, + "learning_rate": 1.9805960342309826e-05, + "loss": 0.8976, + "num_tokens": 6633073260.0, + "step": 1587 + }, + { + "epoch": 0.1887106357694593, + "grad_norm": 0.7531042060960274, + "learning_rate": 1.980560247456549e-05, + "loss": 0.9071, + "num_tokens": 6637246671.0, + "step": 1588 + }, + { + "epoch": 0.18882947118241236, + "grad_norm": 0.6358496666716852, + "learning_rate": 1.980524428071992e-05, + "loss": 0.913, + "num_tokens": 6641436027.0, + "step": 1589 + }, + { + "epoch": 0.1889483065953654, + "grad_norm": 0.5961188300148624, + "learning_rate": 1.9804885760786382e-05, + "loss": 0.947, + "num_tokens": 6645624479.0, + "step": 1590 + }, + { + "epoch": 0.18906714200831848, + "grad_norm": 0.6481718189171797, + "learning_rate": 1.9804526914778155e-05, + "loss": 0.9349, + "num_tokens": 6649813392.0, + "step": 1591 + }, + { + "epoch": 0.18918597742127155, + "grad_norm": 0.7281651647440159, + "learning_rate": 1.980416774270853e-05, + "loss": 0.9636, + "num_tokens": 6653976271.0, + "step": 1592 + }, + { + "epoch": 0.1893048128342246, + "grad_norm": 0.6309161997151699, + "learning_rate": 1.9803808244590806e-05, + "loss": 0.9489, + "num_tokens": 6658165924.0, + "step": 1593 + }, + { + "epoch": 0.18942364824717767, + "grad_norm": 0.738572175577202, + "learning_rate": 1.9803448420438297e-05, + "loss": 0.916, + "num_tokens": 6662349989.0, + "step": 1594 + }, + { + "epoch": 0.1895424836601307, + "grad_norm": 0.6383323291254351, + "learning_rate": 1.9803088270264324e-05, + "loss": 0.9599, + "num_tokens": 6666539232.0, + "step": 1595 + }, + { + "epoch": 0.18966131907308378, + "grad_norm": 0.7789996244515802, + "learning_rate": 1.9802727794082227e-05, + "loss": 0.9666, + "num_tokens": 6670682188.0, + "step": 1596 + }, + { + "epoch": 0.18978015448603683, + "grad_norm": 0.5621189520667144, + "learning_rate": 1.980236699190536e-05, + "loss": 0.9139, + "num_tokens": 6674872470.0, + "step": 1597 + }, + { + "epoch": 0.1898989898989899, + "grad_norm": 0.6590488617567078, + "learning_rate": 1.9802005863747084e-05, + "loss": 0.9368, + "num_tokens": 6678969658.0, + "step": 1598 + }, + { + "epoch": 0.19001782531194297, + "grad_norm": 0.7510679897802401, + "learning_rate": 1.9801644409620767e-05, + "loss": 0.9429, + "num_tokens": 6683158866.0, + "step": 1599 + }, + { + "epoch": 0.19013666072489602, + "grad_norm": 0.6676040899569897, + "learning_rate": 1.98012826295398e-05, + "loss": 0.9389, + "num_tokens": 6687330756.0, + "step": 1600 + }, + { + "epoch": 0.1902554961378491, + "grad_norm": 0.5581518146373501, + "learning_rate": 1.980092052351758e-05, + "loss": 0.9237, + "num_tokens": 6691519050.0, + "step": 1601 + }, + { + "epoch": 0.19037433155080213, + "grad_norm": 0.6295952588218506, + "learning_rate": 1.9800558091567514e-05, + "loss": 0.9683, + "num_tokens": 6695709008.0, + "step": 1602 + }, + { + "epoch": 0.1904931669637552, + "grad_norm": 0.6418196544128917, + "learning_rate": 1.980019533370303e-05, + "loss": 0.9501, + "num_tokens": 6699877995.0, + "step": 1603 + }, + { + "epoch": 0.19061200237670825, + "grad_norm": 0.7288383461964, + "learning_rate": 1.9799832249937556e-05, + "loss": 0.9667, + "num_tokens": 6704058824.0, + "step": 1604 + }, + { + "epoch": 0.19073083778966132, + "grad_norm": 0.582777467103515, + "learning_rate": 1.979946884028454e-05, + "loss": 0.9392, + "num_tokens": 6708245926.0, + "step": 1605 + }, + { + "epoch": 0.19084967320261437, + "grad_norm": 0.7078648225379366, + "learning_rate": 1.9799105104757442e-05, + "loss": 0.9228, + "num_tokens": 6712435148.0, + "step": 1606 + }, + { + "epoch": 0.19096850861556744, + "grad_norm": 0.6419297134817433, + "learning_rate": 1.9798741043369733e-05, + "loss": 0.9526, + "num_tokens": 6716610161.0, + "step": 1607 + }, + { + "epoch": 0.1910873440285205, + "grad_norm": 0.687194606498348, + "learning_rate": 1.9798376656134893e-05, + "loss": 0.9638, + "num_tokens": 6720798990.0, + "step": 1608 + }, + { + "epoch": 0.19120617944147356, + "grad_norm": 0.605414704697992, + "learning_rate": 1.9798011943066418e-05, + "loss": 0.9229, + "num_tokens": 6724959932.0, + "step": 1609 + }, + { + "epoch": 0.19132501485442663, + "grad_norm": 0.6927728673304174, + "learning_rate": 1.9797646904177812e-05, + "loss": 0.9631, + "num_tokens": 6729149837.0, + "step": 1610 + }, + { + "epoch": 0.19144385026737967, + "grad_norm": 0.6015154024024797, + "learning_rate": 1.9797281539482598e-05, + "loss": 0.9316, + "num_tokens": 6733338743.0, + "step": 1611 + }, + { + "epoch": 0.19156268568033274, + "grad_norm": 0.7160791463156573, + "learning_rate": 1.9796915848994304e-05, + "loss": 0.9131, + "num_tokens": 6737502686.0, + "step": 1612 + }, + { + "epoch": 0.1916815210932858, + "grad_norm": 0.7034712476880369, + "learning_rate": 1.979654983272647e-05, + "loss": 0.9454, + "num_tokens": 6741691454.0, + "step": 1613 + }, + { + "epoch": 0.19180035650623886, + "grad_norm": 0.6150954083571546, + "learning_rate": 1.9796183490692656e-05, + "loss": 0.9263, + "num_tokens": 6745879918.0, + "step": 1614 + }, + { + "epoch": 0.1919191919191919, + "grad_norm": 0.7047857481100303, + "learning_rate": 1.9795816822906425e-05, + "loss": 0.9845, + "num_tokens": 6750059773.0, + "step": 1615 + }, + { + "epoch": 0.19203802733214498, + "grad_norm": 0.6339222035061018, + "learning_rate": 1.979544982938136e-05, + "loss": 0.9369, + "num_tokens": 6754248800.0, + "step": 1616 + }, + { + "epoch": 0.19215686274509805, + "grad_norm": 0.7271079258297449, + "learning_rate": 1.979508251013105e-05, + "loss": 0.9482, + "num_tokens": 6758438255.0, + "step": 1617 + }, + { + "epoch": 0.1922756981580511, + "grad_norm": 0.6696126310834092, + "learning_rate": 1.97947148651691e-05, + "loss": 0.9435, + "num_tokens": 6762604563.0, + "step": 1618 + }, + { + "epoch": 0.19239453357100417, + "grad_norm": 0.669856311599321, + "learning_rate": 1.9794346894509117e-05, + "loss": 0.9237, + "num_tokens": 6766791079.0, + "step": 1619 + }, + { + "epoch": 0.1925133689839572, + "grad_norm": 0.5399246751836616, + "learning_rate": 1.9793978598164737e-05, + "loss": 0.9184, + "num_tokens": 6770951027.0, + "step": 1620 + }, + { + "epoch": 0.19263220439691028, + "grad_norm": 0.6868723697515242, + "learning_rate": 1.97936099761496e-05, + "loss": 0.9636, + "num_tokens": 6775140734.0, + "step": 1621 + }, + { + "epoch": 0.19275103980986333, + "grad_norm": 0.6710682318542847, + "learning_rate": 1.9793241028477347e-05, + "loss": 0.9385, + "num_tokens": 6779312645.0, + "step": 1622 + }, + { + "epoch": 0.1928698752228164, + "grad_norm": 0.7279992554221227, + "learning_rate": 1.9792871755161652e-05, + "loss": 0.9279, + "num_tokens": 6783478293.0, + "step": 1623 + }, + { + "epoch": 0.19298871063576947, + "grad_norm": 0.663407391391603, + "learning_rate": 1.9792502156216185e-05, + "loss": 0.9614, + "num_tokens": 6787668139.0, + "step": 1624 + }, + { + "epoch": 0.19310754604872252, + "grad_norm": 0.5856274637544209, + "learning_rate": 1.9792132231654635e-05, + "loss": 0.9414, + "num_tokens": 6791843860.0, + "step": 1625 + }, + { + "epoch": 0.1932263814616756, + "grad_norm": 0.6590391731119198, + "learning_rate": 1.9791761981490706e-05, + "loss": 0.9537, + "num_tokens": 6796031652.0, + "step": 1626 + }, + { + "epoch": 0.19334521687462863, + "grad_norm": 0.6597609741778909, + "learning_rate": 1.97913914057381e-05, + "loss": 0.9225, + "num_tokens": 6800220798.0, + "step": 1627 + }, + { + "epoch": 0.1934640522875817, + "grad_norm": 0.6350824109518834, + "learning_rate": 1.9791020504410545e-05, + "loss": 0.9149, + "num_tokens": 6804411442.0, + "step": 1628 + }, + { + "epoch": 0.19358288770053475, + "grad_norm": 0.667353082601684, + "learning_rate": 1.979064927752178e-05, + "loss": 0.9275, + "num_tokens": 6808600540.0, + "step": 1629 + }, + { + "epoch": 0.19370172311348782, + "grad_norm": 0.5873823491643548, + "learning_rate": 1.979027772508555e-05, + "loss": 0.9504, + "num_tokens": 6812789266.0, + "step": 1630 + }, + { + "epoch": 0.19382055852644087, + "grad_norm": 0.7427469846055162, + "learning_rate": 1.9789905847115616e-05, + "loss": 0.9741, + "num_tokens": 6816979551.0, + "step": 1631 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 0.5996419659191164, + "learning_rate": 1.978953364362575e-05, + "loss": 0.9566, + "num_tokens": 6821167905.0, + "step": 1632 + }, + { + "epoch": 0.194058229352347, + "grad_norm": 0.754827358264245, + "learning_rate": 1.978916111462973e-05, + "loss": 0.9047, + "num_tokens": 6825357929.0, + "step": 1633 + }, + { + "epoch": 0.19417706476530006, + "grad_norm": 0.5784683216888263, + "learning_rate": 1.978878826014136e-05, + "loss": 0.9511, + "num_tokens": 6829506578.0, + "step": 1634 + }, + { + "epoch": 0.19429590017825313, + "grad_norm": 0.5578050968027789, + "learning_rate": 1.9788415080174446e-05, + "loss": 0.9818, + "num_tokens": 6833636860.0, + "step": 1635 + }, + { + "epoch": 0.19441473559120617, + "grad_norm": 0.6267007379733825, + "learning_rate": 1.978804157474281e-05, + "loss": 0.9567, + "num_tokens": 6837826227.0, + "step": 1636 + }, + { + "epoch": 0.19453357100415924, + "grad_norm": 0.6171012858333585, + "learning_rate": 1.978766774386028e-05, + "loss": 0.9777, + "num_tokens": 6842015734.0, + "step": 1637 + }, + { + "epoch": 0.1946524064171123, + "grad_norm": 0.5931490311543939, + "learning_rate": 1.9787293587540694e-05, + "loss": 0.9251, + "num_tokens": 6846206178.0, + "step": 1638 + }, + { + "epoch": 0.19477124183006536, + "grad_norm": 0.788985179329218, + "learning_rate": 1.9786919105797922e-05, + "loss": 0.9131, + "num_tokens": 6850396089.0, + "step": 1639 + }, + { + "epoch": 0.1948900772430184, + "grad_norm": 0.6880860403385725, + "learning_rate": 1.9786544298645828e-05, + "loss": 0.8867, + "num_tokens": 6854554939.0, + "step": 1640 + }, + { + "epoch": 0.19500891265597148, + "grad_norm": 0.6615219663004344, + "learning_rate": 1.9786169166098288e-05, + "loss": 0.9528, + "num_tokens": 6858730089.0, + "step": 1641 + }, + { + "epoch": 0.19512774806892455, + "grad_norm": 0.5359648986710468, + "learning_rate": 1.9785793708169197e-05, + "loss": 0.9744, + "num_tokens": 6862916823.0, + "step": 1642 + }, + { + "epoch": 0.1952465834818776, + "grad_norm": 0.6257368762267774, + "learning_rate": 1.978541792487246e-05, + "loss": 0.9305, + "num_tokens": 6867084528.0, + "step": 1643 + }, + { + "epoch": 0.19536541889483067, + "grad_norm": 0.8313813604352, + "learning_rate": 1.9785041816221995e-05, + "loss": 0.9349, + "num_tokens": 6871273832.0, + "step": 1644 + }, + { + "epoch": 0.1954842543077837, + "grad_norm": 0.649123541683152, + "learning_rate": 1.9784665382231727e-05, + "loss": 0.9261, + "num_tokens": 6875462376.0, + "step": 1645 + }, + { + "epoch": 0.19560308972073678, + "grad_norm": 0.5372746626436009, + "learning_rate": 1.97842886229156e-05, + "loss": 0.9122, + "num_tokens": 6879650470.0, + "step": 1646 + }, + { + "epoch": 0.19572192513368983, + "grad_norm": 0.6855071567756565, + "learning_rate": 1.978391153828756e-05, + "loss": 0.8976, + "num_tokens": 6883841465.0, + "step": 1647 + }, + { + "epoch": 0.1958407605466429, + "grad_norm": 0.5750227169455578, + "learning_rate": 1.9783534128361583e-05, + "loss": 0.9003, + "num_tokens": 6888032299.0, + "step": 1648 + }, + { + "epoch": 0.19595959595959597, + "grad_norm": 0.8997831796507805, + "learning_rate": 1.9783156393151634e-05, + "loss": 0.9152, + "num_tokens": 6892217516.0, + "step": 1649 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.48696794257038, + "learning_rate": 1.9782778332671712e-05, + "loss": 0.8867, + "num_tokens": 6896406162.0, + "step": 1650 + }, + { + "epoch": 0.1961972667855021, + "grad_norm": 0.6305745166040811, + "learning_rate": 1.9782399946935807e-05, + "loss": 0.9374, + "num_tokens": 6900595882.0, + "step": 1651 + }, + { + "epoch": 0.19631610219845513, + "grad_norm": 0.5316980387831967, + "learning_rate": 1.9782021235957943e-05, + "loss": 0.9454, + "num_tokens": 6904758579.0, + "step": 1652 + }, + { + "epoch": 0.1964349376114082, + "grad_norm": 0.9519503207569394, + "learning_rate": 1.9781642199752136e-05, + "loss": 0.9035, + "num_tokens": 6908947123.0, + "step": 1653 + }, + { + "epoch": 0.19655377302436125, + "grad_norm": 0.5105160850209752, + "learning_rate": 1.978126283833243e-05, + "loss": 0.9173, + "num_tokens": 6913137580.0, + "step": 1654 + }, + { + "epoch": 0.19667260843731432, + "grad_norm": 0.840538237500728, + "learning_rate": 1.978088315171287e-05, + "loss": 0.9563, + "num_tokens": 6917314151.0, + "step": 1655 + }, + { + "epoch": 0.19679144385026737, + "grad_norm": 0.7418663884449193, + "learning_rate": 1.9780503139907518e-05, + "loss": 0.9319, + "num_tokens": 6921502296.0, + "step": 1656 + }, + { + "epoch": 0.19691027926322044, + "grad_norm": 0.5515449532143976, + "learning_rate": 1.978012280293045e-05, + "loss": 0.9137, + "num_tokens": 6925692536.0, + "step": 1657 + }, + { + "epoch": 0.1970291146761735, + "grad_norm": 0.8613408932180677, + "learning_rate": 1.9779742140795742e-05, + "loss": 0.9421, + "num_tokens": 6929881464.0, + "step": 1658 + }, + { + "epoch": 0.19714795008912656, + "grad_norm": 0.5686502212541459, + "learning_rate": 1.9779361153517504e-05, + "loss": 0.9324, + "num_tokens": 6934070422.0, + "step": 1659 + }, + { + "epoch": 0.19726678550207963, + "grad_norm": 0.7538310131188958, + "learning_rate": 1.9778979841109836e-05, + "loss": 0.9252, + "num_tokens": 6938259541.0, + "step": 1660 + }, + { + "epoch": 0.19738562091503267, + "grad_norm": 0.7164584289821136, + "learning_rate": 1.977859820358686e-05, + "loss": 0.9372, + "num_tokens": 6942446712.0, + "step": 1661 + }, + { + "epoch": 0.19750445632798574, + "grad_norm": 0.5932002015881165, + "learning_rate": 1.9778216240962713e-05, + "loss": 0.9285, + "num_tokens": 6946636124.0, + "step": 1662 + }, + { + "epoch": 0.1976232917409388, + "grad_norm": 0.7002325198925975, + "learning_rate": 1.977783395325154e-05, + "loss": 0.967, + "num_tokens": 6950824150.0, + "step": 1663 + }, + { + "epoch": 0.19774212715389186, + "grad_norm": 0.5638359243184983, + "learning_rate": 1.9777451340467496e-05, + "loss": 0.9527, + "num_tokens": 6954985285.0, + "step": 1664 + }, + { + "epoch": 0.19786096256684493, + "grad_norm": 0.7982140324600789, + "learning_rate": 1.977706840262475e-05, + "loss": 0.8893, + "num_tokens": 6959175039.0, + "step": 1665 + }, + { + "epoch": 0.19797979797979798, + "grad_norm": 0.6319595811701826, + "learning_rate": 1.9776685139737487e-05, + "loss": 0.9007, + "num_tokens": 6963332147.0, + "step": 1666 + }, + { + "epoch": 0.19809863339275105, + "grad_norm": 0.6202905552613405, + "learning_rate": 1.97763015518199e-05, + "loss": 0.9551, + "num_tokens": 6967521589.0, + "step": 1667 + }, + { + "epoch": 0.1982174688057041, + "grad_norm": 0.6964546024153121, + "learning_rate": 1.9775917638886193e-05, + "loss": 0.9484, + "num_tokens": 6971673355.0, + "step": 1668 + }, + { + "epoch": 0.19833630421865717, + "grad_norm": 0.624306886993022, + "learning_rate": 1.9775533400950582e-05, + "loss": 0.9486, + "num_tokens": 6975862986.0, + "step": 1669 + }, + { + "epoch": 0.1984551396316102, + "grad_norm": 0.6361438090245323, + "learning_rate": 1.9775148838027297e-05, + "loss": 0.8987, + "num_tokens": 6980039237.0, + "step": 1670 + }, + { + "epoch": 0.19857397504456328, + "grad_norm": 0.7367183126430662, + "learning_rate": 1.9774763950130584e-05, + "loss": 0.9283, + "num_tokens": 6984198785.0, + "step": 1671 + }, + { + "epoch": 0.19869281045751633, + "grad_norm": 0.5728290057305212, + "learning_rate": 1.977437873727469e-05, + "loss": 0.9251, + "num_tokens": 6988382596.0, + "step": 1672 + }, + { + "epoch": 0.1988116458704694, + "grad_norm": 0.6605439927723864, + "learning_rate": 1.977399319947389e-05, + "loss": 0.9291, + "num_tokens": 6992570108.0, + "step": 1673 + }, + { + "epoch": 0.19893048128342247, + "grad_norm": 0.679929014858227, + "learning_rate": 1.977360733674245e-05, + "loss": 0.9185, + "num_tokens": 6996757713.0, + "step": 1674 + }, + { + "epoch": 0.19904931669637552, + "grad_norm": 0.7613368591291982, + "learning_rate": 1.977322114909467e-05, + "loss": 0.9402, + "num_tokens": 7000944958.0, + "step": 1675 + }, + { + "epoch": 0.1991681521093286, + "grad_norm": 0.5831524537278769, + "learning_rate": 1.9772834636544848e-05, + "loss": 0.9623, + "num_tokens": 7005116282.0, + "step": 1676 + }, + { + "epoch": 0.19928698752228163, + "grad_norm": 0.7254416119166199, + "learning_rate": 1.9772447799107297e-05, + "loss": 0.9672, + "num_tokens": 7009305403.0, + "step": 1677 + }, + { + "epoch": 0.1994058229352347, + "grad_norm": 0.6011922645862223, + "learning_rate": 1.977206063679634e-05, + "loss": 0.9349, + "num_tokens": 7013473806.0, + "step": 1678 + }, + { + "epoch": 0.19952465834818775, + "grad_norm": 0.5865626325186728, + "learning_rate": 1.9771673149626326e-05, + "loss": 0.916, + "num_tokens": 7017664740.0, + "step": 1679 + }, + { + "epoch": 0.19964349376114082, + "grad_norm": 0.6316344386347679, + "learning_rate": 1.9771285337611595e-05, + "loss": 0.9564, + "num_tokens": 7021853647.0, + "step": 1680 + }, + { + "epoch": 0.19976232917409387, + "grad_norm": 0.745140781797426, + "learning_rate": 1.977089720076651e-05, + "loss": 0.9258, + "num_tokens": 7026043191.0, + "step": 1681 + }, + { + "epoch": 0.19988116458704694, + "grad_norm": 0.6830517387941526, + "learning_rate": 1.9770508739105444e-05, + "loss": 0.9395, + "num_tokens": 7030231374.0, + "step": 1682 + }, + { + "epoch": 0.2, + "grad_norm": 0.6412253021084634, + "learning_rate": 1.977011995264279e-05, + "loss": 0.9505, + "num_tokens": 7034392932.0, + "step": 1683 + }, + { + "epoch": 0.20011883541295306, + "grad_norm": 0.6773637225374193, + "learning_rate": 1.976973084139294e-05, + "loss": 0.9362, + "num_tokens": 7038577104.0, + "step": 1684 + }, + { + "epoch": 0.20023767082590613, + "grad_norm": 0.7154468810558123, + "learning_rate": 1.9769341405370307e-05, + "loss": 0.9398, + "num_tokens": 7042765908.0, + "step": 1685 + }, + { + "epoch": 0.20035650623885917, + "grad_norm": 0.6822022246891224, + "learning_rate": 1.976895164458931e-05, + "loss": 0.9444, + "num_tokens": 7046956049.0, + "step": 1686 + }, + { + "epoch": 0.20047534165181224, + "grad_norm": 0.6445212497649615, + "learning_rate": 1.9768561559064384e-05, + "loss": 0.9366, + "num_tokens": 7051142305.0, + "step": 1687 + }, + { + "epoch": 0.2005941770647653, + "grad_norm": 0.6823522680580068, + "learning_rate": 1.976817114880998e-05, + "loss": 0.9435, + "num_tokens": 7055331842.0, + "step": 1688 + }, + { + "epoch": 0.20071301247771836, + "grad_norm": 0.5364554966701084, + "learning_rate": 1.976778041384055e-05, + "loss": 0.9313, + "num_tokens": 7059518986.0, + "step": 1689 + }, + { + "epoch": 0.20083184789067143, + "grad_norm": 0.7547361637315375, + "learning_rate": 1.9767389354170567e-05, + "loss": 0.9237, + "num_tokens": 7063708094.0, + "step": 1690 + }, + { + "epoch": 0.20095068330362448, + "grad_norm": 0.682566779576465, + "learning_rate": 1.9766997969814514e-05, + "loss": 0.9531, + "num_tokens": 7067896681.0, + "step": 1691 + }, + { + "epoch": 0.20106951871657755, + "grad_norm": 0.6770454959132177, + "learning_rate": 1.9766606260786883e-05, + "loss": 0.9221, + "num_tokens": 7072084703.0, + "step": 1692 + }, + { + "epoch": 0.2011883541295306, + "grad_norm": 0.5597840294460904, + "learning_rate": 1.976621422710218e-05, + "loss": 0.9289, + "num_tokens": 7076274113.0, + "step": 1693 + }, + { + "epoch": 0.20130718954248367, + "grad_norm": 0.6750945122889823, + "learning_rate": 1.9765821868774928e-05, + "loss": 0.9148, + "num_tokens": 7080450634.0, + "step": 1694 + }, + { + "epoch": 0.2014260249554367, + "grad_norm": 0.637166776684465, + "learning_rate": 1.9765429185819653e-05, + "loss": 0.9634, + "num_tokens": 7084616856.0, + "step": 1695 + }, + { + "epoch": 0.20154486036838978, + "grad_norm": 0.6242822522752481, + "learning_rate": 1.97650361782509e-05, + "loss": 0.9212, + "num_tokens": 7088804979.0, + "step": 1696 + }, + { + "epoch": 0.20166369578134283, + "grad_norm": 0.6611478321209238, + "learning_rate": 1.9764642846083222e-05, + "loss": 0.9451, + "num_tokens": 7092992548.0, + "step": 1697 + }, + { + "epoch": 0.2017825311942959, + "grad_norm": 0.701047221078464, + "learning_rate": 1.9764249189331183e-05, + "loss": 0.9106, + "num_tokens": 7097181572.0, + "step": 1698 + }, + { + "epoch": 0.20190136660724897, + "grad_norm": 0.5661215356481152, + "learning_rate": 1.9763855208009366e-05, + "loss": 0.9238, + "num_tokens": 7101369738.0, + "step": 1699 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.6226856705066893, + "learning_rate": 1.9763460902132358e-05, + "loss": 0.933, + "num_tokens": 7105535917.0, + "step": 1700 + }, + { + "epoch": 0.2021390374331551, + "grad_norm": 0.6470519707412329, + "learning_rate": 1.9763066271714764e-05, + "loss": 0.9752, + "num_tokens": 7109725444.0, + "step": 1701 + }, + { + "epoch": 0.20225787284610813, + "grad_norm": 0.6818146975025539, + "learning_rate": 1.9762671316771197e-05, + "loss": 0.9468, + "num_tokens": 7113915665.0, + "step": 1702 + }, + { + "epoch": 0.2023767082590612, + "grad_norm": 0.6118020914370992, + "learning_rate": 1.976227603731628e-05, + "loss": 0.9275, + "num_tokens": 7118073068.0, + "step": 1703 + }, + { + "epoch": 0.20249554367201425, + "grad_norm": 0.6055714489957862, + "learning_rate": 1.976188043336466e-05, + "loss": 0.9562, + "num_tokens": 7122245009.0, + "step": 1704 + }, + { + "epoch": 0.20261437908496732, + "grad_norm": 0.6440735731734707, + "learning_rate": 1.9761484504930983e-05, + "loss": 0.9057, + "num_tokens": 7126434680.0, + "step": 1705 + }, + { + "epoch": 0.20273321449792037, + "grad_norm": 0.6827761258109795, + "learning_rate": 1.976108825202991e-05, + "loss": 0.9544, + "num_tokens": 7130624410.0, + "step": 1706 + }, + { + "epoch": 0.20285204991087344, + "grad_norm": 0.556420833973879, + "learning_rate": 1.976069167467612e-05, + "loss": 0.965, + "num_tokens": 7134811931.0, + "step": 1707 + }, + { + "epoch": 0.2029708853238265, + "grad_norm": 0.6045701391871763, + "learning_rate": 1.9760294772884294e-05, + "loss": 0.9516, + "num_tokens": 7138930893.0, + "step": 1708 + }, + { + "epoch": 0.20308972073677956, + "grad_norm": 0.7547745268223434, + "learning_rate": 1.9759897546669134e-05, + "loss": 0.9482, + "num_tokens": 7143114398.0, + "step": 1709 + }, + { + "epoch": 0.20320855614973263, + "grad_norm": 0.6085490206686552, + "learning_rate": 1.975949999604535e-05, + "loss": 0.9253, + "num_tokens": 7147301365.0, + "step": 1710 + }, + { + "epoch": 0.20332739156268567, + "grad_norm": 0.8142569240734074, + "learning_rate": 1.9759102121027663e-05, + "loss": 0.9158, + "num_tokens": 7151459659.0, + "step": 1711 + }, + { + "epoch": 0.20344622697563874, + "grad_norm": 0.5694099073517591, + "learning_rate": 1.975870392163081e-05, + "loss": 0.9503, + "num_tokens": 7155648531.0, + "step": 1712 + }, + { + "epoch": 0.2035650623885918, + "grad_norm": 0.6005085478752925, + "learning_rate": 1.9758305397869536e-05, + "loss": 0.9071, + "num_tokens": 7159836842.0, + "step": 1713 + }, + { + "epoch": 0.20368389780154486, + "grad_norm": 0.6166057698353794, + "learning_rate": 1.9757906549758602e-05, + "loss": 0.9378, + "num_tokens": 7164023339.0, + "step": 1714 + }, + { + "epoch": 0.20380273321449793, + "grad_norm": 0.7212236220801846, + "learning_rate": 1.9757507377312776e-05, + "loss": 0.9265, + "num_tokens": 7168212018.0, + "step": 1715 + }, + { + "epoch": 0.20392156862745098, + "grad_norm": 0.628785495970483, + "learning_rate": 1.9757107880546843e-05, + "loss": 0.9174, + "num_tokens": 7172401602.0, + "step": 1716 + }, + { + "epoch": 0.20404040404040405, + "grad_norm": 0.6555644822907171, + "learning_rate": 1.97567080594756e-05, + "loss": 0.931, + "num_tokens": 7176590492.0, + "step": 1717 + }, + { + "epoch": 0.2041592394533571, + "grad_norm": 0.5535511775930834, + "learning_rate": 1.9756307914113845e-05, + "loss": 0.938, + "num_tokens": 7180779463.0, + "step": 1718 + }, + { + "epoch": 0.20427807486631017, + "grad_norm": 0.706478238242821, + "learning_rate": 1.9755907444476403e-05, + "loss": 0.9378, + "num_tokens": 7184969974.0, + "step": 1719 + }, + { + "epoch": 0.2043969102792632, + "grad_norm": 0.6181854270271743, + "learning_rate": 1.9755506650578106e-05, + "loss": 0.9615, + "num_tokens": 7189149453.0, + "step": 1720 + }, + { + "epoch": 0.20451574569221628, + "grad_norm": 0.590365125927688, + "learning_rate": 1.9755105532433794e-05, + "loss": 0.9428, + "num_tokens": 7193323399.0, + "step": 1721 + }, + { + "epoch": 0.20463458110516933, + "grad_norm": 0.6842807191732553, + "learning_rate": 1.9754704090058322e-05, + "loss": 0.9223, + "num_tokens": 7197512242.0, + "step": 1722 + }, + { + "epoch": 0.2047534165181224, + "grad_norm": 0.6909234515934927, + "learning_rate": 1.9754302323466555e-05, + "loss": 0.9467, + "num_tokens": 7201702886.0, + "step": 1723 + }, + { + "epoch": 0.20487225193107547, + "grad_norm": 0.5851069845542997, + "learning_rate": 1.975390023267338e-05, + "loss": 0.9324, + "num_tokens": 7205891967.0, + "step": 1724 + }, + { + "epoch": 0.20499108734402852, + "grad_norm": 0.5930205515208166, + "learning_rate": 1.9753497817693676e-05, + "loss": 0.9071, + "num_tokens": 7210080818.0, + "step": 1725 + }, + { + "epoch": 0.2051099227569816, + "grad_norm": 0.646402837401271, + "learning_rate": 1.9753095078542352e-05, + "loss": 0.9126, + "num_tokens": 7214226012.0, + "step": 1726 + }, + { + "epoch": 0.20522875816993463, + "grad_norm": 0.595763324315736, + "learning_rate": 1.9752692015234324e-05, + "loss": 0.942, + "num_tokens": 7218392425.0, + "step": 1727 + }, + { + "epoch": 0.2053475935828877, + "grad_norm": 0.5950592423774907, + "learning_rate": 1.9752288627784518e-05, + "loss": 0.927, + "num_tokens": 7222577723.0, + "step": 1728 + }, + { + "epoch": 0.20546642899584075, + "grad_norm": 0.7473933146270961, + "learning_rate": 1.975188491620787e-05, + "loss": 0.9559, + "num_tokens": 7226767496.0, + "step": 1729 + }, + { + "epoch": 0.20558526440879382, + "grad_norm": 0.7055239926275508, + "learning_rate": 1.9751480880519335e-05, + "loss": 0.9159, + "num_tokens": 7230958332.0, + "step": 1730 + }, + { + "epoch": 0.2057040998217469, + "grad_norm": 0.4846550629715927, + "learning_rate": 1.9751076520733873e-05, + "loss": 0.9722, + "num_tokens": 7235134983.0, + "step": 1731 + }, + { + "epoch": 0.20582293523469994, + "grad_norm": 0.65973377923926, + "learning_rate": 1.9750671836866456e-05, + "loss": 0.9134, + "num_tokens": 7239299024.0, + "step": 1732 + }, + { + "epoch": 0.205941770647653, + "grad_norm": 0.6679475170867429, + "learning_rate": 1.975026682893208e-05, + "loss": 0.9061, + "num_tokens": 7243489856.0, + "step": 1733 + }, + { + "epoch": 0.20606060606060606, + "grad_norm": 0.6914748235666587, + "learning_rate": 1.974986149694573e-05, + "loss": 0.9139, + "num_tokens": 7247647421.0, + "step": 1734 + }, + { + "epoch": 0.20617944147355913, + "grad_norm": 0.6292667102768592, + "learning_rate": 1.9749455840922433e-05, + "loss": 0.9293, + "num_tokens": 7251836863.0, + "step": 1735 + }, + { + "epoch": 0.20629827688651217, + "grad_norm": 0.784779875701111, + "learning_rate": 1.97490498608772e-05, + "loss": 0.9336, + "num_tokens": 7256025812.0, + "step": 1736 + }, + { + "epoch": 0.20641711229946524, + "grad_norm": 0.5811254896495945, + "learning_rate": 1.974864355682507e-05, + "loss": 0.9461, + "num_tokens": 7260214594.0, + "step": 1737 + }, + { + "epoch": 0.2065359477124183, + "grad_norm": 0.8663478218685864, + "learning_rate": 1.9748236928781087e-05, + "loss": 0.8931, + "num_tokens": 7264370899.0, + "step": 1738 + }, + { + "epoch": 0.20665478312537136, + "grad_norm": 0.5401387013925605, + "learning_rate": 1.9747829976760312e-05, + "loss": 0.948, + "num_tokens": 7268505807.0, + "step": 1739 + }, + { + "epoch": 0.20677361853832443, + "grad_norm": 0.5916008352103516, + "learning_rate": 1.974742270077782e-05, + "loss": 0.951, + "num_tokens": 7272667684.0, + "step": 1740 + }, + { + "epoch": 0.20689245395127748, + "grad_norm": 0.6956258577644332, + "learning_rate": 1.9747015100848688e-05, + "loss": 0.8959, + "num_tokens": 7276852897.0, + "step": 1741 + }, + { + "epoch": 0.20701128936423055, + "grad_norm": 0.6542506329339083, + "learning_rate": 1.9746607176988013e-05, + "loss": 0.9103, + "num_tokens": 7281042904.0, + "step": 1742 + }, + { + "epoch": 0.2071301247771836, + "grad_norm": 0.7025075507586117, + "learning_rate": 1.97461989292109e-05, + "loss": 0.9172, + "num_tokens": 7285233716.0, + "step": 1743 + }, + { + "epoch": 0.20724896019013667, + "grad_norm": 0.5218237654982637, + "learning_rate": 1.9745790357532467e-05, + "loss": 0.9073, + "num_tokens": 7289423075.0, + "step": 1744 + }, + { + "epoch": 0.2073677956030897, + "grad_norm": 0.6254520798154413, + "learning_rate": 1.9745381461967853e-05, + "loss": 0.9152, + "num_tokens": 7293612574.0, + "step": 1745 + }, + { + "epoch": 0.20748663101604278, + "grad_norm": 0.605088328329152, + "learning_rate": 1.974497224253219e-05, + "loss": 0.9417, + "num_tokens": 7297779486.0, + "step": 1746 + }, + { + "epoch": 0.20760546642899583, + "grad_norm": 0.6845775910227795, + "learning_rate": 1.9744562699240636e-05, + "loss": 0.9432, + "num_tokens": 7301970747.0, + "step": 1747 + }, + { + "epoch": 0.2077243018419489, + "grad_norm": 0.6408973792629593, + "learning_rate": 1.974415283210836e-05, + "loss": 0.9325, + "num_tokens": 7306158052.0, + "step": 1748 + }, + { + "epoch": 0.20784313725490197, + "grad_norm": 0.7196278882169498, + "learning_rate": 1.974374264115054e-05, + "loss": 0.95, + "num_tokens": 7310328295.0, + "step": 1749 + }, + { + "epoch": 0.20796197266785502, + "grad_norm": 0.5910369287202012, + "learning_rate": 1.974333212638237e-05, + "loss": 0.9505, + "num_tokens": 7314472412.0, + "step": 1750 + }, + { + "epoch": 0.2080808080808081, + "grad_norm": 0.5791814263074585, + "learning_rate": 1.974292128781905e-05, + "loss": 0.9378, + "num_tokens": 7318661850.0, + "step": 1751 + }, + { + "epoch": 0.20819964349376113, + "grad_norm": 0.7390929049719818, + "learning_rate": 1.9742510125475788e-05, + "loss": 0.9294, + "num_tokens": 7322848590.0, + "step": 1752 + }, + { + "epoch": 0.2083184789067142, + "grad_norm": 0.5935813039912301, + "learning_rate": 1.974209863936782e-05, + "loss": 0.9265, + "num_tokens": 7327035726.0, + "step": 1753 + }, + { + "epoch": 0.20843731431966725, + "grad_norm": 0.6471576025648766, + "learning_rate": 1.974168682951038e-05, + "loss": 0.9339, + "num_tokens": 7331223938.0, + "step": 1754 + }, + { + "epoch": 0.20855614973262032, + "grad_norm": 0.6977343691932346, + "learning_rate": 1.9741274695918725e-05, + "loss": 0.9119, + "num_tokens": 7335398841.0, + "step": 1755 + }, + { + "epoch": 0.2086749851455734, + "grad_norm": 0.5189090974610998, + "learning_rate": 1.974086223860811e-05, + "loss": 0.915, + "num_tokens": 7339588689.0, + "step": 1756 + }, + { + "epoch": 0.20879382055852644, + "grad_norm": 0.6743223558661166, + "learning_rate": 1.974044945759381e-05, + "loss": 0.939, + "num_tokens": 7343777232.0, + "step": 1757 + }, + { + "epoch": 0.2089126559714795, + "grad_norm": 0.6987122271456215, + "learning_rate": 1.974003635289112e-05, + "loss": 0.9304, + "num_tokens": 7347966196.0, + "step": 1758 + }, + { + "epoch": 0.20903149138443256, + "grad_norm": 0.6575684637550397, + "learning_rate": 1.973962292451533e-05, + "loss": 0.9234, + "num_tokens": 7352157493.0, + "step": 1759 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 0.6106862503079982, + "learning_rate": 1.9739209172481756e-05, + "loss": 0.9245, + "num_tokens": 7356346331.0, + "step": 1760 + }, + { + "epoch": 0.20926916221033867, + "grad_norm": 0.6523355743394285, + "learning_rate": 1.9738795096805716e-05, + "loss": 0.9457, + "num_tokens": 7360535496.0, + "step": 1761 + }, + { + "epoch": 0.20938799762329174, + "grad_norm": 0.6195690800742736, + "learning_rate": 1.9738380697502548e-05, + "loss": 0.9199, + "num_tokens": 7364725206.0, + "step": 1762 + }, + { + "epoch": 0.2095068330362448, + "grad_norm": 0.6731073622563277, + "learning_rate": 1.9737965974587595e-05, + "loss": 0.9297, + "num_tokens": 7368912996.0, + "step": 1763 + }, + { + "epoch": 0.20962566844919786, + "grad_norm": 0.6424741270325436, + "learning_rate": 1.9737550928076225e-05, + "loss": 0.9217, + "num_tokens": 7373090310.0, + "step": 1764 + }, + { + "epoch": 0.20974450386215093, + "grad_norm": 0.6743677129707877, + "learning_rate": 1.9737135557983794e-05, + "loss": 0.9178, + "num_tokens": 7377271846.0, + "step": 1765 + }, + { + "epoch": 0.20986333927510398, + "grad_norm": 0.594962277689161, + "learning_rate": 1.9736719864325696e-05, + "loss": 0.9114, + "num_tokens": 7381434579.0, + "step": 1766 + }, + { + "epoch": 0.20998217468805705, + "grad_norm": 0.7499743373358436, + "learning_rate": 1.9736303847117323e-05, + "loss": 0.8863, + "num_tokens": 7385624788.0, + "step": 1767 + }, + { + "epoch": 0.2101010101010101, + "grad_norm": 0.5649329416785327, + "learning_rate": 1.973588750637408e-05, + "loss": 0.9275, + "num_tokens": 7389754887.0, + "step": 1768 + }, + { + "epoch": 0.21021984551396317, + "grad_norm": 0.7107853792428144, + "learning_rate": 1.9735470842111383e-05, + "loss": 0.9412, + "num_tokens": 7393944012.0, + "step": 1769 + }, + { + "epoch": 0.2103386809269162, + "grad_norm": 0.6206503322526578, + "learning_rate": 1.973505385434467e-05, + "loss": 0.8974, + "num_tokens": 7398102659.0, + "step": 1770 + }, + { + "epoch": 0.21045751633986928, + "grad_norm": 0.6513886649891705, + "learning_rate": 1.9734636543089375e-05, + "loss": 0.895, + "num_tokens": 7402279613.0, + "step": 1771 + }, + { + "epoch": 0.21057635175282233, + "grad_norm": 0.7501366637967952, + "learning_rate": 1.9734218908360956e-05, + "loss": 0.9356, + "num_tokens": 7406446627.0, + "step": 1772 + }, + { + "epoch": 0.2106951871657754, + "grad_norm": 0.48443203108624283, + "learning_rate": 1.9733800950174883e-05, + "loss": 0.9203, + "num_tokens": 7410636839.0, + "step": 1773 + }, + { + "epoch": 0.21081402257872847, + "grad_norm": 0.6717392627764838, + "learning_rate": 1.973338266854663e-05, + "loss": 0.9319, + "num_tokens": 7414810337.0, + "step": 1774 + }, + { + "epoch": 0.21093285799168152, + "grad_norm": 0.601064487903266, + "learning_rate": 1.9732964063491686e-05, + "loss": 0.9394, + "num_tokens": 7418994654.0, + "step": 1775 + }, + { + "epoch": 0.2110516934046346, + "grad_norm": 0.5879659576213129, + "learning_rate": 1.9732545135025557e-05, + "loss": 0.9416, + "num_tokens": 7423180683.0, + "step": 1776 + }, + { + "epoch": 0.21117052881758763, + "grad_norm": 0.6740090480429939, + "learning_rate": 1.9732125883163756e-05, + "loss": 0.8968, + "num_tokens": 7427370691.0, + "step": 1777 + }, + { + "epoch": 0.2112893642305407, + "grad_norm": 0.6718786406442655, + "learning_rate": 1.9731706307921812e-05, + "loss": 0.9072, + "num_tokens": 7431558524.0, + "step": 1778 + }, + { + "epoch": 0.21140819964349375, + "grad_norm": 0.5522838422597842, + "learning_rate": 1.973128640931526e-05, + "loss": 0.9125, + "num_tokens": 7435722261.0, + "step": 1779 + }, + { + "epoch": 0.21152703505644682, + "grad_norm": 0.6137685777305348, + "learning_rate": 1.973086618735965e-05, + "loss": 0.9429, + "num_tokens": 7439911260.0, + "step": 1780 + }, + { + "epoch": 0.2116458704693999, + "grad_norm": 0.5858416042739591, + "learning_rate": 1.9730445642070547e-05, + "loss": 0.975, + "num_tokens": 7444099374.0, + "step": 1781 + }, + { + "epoch": 0.21176470588235294, + "grad_norm": 0.7053569280320994, + "learning_rate": 1.973002477346352e-05, + "loss": 0.9117, + "num_tokens": 7448289159.0, + "step": 1782 + }, + { + "epoch": 0.211883541295306, + "grad_norm": 0.5831601496103271, + "learning_rate": 1.9729603581554163e-05, + "loss": 0.9138, + "num_tokens": 7452477956.0, + "step": 1783 + }, + { + "epoch": 0.21200237670825905, + "grad_norm": 0.6546820218600554, + "learning_rate": 1.9729182066358067e-05, + "loss": 0.9433, + "num_tokens": 7456667018.0, + "step": 1784 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 0.6365720606654042, + "learning_rate": 1.9728760227890847e-05, + "loss": 0.9375, + "num_tokens": 7460829465.0, + "step": 1785 + }, + { + "epoch": 0.21224004753416517, + "grad_norm": 0.7397310622101644, + "learning_rate": 1.972833806616812e-05, + "loss": 0.9299, + "num_tokens": 7465019189.0, + "step": 1786 + }, + { + "epoch": 0.21235888294711824, + "grad_norm": 0.5659823305940258, + "learning_rate": 1.9727915581205523e-05, + "loss": 0.921, + "num_tokens": 7469191384.0, + "step": 1787 + }, + { + "epoch": 0.2124777183600713, + "grad_norm": 0.6578434610094127, + "learning_rate": 1.9727492773018705e-05, + "loss": 0.9035, + "num_tokens": 7473379618.0, + "step": 1788 + }, + { + "epoch": 0.21259655377302436, + "grad_norm": 0.6251584848535858, + "learning_rate": 1.9727069641623322e-05, + "loss": 0.9133, + "num_tokens": 7477569294.0, + "step": 1789 + }, + { + "epoch": 0.21271538918597743, + "grad_norm": 0.6459762254263368, + "learning_rate": 1.972664618703504e-05, + "loss": 0.9222, + "num_tokens": 7481745126.0, + "step": 1790 + }, + { + "epoch": 0.21283422459893048, + "grad_norm": 0.5797326854422291, + "learning_rate": 1.9726222409269547e-05, + "loss": 0.9326, + "num_tokens": 7485903822.0, + "step": 1791 + }, + { + "epoch": 0.21295306001188355, + "grad_norm": 0.598947555523464, + "learning_rate": 1.9725798308342535e-05, + "loss": 0.9163, + "num_tokens": 7490091218.0, + "step": 1792 + }, + { + "epoch": 0.2130718954248366, + "grad_norm": 0.6454125406093253, + "learning_rate": 1.9725373884269707e-05, + "loss": 0.9205, + "num_tokens": 7494266934.0, + "step": 1793 + }, + { + "epoch": 0.21319073083778967, + "grad_norm": 0.6455052532059954, + "learning_rate": 1.9724949137066782e-05, + "loss": 0.9291, + "num_tokens": 7498414187.0, + "step": 1794 + }, + { + "epoch": 0.2133095662507427, + "grad_norm": 0.6728605744042954, + "learning_rate": 1.9724524066749492e-05, + "loss": 0.9578, + "num_tokens": 7502602569.0, + "step": 1795 + }, + { + "epoch": 0.21342840166369578, + "grad_norm": 0.6654678917176491, + "learning_rate": 1.9724098673333575e-05, + "loss": 0.8824, + "num_tokens": 7506782981.0, + "step": 1796 + }, + { + "epoch": 0.21354723707664885, + "grad_norm": 0.5798903620092517, + "learning_rate": 1.9723672956834793e-05, + "loss": 0.9178, + "num_tokens": 7510925833.0, + "step": 1797 + }, + { + "epoch": 0.2136660724896019, + "grad_norm": 0.5846971022399556, + "learning_rate": 1.9723246917268903e-05, + "loss": 0.9058, + "num_tokens": 7515092452.0, + "step": 1798 + }, + { + "epoch": 0.21378490790255497, + "grad_norm": 0.6725710330545498, + "learning_rate": 1.9722820554651684e-05, + "loss": 0.9283, + "num_tokens": 7519270137.0, + "step": 1799 + }, + { + "epoch": 0.21390374331550802, + "grad_norm": 0.6012216538474553, + "learning_rate": 1.9722393868998924e-05, + "loss": 0.9372, + "num_tokens": 7523427180.0, + "step": 1800 + }, + { + "epoch": 0.2140225787284611, + "grad_norm": 0.6426613687885003, + "learning_rate": 1.9721966860326437e-05, + "loss": 0.9165, + "num_tokens": 7527606323.0, + "step": 1801 + }, + { + "epoch": 0.21414141414141413, + "grad_norm": 0.6890765024340603, + "learning_rate": 1.972153952865002e-05, + "loss": 0.9452, + "num_tokens": 7531796336.0, + "step": 1802 + }, + { + "epoch": 0.2142602495543672, + "grad_norm": 0.6315544596780304, + "learning_rate": 1.9721111873985512e-05, + "loss": 0.9099, + "num_tokens": 7535969804.0, + "step": 1803 + }, + { + "epoch": 0.21437908496732025, + "grad_norm": 0.6447676072114127, + "learning_rate": 1.9720683896348743e-05, + "loss": 0.9155, + "num_tokens": 7540117135.0, + "step": 1804 + }, + { + "epoch": 0.21449792038027332, + "grad_norm": 0.6001919757651432, + "learning_rate": 1.972025559575556e-05, + "loss": 0.8937, + "num_tokens": 7544275622.0, + "step": 1805 + }, + { + "epoch": 0.2146167557932264, + "grad_norm": 0.6189784637362886, + "learning_rate": 1.971982697222183e-05, + "loss": 0.927, + "num_tokens": 7548465126.0, + "step": 1806 + }, + { + "epoch": 0.21473559120617944, + "grad_norm": 0.6540277582327867, + "learning_rate": 1.9719398025763426e-05, + "loss": 0.8888, + "num_tokens": 7552626453.0, + "step": 1807 + }, + { + "epoch": 0.2148544266191325, + "grad_norm": 0.6585975791169729, + "learning_rate": 1.9718968756396232e-05, + "loss": 0.9418, + "num_tokens": 7556816154.0, + "step": 1808 + }, + { + "epoch": 0.21497326203208555, + "grad_norm": 0.6531133610838197, + "learning_rate": 1.9718539164136145e-05, + "loss": 0.9199, + "num_tokens": 7561005800.0, + "step": 1809 + }, + { + "epoch": 0.21509209744503863, + "grad_norm": 0.60231036689154, + "learning_rate": 1.9718109248999075e-05, + "loss": 0.9323, + "num_tokens": 7565196065.0, + "step": 1810 + }, + { + "epoch": 0.21521093285799167, + "grad_norm": 0.6842680446142354, + "learning_rate": 1.9717679011000944e-05, + "loss": 0.9751, + "num_tokens": 7569373010.0, + "step": 1811 + }, + { + "epoch": 0.21532976827094474, + "grad_norm": 0.6101116989002079, + "learning_rate": 1.9717248450157682e-05, + "loss": 0.96, + "num_tokens": 7573537200.0, + "step": 1812 + }, + { + "epoch": 0.2154486036838978, + "grad_norm": 0.651693645479536, + "learning_rate": 1.9716817566485234e-05, + "loss": 0.9211, + "num_tokens": 7577726649.0, + "step": 1813 + }, + { + "epoch": 0.21556743909685086, + "grad_norm": 0.7208861415288116, + "learning_rate": 1.9716386359999565e-05, + "loss": 0.8844, + "num_tokens": 7581904021.0, + "step": 1814 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 0.5690921218579591, + "learning_rate": 1.9715954830716636e-05, + "loss": 0.9529, + "num_tokens": 7586080608.0, + "step": 1815 + }, + { + "epoch": 0.21580510992275698, + "grad_norm": 0.665566258219471, + "learning_rate": 1.971552297865243e-05, + "loss": 0.9179, + "num_tokens": 7590243716.0, + "step": 1816 + }, + { + "epoch": 0.21592394533571005, + "grad_norm": 0.6313868673696471, + "learning_rate": 1.9715090803822937e-05, + "loss": 0.9185, + "num_tokens": 7594433278.0, + "step": 1817 + }, + { + "epoch": 0.2160427807486631, + "grad_norm": 0.7677947929103122, + "learning_rate": 1.971465830624417e-05, + "loss": 0.9303, + "num_tokens": 7598622916.0, + "step": 1818 + }, + { + "epoch": 0.21616161616161617, + "grad_norm": 0.5392662123375116, + "learning_rate": 1.9714225485932143e-05, + "loss": 0.9237, + "num_tokens": 7602812048.0, + "step": 1819 + }, + { + "epoch": 0.2162804515745692, + "grad_norm": 0.745177608627311, + "learning_rate": 1.9713792342902877e-05, + "loss": 0.9389, + "num_tokens": 7607000701.0, + "step": 1820 + }, + { + "epoch": 0.21639928698752228, + "grad_norm": 0.6053017938142369, + "learning_rate": 1.971335887717242e-05, + "loss": 0.9071, + "num_tokens": 7611173990.0, + "step": 1821 + }, + { + "epoch": 0.21651812240047535, + "grad_norm": 0.7384377160986118, + "learning_rate": 1.9712925088756822e-05, + "loss": 0.9156, + "num_tokens": 7615334669.0, + "step": 1822 + }, + { + "epoch": 0.2166369578134284, + "grad_norm": 0.5991264435028605, + "learning_rate": 1.971249097767215e-05, + "loss": 0.9337, + "num_tokens": 7619494260.0, + "step": 1823 + }, + { + "epoch": 0.21675579322638147, + "grad_norm": 0.5025262552148587, + "learning_rate": 1.971205654393448e-05, + "loss": 0.8998, + "num_tokens": 7623656078.0, + "step": 1824 + }, + { + "epoch": 0.21687462863933452, + "grad_norm": 0.6141485299319007, + "learning_rate": 1.9711621787559898e-05, + "loss": 0.9159, + "num_tokens": 7627845409.0, + "step": 1825 + }, + { + "epoch": 0.2169934640522876, + "grad_norm": 0.6089488137765279, + "learning_rate": 1.971118670856451e-05, + "loss": 0.9621, + "num_tokens": 7632034270.0, + "step": 1826 + }, + { + "epoch": 0.21711229946524063, + "grad_norm": 0.8745354561915256, + "learning_rate": 1.9710751306964425e-05, + "loss": 0.9432, + "num_tokens": 7636223960.0, + "step": 1827 + }, + { + "epoch": 0.2172311348781937, + "grad_norm": 0.519852328569586, + "learning_rate": 1.9710315582775762e-05, + "loss": 0.9167, + "num_tokens": 7640413348.0, + "step": 1828 + }, + { + "epoch": 0.21734997029114675, + "grad_norm": 0.799320343604616, + "learning_rate": 1.970987953601467e-05, + "loss": 0.9377, + "num_tokens": 7644601189.0, + "step": 1829 + }, + { + "epoch": 0.21746880570409982, + "grad_norm": 0.631926372604472, + "learning_rate": 1.9709443166697282e-05, + "loss": 0.8723, + "num_tokens": 7648772755.0, + "step": 1830 + }, + { + "epoch": 0.2175876411170529, + "grad_norm": 0.6197919947281108, + "learning_rate": 1.9709006474839772e-05, + "loss": 0.9248, + "num_tokens": 7652950098.0, + "step": 1831 + }, + { + "epoch": 0.21770647653000594, + "grad_norm": 0.6886630188805484, + "learning_rate": 1.9708569460458306e-05, + "loss": 0.9356, + "num_tokens": 7657139510.0, + "step": 1832 + }, + { + "epoch": 0.217825311942959, + "grad_norm": 0.6424588410317108, + "learning_rate": 1.970813212356907e-05, + "loss": 0.92, + "num_tokens": 7661328747.0, + "step": 1833 + }, + { + "epoch": 0.21794414735591205, + "grad_norm": 0.5800081204266064, + "learning_rate": 1.9707694464188256e-05, + "loss": 0.9014, + "num_tokens": 7665515790.0, + "step": 1834 + }, + { + "epoch": 0.21806298276886513, + "grad_norm": 0.5964706631622216, + "learning_rate": 1.9707256482332073e-05, + "loss": 0.9648, + "num_tokens": 7669675354.0, + "step": 1835 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 0.7298623776097144, + "learning_rate": 1.9706818178016745e-05, + "loss": 0.9091, + "num_tokens": 7673805254.0, + "step": 1836 + }, + { + "epoch": 0.21830065359477124, + "grad_norm": 0.5321382241861391, + "learning_rate": 1.9706379551258503e-05, + "loss": 0.9388, + "num_tokens": 7677992388.0, + "step": 1837 + }, + { + "epoch": 0.21841948900772432, + "grad_norm": 0.5912309423968349, + "learning_rate": 1.9705940602073587e-05, + "loss": 0.9044, + "num_tokens": 7682181374.0, + "step": 1838 + }, + { + "epoch": 0.21853832442067736, + "grad_norm": 0.5939828381508532, + "learning_rate": 1.9705501330478252e-05, + "loss": 0.896, + "num_tokens": 7686356016.0, + "step": 1839 + }, + { + "epoch": 0.21865715983363043, + "grad_norm": 0.7219846044943101, + "learning_rate": 1.9705061736488773e-05, + "loss": 0.9071, + "num_tokens": 7690506440.0, + "step": 1840 + }, + { + "epoch": 0.21877599524658348, + "grad_norm": 0.6061986705435096, + "learning_rate": 1.9704621820121425e-05, + "loss": 0.9074, + "num_tokens": 7694695241.0, + "step": 1841 + }, + { + "epoch": 0.21889483065953655, + "grad_norm": 0.6376871926859227, + "learning_rate": 1.9704181581392497e-05, + "loss": 0.9404, + "num_tokens": 7698876417.0, + "step": 1842 + }, + { + "epoch": 0.2190136660724896, + "grad_norm": 0.6670215043558968, + "learning_rate": 1.9703741020318296e-05, + "loss": 0.9091, + "num_tokens": 7703066449.0, + "step": 1843 + }, + { + "epoch": 0.21913250148544267, + "grad_norm": 0.7315812513502781, + "learning_rate": 1.970330013691514e-05, + "loss": 0.944, + "num_tokens": 7707207428.0, + "step": 1844 + }, + { + "epoch": 0.2192513368983957, + "grad_norm": 0.5611231891055108, + "learning_rate": 1.970285893119935e-05, + "loss": 0.9588, + "num_tokens": 7711396190.0, + "step": 1845 + }, + { + "epoch": 0.21937017231134878, + "grad_norm": 0.6990407495408056, + "learning_rate": 1.970241740318727e-05, + "loss": 0.9166, + "num_tokens": 7715569515.0, + "step": 1846 + }, + { + "epoch": 0.21948900772430185, + "grad_norm": 0.6101189643140927, + "learning_rate": 1.970197555289525e-05, + "loss": 0.9248, + "num_tokens": 7719736691.0, + "step": 1847 + }, + { + "epoch": 0.2196078431372549, + "grad_norm": 0.6297980825468901, + "learning_rate": 1.9701533380339658e-05, + "loss": 0.9221, + "num_tokens": 7723926051.0, + "step": 1848 + }, + { + "epoch": 0.21972667855020797, + "grad_norm": 0.6959234257404454, + "learning_rate": 1.970109088553686e-05, + "loss": 0.9773, + "num_tokens": 7728115064.0, + "step": 1849 + }, + { + "epoch": 0.21984551396316102, + "grad_norm": 0.5728308761108792, + "learning_rate": 1.9700648068503244e-05, + "loss": 0.9646, + "num_tokens": 7732254738.0, + "step": 1850 + }, + { + "epoch": 0.2199643493761141, + "grad_norm": 0.7737601240626822, + "learning_rate": 1.9700204929255217e-05, + "loss": 0.8966, + "num_tokens": 7736429086.0, + "step": 1851 + }, + { + "epoch": 0.22008318478906713, + "grad_norm": 0.5676207480449075, + "learning_rate": 1.9699761467809186e-05, + "loss": 0.8998, + "num_tokens": 7740617919.0, + "step": 1852 + }, + { + "epoch": 0.2202020202020202, + "grad_norm": 0.79611622690471, + "learning_rate": 1.9699317684181572e-05, + "loss": 0.9311, + "num_tokens": 7744807679.0, + "step": 1853 + }, + { + "epoch": 0.22032085561497325, + "grad_norm": 0.5107181618248139, + "learning_rate": 1.969887357838881e-05, + "loss": 0.9186, + "num_tokens": 7748995015.0, + "step": 1854 + }, + { + "epoch": 0.22043969102792632, + "grad_norm": 0.7027517811550776, + "learning_rate": 1.969842915044735e-05, + "loss": 0.8931, + "num_tokens": 7753184063.0, + "step": 1855 + }, + { + "epoch": 0.2205585264408794, + "grad_norm": 0.6420208459944878, + "learning_rate": 1.9697984400373643e-05, + "loss": 0.9086, + "num_tokens": 7757373719.0, + "step": 1856 + }, + { + "epoch": 0.22067736185383244, + "grad_norm": 0.7352317965295849, + "learning_rate": 1.969753932818417e-05, + "loss": 0.9161, + "num_tokens": 7761563520.0, + "step": 1857 + }, + { + "epoch": 0.2207961972667855, + "grad_norm": 0.6182221091312325, + "learning_rate": 1.969709393389541e-05, + "loss": 0.9173, + "num_tokens": 7765754485.0, + "step": 1858 + }, + { + "epoch": 0.22091503267973855, + "grad_norm": 0.6920830758700132, + "learning_rate": 1.9696648217523853e-05, + "loss": 0.9078, + "num_tokens": 7769944410.0, + "step": 1859 + }, + { + "epoch": 0.22103386809269163, + "grad_norm": 0.4722124469238995, + "learning_rate": 1.969620217908601e-05, + "loss": 0.9413, + "num_tokens": 7774134488.0, + "step": 1860 + }, + { + "epoch": 0.22115270350564467, + "grad_norm": 0.8064856389153017, + "learning_rate": 1.9695755818598396e-05, + "loss": 0.9324, + "num_tokens": 7778324068.0, + "step": 1861 + }, + { + "epoch": 0.22127153891859774, + "grad_norm": 0.6018226845676953, + "learning_rate": 1.9695309136077543e-05, + "loss": 0.9615, + "num_tokens": 7782496032.0, + "step": 1862 + }, + { + "epoch": 0.22139037433155082, + "grad_norm": 0.7064551883025002, + "learning_rate": 1.9694862131539995e-05, + "loss": 0.8974, + "num_tokens": 7786674362.0, + "step": 1863 + }, + { + "epoch": 0.22150920974450386, + "grad_norm": 0.6031794147628704, + "learning_rate": 1.9694414805002302e-05, + "loss": 0.9225, + "num_tokens": 7790856181.0, + "step": 1864 + }, + { + "epoch": 0.22162804515745693, + "grad_norm": 0.7297187309798677, + "learning_rate": 1.9693967156481034e-05, + "loss": 0.9723, + "num_tokens": 7795045668.0, + "step": 1865 + }, + { + "epoch": 0.22174688057040998, + "grad_norm": 0.6501865797101686, + "learning_rate": 1.9693519185992768e-05, + "loss": 0.9263, + "num_tokens": 7799236699.0, + "step": 1866 + }, + { + "epoch": 0.22186571598336305, + "grad_norm": 0.6486673442815615, + "learning_rate": 1.9693070893554088e-05, + "loss": 0.9163, + "num_tokens": 7803425828.0, + "step": 1867 + }, + { + "epoch": 0.2219845513963161, + "grad_norm": 0.6002996365746139, + "learning_rate": 1.9692622279181607e-05, + "loss": 0.8824, + "num_tokens": 7807604266.0, + "step": 1868 + }, + { + "epoch": 0.22210338680926917, + "grad_norm": 0.6251454881103964, + "learning_rate": 1.969217334289193e-05, + "loss": 0.8741, + "num_tokens": 7811766168.0, + "step": 1869 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.6941360576123251, + "learning_rate": 1.9691724084701684e-05, + "loss": 0.9346, + "num_tokens": 7815956264.0, + "step": 1870 + }, + { + "epoch": 0.22234105763517528, + "grad_norm": 0.5527608916932435, + "learning_rate": 1.969127450462751e-05, + "loss": 0.8842, + "num_tokens": 7820136628.0, + "step": 1871 + }, + { + "epoch": 0.22245989304812835, + "grad_norm": 0.6947934482855251, + "learning_rate": 1.9690824602686053e-05, + "loss": 0.9599, + "num_tokens": 7824325157.0, + "step": 1872 + }, + { + "epoch": 0.2225787284610814, + "grad_norm": 0.6517888741685237, + "learning_rate": 1.969037437889398e-05, + "loss": 0.9028, + "num_tokens": 7828512236.0, + "step": 1873 + }, + { + "epoch": 0.22269756387403447, + "grad_norm": 0.6259949079429763, + "learning_rate": 1.9689923833267956e-05, + "loss": 0.9142, + "num_tokens": 7832661347.0, + "step": 1874 + }, + { + "epoch": 0.22281639928698752, + "grad_norm": 0.5968000486194682, + "learning_rate": 1.9689472965824675e-05, + "loss": 0.8986, + "num_tokens": 7836827695.0, + "step": 1875 + }, + { + "epoch": 0.2229352346999406, + "grad_norm": 0.6721079306668741, + "learning_rate": 1.968902177658083e-05, + "loss": 0.9257, + "num_tokens": 7840997957.0, + "step": 1876 + }, + { + "epoch": 0.22305407011289363, + "grad_norm": 0.6375007665246523, + "learning_rate": 1.9688570265553128e-05, + "loss": 0.9532, + "num_tokens": 7845162085.0, + "step": 1877 + }, + { + "epoch": 0.2231729055258467, + "grad_norm": 0.6891803105032092, + "learning_rate": 1.9688118432758296e-05, + "loss": 0.9169, + "num_tokens": 7849321698.0, + "step": 1878 + }, + { + "epoch": 0.22329174093879975, + "grad_norm": 0.6132759122092845, + "learning_rate": 1.968766627821306e-05, + "loss": 0.9191, + "num_tokens": 7853481372.0, + "step": 1879 + }, + { + "epoch": 0.22341057635175282, + "grad_norm": 0.6537179106003063, + "learning_rate": 1.9687213801934168e-05, + "loss": 0.9105, + "num_tokens": 7857669879.0, + "step": 1880 + }, + { + "epoch": 0.2235294117647059, + "grad_norm": 0.6679030183110662, + "learning_rate": 1.9686761003938377e-05, + "loss": 0.898, + "num_tokens": 7861841157.0, + "step": 1881 + }, + { + "epoch": 0.22364824717765894, + "grad_norm": 0.5687409506765583, + "learning_rate": 1.9686307884242458e-05, + "loss": 0.9256, + "num_tokens": 7866030343.0, + "step": 1882 + }, + { + "epoch": 0.223767082590612, + "grad_norm": 0.679146588471286, + "learning_rate": 1.968585444286319e-05, + "loss": 0.9203, + "num_tokens": 7870213361.0, + "step": 1883 + }, + { + "epoch": 0.22388591800356505, + "grad_norm": 0.6110577857126741, + "learning_rate": 1.9685400679817357e-05, + "loss": 0.9296, + "num_tokens": 7874353309.0, + "step": 1884 + }, + { + "epoch": 0.22400475341651813, + "grad_norm": 0.6686237209302769, + "learning_rate": 1.9684946595121775e-05, + "loss": 0.9027, + "num_tokens": 7878542686.0, + "step": 1885 + }, + { + "epoch": 0.22412358882947117, + "grad_norm": 0.7074495138421453, + "learning_rate": 1.968449218879326e-05, + "loss": 0.9219, + "num_tokens": 7882732249.0, + "step": 1886 + }, + { + "epoch": 0.22424242424242424, + "grad_norm": 0.5693398110488429, + "learning_rate": 1.968403746084863e-05, + "loss": 0.8718, + "num_tokens": 7886899471.0, + "step": 1887 + }, + { + "epoch": 0.22436125965537732, + "grad_norm": 0.739570868087973, + "learning_rate": 1.9683582411304735e-05, + "loss": 0.926, + "num_tokens": 7891088239.0, + "step": 1888 + }, + { + "epoch": 0.22448009506833036, + "grad_norm": 0.655177805082727, + "learning_rate": 1.968312704017842e-05, + "loss": 0.9863, + "num_tokens": 7895277173.0, + "step": 1889 + }, + { + "epoch": 0.22459893048128343, + "grad_norm": 0.5632224711785905, + "learning_rate": 1.9682671347486553e-05, + "loss": 0.9344, + "num_tokens": 7899463432.0, + "step": 1890 + }, + { + "epoch": 0.22471776589423648, + "grad_norm": 0.820360949965603, + "learning_rate": 1.9682215333246015e-05, + "loss": 0.9098, + "num_tokens": 7903653362.0, + "step": 1891 + }, + { + "epoch": 0.22483660130718955, + "grad_norm": 0.5885591191512289, + "learning_rate": 1.9681758997473683e-05, + "loss": 0.9532, + "num_tokens": 7907843679.0, + "step": 1892 + }, + { + "epoch": 0.2249554367201426, + "grad_norm": 0.5957564436723485, + "learning_rate": 1.9681302340186463e-05, + "loss": 0.939, + "num_tokens": 7912033897.0, + "step": 1893 + }, + { + "epoch": 0.22507427213309567, + "grad_norm": 0.6179556511754555, + "learning_rate": 1.9680845361401263e-05, + "loss": 0.9341, + "num_tokens": 7916190225.0, + "step": 1894 + }, + { + "epoch": 0.2251931075460487, + "grad_norm": 0.6947776120261914, + "learning_rate": 1.9680388061135016e-05, + "loss": 0.9271, + "num_tokens": 7920380308.0, + "step": 1895 + }, + { + "epoch": 0.22531194295900178, + "grad_norm": 0.6870663876159689, + "learning_rate": 1.967993043940464e-05, + "loss": 0.9058, + "num_tokens": 7924554833.0, + "step": 1896 + }, + { + "epoch": 0.22543077837195485, + "grad_norm": 0.664561758606933, + "learning_rate": 1.9679472496227097e-05, + "loss": 0.9377, + "num_tokens": 7928744537.0, + "step": 1897 + }, + { + "epoch": 0.2255496137849079, + "grad_norm": 0.5928965580131066, + "learning_rate": 1.967901423161934e-05, + "loss": 0.9507, + "num_tokens": 7932910600.0, + "step": 1898 + }, + { + "epoch": 0.22566844919786097, + "grad_norm": 0.6913279933243857, + "learning_rate": 1.9678555645598343e-05, + "loss": 0.9465, + "num_tokens": 7937099649.0, + "step": 1899 + }, + { + "epoch": 0.22578728461081402, + "grad_norm": 0.5549473663158883, + "learning_rate": 1.967809673818109e-05, + "loss": 0.8985, + "num_tokens": 7941289509.0, + "step": 1900 + }, + { + "epoch": 0.2259061200237671, + "grad_norm": 0.6204866564131293, + "learning_rate": 1.967763750938457e-05, + "loss": 0.9383, + "num_tokens": 7945477226.0, + "step": 1901 + }, + { + "epoch": 0.22602495543672013, + "grad_norm": 0.6979699285919612, + "learning_rate": 1.967717795922579e-05, + "loss": 0.9011, + "num_tokens": 7949666735.0, + "step": 1902 + }, + { + "epoch": 0.2261437908496732, + "grad_norm": 0.6435763609100217, + "learning_rate": 1.9676718087721778e-05, + "loss": 0.9368, + "num_tokens": 7953850993.0, + "step": 1903 + }, + { + "epoch": 0.22626262626262628, + "grad_norm": 0.6817925662543848, + "learning_rate": 1.9676257894889556e-05, + "loss": 0.941, + "num_tokens": 7958040286.0, + "step": 1904 + }, + { + "epoch": 0.22638146167557932, + "grad_norm": 0.5804440121568555, + "learning_rate": 1.9675797380746172e-05, + "loss": 0.9304, + "num_tokens": 7962229827.0, + "step": 1905 + }, + { + "epoch": 0.2265002970885324, + "grad_norm": 0.6764908895366795, + "learning_rate": 1.967533654530867e-05, + "loss": 0.9142, + "num_tokens": 7966419233.0, + "step": 1906 + }, + { + "epoch": 0.22661913250148544, + "grad_norm": 0.6862577081302531, + "learning_rate": 1.9674875388594134e-05, + "loss": 0.934, + "num_tokens": 7970608451.0, + "step": 1907 + }, + { + "epoch": 0.2267379679144385, + "grad_norm": 0.7003104774787313, + "learning_rate": 1.9674413910619624e-05, + "loss": 0.9374, + "num_tokens": 7974765674.0, + "step": 1908 + }, + { + "epoch": 0.22685680332739155, + "grad_norm": 0.6279129810924803, + "learning_rate": 1.967395211140224e-05, + "loss": 0.9354, + "num_tokens": 7978949646.0, + "step": 1909 + }, + { + "epoch": 0.22697563874034463, + "grad_norm": 0.61593182495211, + "learning_rate": 1.9673489990959082e-05, + "loss": 0.9472, + "num_tokens": 7983136509.0, + "step": 1910 + }, + { + "epoch": 0.22709447415329767, + "grad_norm": 0.5450721060564918, + "learning_rate": 1.9673027549307263e-05, + "loss": 0.9183, + "num_tokens": 7987297572.0, + "step": 1911 + }, + { + "epoch": 0.22721330956625074, + "grad_norm": 0.7173656025661154, + "learning_rate": 1.9672564786463912e-05, + "loss": 0.9168, + "num_tokens": 7991451890.0, + "step": 1912 + }, + { + "epoch": 0.22733214497920382, + "grad_norm": 0.5814035001349702, + "learning_rate": 1.9672101702446165e-05, + "loss": 0.9405, + "num_tokens": 7995575077.0, + "step": 1913 + }, + { + "epoch": 0.22745098039215686, + "grad_norm": 0.7239466826925066, + "learning_rate": 1.967163829727117e-05, + "loss": 0.9226, + "num_tokens": 7999757988.0, + "step": 1914 + }, + { + "epoch": 0.22756981580510993, + "grad_norm": 0.6651893233166847, + "learning_rate": 1.9671174570956085e-05, + "loss": 0.9094, + "num_tokens": 8003943404.0, + "step": 1915 + }, + { + "epoch": 0.22768865121806298, + "grad_norm": 0.5593553896926021, + "learning_rate": 1.9670710523518093e-05, + "loss": 0.8886, + "num_tokens": 8008113796.0, + "step": 1916 + }, + { + "epoch": 0.22780748663101605, + "grad_norm": 0.617056951231659, + "learning_rate": 1.9670246154974374e-05, + "loss": 0.9543, + "num_tokens": 8012302299.0, + "step": 1917 + }, + { + "epoch": 0.2279263220439691, + "grad_norm": 0.6599818675168246, + "learning_rate": 1.966978146534212e-05, + "loss": 0.9472, + "num_tokens": 8016482253.0, + "step": 1918 + }, + { + "epoch": 0.22804515745692217, + "grad_norm": 0.5382011037947826, + "learning_rate": 1.966931645463855e-05, + "loss": 0.9606, + "num_tokens": 8020647700.0, + "step": 1919 + }, + { + "epoch": 0.2281639928698752, + "grad_norm": 0.6701920434848027, + "learning_rate": 1.966885112288088e-05, + "loss": 0.9165, + "num_tokens": 8024835809.0, + "step": 1920 + }, + { + "epoch": 0.22828282828282828, + "grad_norm": 0.6005792849232672, + "learning_rate": 1.966838547008634e-05, + "loss": 0.9146, + "num_tokens": 8029007163.0, + "step": 1921 + }, + { + "epoch": 0.22840166369578135, + "grad_norm": 0.5597685169622607, + "learning_rate": 1.966791949627218e-05, + "loss": 0.9151, + "num_tokens": 8033181980.0, + "step": 1922 + }, + { + "epoch": 0.2285204991087344, + "grad_norm": 0.6504813163983413, + "learning_rate": 1.9667453201455652e-05, + "loss": 0.9771, + "num_tokens": 8037338883.0, + "step": 1923 + }, + { + "epoch": 0.22863933452168747, + "grad_norm": 0.6216060479913836, + "learning_rate": 1.9666986585654028e-05, + "loss": 0.9125, + "num_tokens": 8041526857.0, + "step": 1924 + }, + { + "epoch": 0.22875816993464052, + "grad_norm": 0.5441639246963003, + "learning_rate": 1.9666519648884586e-05, + "loss": 0.9148, + "num_tokens": 8045702913.0, + "step": 1925 + }, + { + "epoch": 0.2288770053475936, + "grad_norm": 0.6252821432399917, + "learning_rate": 1.966605239116462e-05, + "loss": 0.8972, + "num_tokens": 8049873363.0, + "step": 1926 + }, + { + "epoch": 0.22899584076054663, + "grad_norm": 0.6950217683419091, + "learning_rate": 1.966558481251143e-05, + "loss": 0.9601, + "num_tokens": 8054064690.0, + "step": 1927 + }, + { + "epoch": 0.2291146761734997, + "grad_norm": 0.5041094096088279, + "learning_rate": 1.9665116912942342e-05, + "loss": 0.9222, + "num_tokens": 8058250216.0, + "step": 1928 + }, + { + "epoch": 0.22923351158645278, + "grad_norm": 0.6421092656646715, + "learning_rate": 1.9664648692474672e-05, + "loss": 0.9442, + "num_tokens": 8062419941.0, + "step": 1929 + }, + { + "epoch": 0.22935234699940582, + "grad_norm": 0.6644133276060161, + "learning_rate": 1.9664180151125768e-05, + "loss": 0.9612, + "num_tokens": 8066608914.0, + "step": 1930 + }, + { + "epoch": 0.2294711824123589, + "grad_norm": 0.7142292798943987, + "learning_rate": 1.9663711288912977e-05, + "loss": 0.9468, + "num_tokens": 8070798326.0, + "step": 1931 + }, + { + "epoch": 0.22959001782531194, + "grad_norm": 0.5207586343134231, + "learning_rate": 1.9663242105853667e-05, + "loss": 0.9545, + "num_tokens": 8074984338.0, + "step": 1932 + }, + { + "epoch": 0.229708853238265, + "grad_norm": 0.716660513206123, + "learning_rate": 1.9662772601965205e-05, + "loss": 0.9399, + "num_tokens": 8079175822.0, + "step": 1933 + }, + { + "epoch": 0.22982768865121805, + "grad_norm": 0.528347527295126, + "learning_rate": 1.9662302777264987e-05, + "loss": 0.9585, + "num_tokens": 8083365904.0, + "step": 1934 + }, + { + "epoch": 0.22994652406417113, + "grad_norm": 0.7924490443363846, + "learning_rate": 1.966183263177041e-05, + "loss": 0.9209, + "num_tokens": 8087538478.0, + "step": 1935 + }, + { + "epoch": 0.23006535947712417, + "grad_norm": 0.5547694286899216, + "learning_rate": 1.966136216549888e-05, + "loss": 0.9335, + "num_tokens": 8091726727.0, + "step": 1936 + }, + { + "epoch": 0.23018419489007724, + "grad_norm": 0.6664370582782931, + "learning_rate": 1.966089137846783e-05, + "loss": 0.9666, + "num_tokens": 8095898168.0, + "step": 1937 + }, + { + "epoch": 0.23030303030303031, + "grad_norm": 0.5550777154433306, + "learning_rate": 1.9660420270694688e-05, + "loss": 0.9462, + "num_tokens": 8100086147.0, + "step": 1938 + }, + { + "epoch": 0.23042186571598336, + "grad_norm": 0.7227747755639123, + "learning_rate": 1.9659948842196903e-05, + "loss": 0.936, + "num_tokens": 8104275074.0, + "step": 1939 + }, + { + "epoch": 0.23054070112893643, + "grad_norm": 0.5761564957852987, + "learning_rate": 1.965947709299193e-05, + "loss": 0.9182, + "num_tokens": 8108462333.0, + "step": 1940 + }, + { + "epoch": 0.23065953654188948, + "grad_norm": 0.7106817349627713, + "learning_rate": 1.9659005023097243e-05, + "loss": 0.9022, + "num_tokens": 8112580881.0, + "step": 1941 + }, + { + "epoch": 0.23077837195484255, + "grad_norm": 0.5473050991348458, + "learning_rate": 1.965853263253032e-05, + "loss": 0.9227, + "num_tokens": 8116760858.0, + "step": 1942 + }, + { + "epoch": 0.2308972073677956, + "grad_norm": 0.632698434095111, + "learning_rate": 1.9658059921308662e-05, + "loss": 0.9057, + "num_tokens": 8120947480.0, + "step": 1943 + }, + { + "epoch": 0.23101604278074866, + "grad_norm": 0.6649341899631767, + "learning_rate": 1.965758688944977e-05, + "loss": 0.9508, + "num_tokens": 8125120138.0, + "step": 1944 + }, + { + "epoch": 0.2311348781937017, + "grad_norm": 0.5235444669161341, + "learning_rate": 1.9657113536971168e-05, + "loss": 0.8951, + "num_tokens": 8129309771.0, + "step": 1945 + }, + { + "epoch": 0.23125371360665478, + "grad_norm": 0.752202445968234, + "learning_rate": 1.9656639863890374e-05, + "loss": 0.9209, + "num_tokens": 8133500416.0, + "step": 1946 + }, + { + "epoch": 0.23137254901960785, + "grad_norm": 0.6132647148157434, + "learning_rate": 1.965616587022494e-05, + "loss": 0.9455, + "num_tokens": 8137689422.0, + "step": 1947 + }, + { + "epoch": 0.2314913844325609, + "grad_norm": 0.5416848367025928, + "learning_rate": 1.9655691555992417e-05, + "loss": 0.8606, + "num_tokens": 8141853265.0, + "step": 1948 + }, + { + "epoch": 0.23161021984551397, + "grad_norm": 0.6424704418359779, + "learning_rate": 1.965521692121037e-05, + "loss": 0.948, + "num_tokens": 8146042320.0, + "step": 1949 + }, + { + "epoch": 0.23172905525846701, + "grad_norm": 0.5822975621830174, + "learning_rate": 1.9654741965896375e-05, + "loss": 0.9224, + "num_tokens": 8150231183.0, + "step": 1950 + }, + { + "epoch": 0.2318478906714201, + "grad_norm": 0.6825992464235847, + "learning_rate": 1.9654266690068028e-05, + "loss": 0.9684, + "num_tokens": 8154418307.0, + "step": 1951 + }, + { + "epoch": 0.23196672608437313, + "grad_norm": 0.6071885147205139, + "learning_rate": 1.965379109374292e-05, + "loss": 0.9309, + "num_tokens": 8158609197.0, + "step": 1952 + }, + { + "epoch": 0.2320855614973262, + "grad_norm": 0.5578591115016377, + "learning_rate": 1.965331517693867e-05, + "loss": 0.9228, + "num_tokens": 8162797991.0, + "step": 1953 + }, + { + "epoch": 0.23220439691027928, + "grad_norm": 0.572080491059658, + "learning_rate": 1.96528389396729e-05, + "loss": 0.9058, + "num_tokens": 8166987118.0, + "step": 1954 + }, + { + "epoch": 0.23232323232323232, + "grad_norm": 0.7149767023185012, + "learning_rate": 1.9652362381963247e-05, + "loss": 0.8783, + "num_tokens": 8171150896.0, + "step": 1955 + }, + { + "epoch": 0.2324420677361854, + "grad_norm": 0.5671153315100605, + "learning_rate": 1.9651885503827363e-05, + "loss": 0.8736, + "num_tokens": 8175319092.0, + "step": 1956 + }, + { + "epoch": 0.23256090314913844, + "grad_norm": 0.7621683520211446, + "learning_rate": 1.9651408305282907e-05, + "loss": 0.9201, + "num_tokens": 8179508426.0, + "step": 1957 + }, + { + "epoch": 0.2326797385620915, + "grad_norm": 0.5083887477434527, + "learning_rate": 1.965093078634755e-05, + "loss": 0.9043, + "num_tokens": 8183696517.0, + "step": 1958 + }, + { + "epoch": 0.23279857397504455, + "grad_norm": 0.5880021373502085, + "learning_rate": 1.9650452947038973e-05, + "loss": 0.9117, + "num_tokens": 8187885696.0, + "step": 1959 + }, + { + "epoch": 0.23291740938799763, + "grad_norm": 0.8360828455942336, + "learning_rate": 1.964997478737488e-05, + "loss": 0.963, + "num_tokens": 8192060812.0, + "step": 1960 + }, + { + "epoch": 0.23303624480095067, + "grad_norm": 0.5138926581732076, + "learning_rate": 1.964949630737297e-05, + "loss": 0.934, + "num_tokens": 8196251053.0, + "step": 1961 + }, + { + "epoch": 0.23315508021390374, + "grad_norm": 0.7216655814775077, + "learning_rate": 1.964901750705097e-05, + "loss": 0.9374, + "num_tokens": 8200432288.0, + "step": 1962 + }, + { + "epoch": 0.23327391562685681, + "grad_norm": 0.6036141553994679, + "learning_rate": 1.964853838642661e-05, + "loss": 0.9025, + "num_tokens": 8204621494.0, + "step": 1963 + }, + { + "epoch": 0.23339275103980986, + "grad_norm": 0.7644642142940643, + "learning_rate": 1.9648058945517628e-05, + "loss": 0.9141, + "num_tokens": 8208765926.0, + "step": 1964 + }, + { + "epoch": 0.23351158645276293, + "grad_norm": 0.5753918999309612, + "learning_rate": 1.9647579184341787e-05, + "loss": 0.9162, + "num_tokens": 8212954398.0, + "step": 1965 + }, + { + "epoch": 0.23363042186571598, + "grad_norm": 0.6173955843028358, + "learning_rate": 1.964709910291685e-05, + "loss": 0.9459, + "num_tokens": 8217143357.0, + "step": 1966 + }, + { + "epoch": 0.23374925727866905, + "grad_norm": 0.5358200962766574, + "learning_rate": 1.9646618701260595e-05, + "loss": 0.9502, + "num_tokens": 8221331479.0, + "step": 1967 + }, + { + "epoch": 0.2338680926916221, + "grad_norm": 0.5848355099807726, + "learning_rate": 1.9646137979390816e-05, + "loss": 0.8744, + "num_tokens": 8225520106.0, + "step": 1968 + }, + { + "epoch": 0.23398692810457516, + "grad_norm": 0.6251353426524995, + "learning_rate": 1.9645656937325315e-05, + "loss": 0.8917, + "num_tokens": 8229710296.0, + "step": 1969 + }, + { + "epoch": 0.23410576351752824, + "grad_norm": 0.61581117598947, + "learning_rate": 1.9645175575081904e-05, + "loss": 0.9597, + "num_tokens": 8233837895.0, + "step": 1970 + }, + { + "epoch": 0.23422459893048128, + "grad_norm": 0.6012636378419228, + "learning_rate": 1.9644693892678414e-05, + "loss": 0.9309, + "num_tokens": 8237995659.0, + "step": 1971 + }, + { + "epoch": 0.23434343434343435, + "grad_norm": 0.6818952932614715, + "learning_rate": 1.9644211890132678e-05, + "loss": 0.9567, + "num_tokens": 8242161464.0, + "step": 1972 + }, + { + "epoch": 0.2344622697563874, + "grad_norm": 0.6392961821567884, + "learning_rate": 1.964372956746255e-05, + "loss": 0.9357, + "num_tokens": 8246350450.0, + "step": 1973 + }, + { + "epoch": 0.23458110516934047, + "grad_norm": 0.6608379120937813, + "learning_rate": 1.964324692468589e-05, + "loss": 0.9701, + "num_tokens": 8250499345.0, + "step": 1974 + }, + { + "epoch": 0.23469994058229351, + "grad_norm": 0.5529006437716985, + "learning_rate": 1.9642763961820575e-05, + "loss": 0.946, + "num_tokens": 8254687231.0, + "step": 1975 + }, + { + "epoch": 0.2348187759952466, + "grad_norm": 0.699836805151762, + "learning_rate": 1.9642280678884488e-05, + "loss": 0.9523, + "num_tokens": 8258875527.0, + "step": 1976 + }, + { + "epoch": 0.23493761140819963, + "grad_norm": 0.6115535327214101, + "learning_rate": 1.9641797075895525e-05, + "loss": 0.9001, + "num_tokens": 8263065163.0, + "step": 1977 + }, + { + "epoch": 0.2350564468211527, + "grad_norm": 0.6766892578326416, + "learning_rate": 1.9641313152871602e-05, + "loss": 0.9475, + "num_tokens": 8267255710.0, + "step": 1978 + }, + { + "epoch": 0.23517528223410578, + "grad_norm": 0.6000682943141737, + "learning_rate": 1.964082890983063e-05, + "loss": 0.894, + "num_tokens": 8271419103.0, + "step": 1979 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.680227914919709, + "learning_rate": 1.9640344346790555e-05, + "loss": 0.9473, + "num_tokens": 8275582223.0, + "step": 1980 + }, + { + "epoch": 0.2354129530600119, + "grad_norm": 0.6104281796743392, + "learning_rate": 1.963985946376931e-05, + "loss": 0.9198, + "num_tokens": 8279771840.0, + "step": 1981 + }, + { + "epoch": 0.23553178847296494, + "grad_norm": 0.6735019228653667, + "learning_rate": 1.963937426078486e-05, + "loss": 0.8929, + "num_tokens": 8283932099.0, + "step": 1982 + }, + { + "epoch": 0.235650623885918, + "grad_norm": 0.6128722443173318, + "learning_rate": 1.9638888737855167e-05, + "loss": 0.9187, + "num_tokens": 8288077583.0, + "step": 1983 + }, + { + "epoch": 0.23576945929887105, + "grad_norm": 0.626359407908476, + "learning_rate": 1.963840289499822e-05, + "loss": 0.8947, + "num_tokens": 8292243424.0, + "step": 1984 + }, + { + "epoch": 0.23588829471182413, + "grad_norm": 0.5087420697076045, + "learning_rate": 1.9637916732232005e-05, + "loss": 0.9159, + "num_tokens": 8296431688.0, + "step": 1985 + }, + { + "epoch": 0.23600713012477717, + "grad_norm": 0.6331052247153857, + "learning_rate": 1.9637430249574526e-05, + "loss": 0.9541, + "num_tokens": 8300604189.0, + "step": 1986 + }, + { + "epoch": 0.23612596553773024, + "grad_norm": 0.6397158583398912, + "learning_rate": 1.9636943447043802e-05, + "loss": 0.9204, + "num_tokens": 8304794720.0, + "step": 1987 + }, + { + "epoch": 0.23624480095068331, + "grad_norm": 0.6519009904952208, + "learning_rate": 1.963645632465786e-05, + "loss": 0.9202, + "num_tokens": 8308953398.0, + "step": 1988 + }, + { + "epoch": 0.23636363636363636, + "grad_norm": 0.6574947363279467, + "learning_rate": 1.963596888243474e-05, + "loss": 0.9454, + "num_tokens": 8313142215.0, + "step": 1989 + }, + { + "epoch": 0.23648247177658943, + "grad_norm": 0.5401614173506252, + "learning_rate": 1.9635481120392496e-05, + "loss": 0.9558, + "num_tokens": 8317330251.0, + "step": 1990 + }, + { + "epoch": 0.23660130718954248, + "grad_norm": 0.5752680142693326, + "learning_rate": 1.9634993038549183e-05, + "loss": 0.9399, + "num_tokens": 8321519359.0, + "step": 1991 + }, + { + "epoch": 0.23672014260249555, + "grad_norm": 0.5680894868327601, + "learning_rate": 1.9634504636922885e-05, + "loss": 0.9723, + "num_tokens": 8325705904.0, + "step": 1992 + }, + { + "epoch": 0.2368389780154486, + "grad_norm": 0.6960837663669281, + "learning_rate": 1.9634015915531688e-05, + "loss": 0.9338, + "num_tokens": 8329896133.0, + "step": 1993 + }, + { + "epoch": 0.23695781342840166, + "grad_norm": 0.696389193199682, + "learning_rate": 1.963352687439369e-05, + "loss": 0.8734, + "num_tokens": 8334056011.0, + "step": 1994 + }, + { + "epoch": 0.23707664884135474, + "grad_norm": 0.5713814298249277, + "learning_rate": 1.9633037513526997e-05, + "loss": 0.9444, + "num_tokens": 8338243700.0, + "step": 1995 + }, + { + "epoch": 0.23719548425430778, + "grad_norm": 0.5934128928980311, + "learning_rate": 1.963254783294974e-05, + "loss": 0.9335, + "num_tokens": 8342432107.0, + "step": 1996 + }, + { + "epoch": 0.23731431966726085, + "grad_norm": 0.6194935145533786, + "learning_rate": 1.9632057832680045e-05, + "loss": 0.921, + "num_tokens": 8346597905.0, + "step": 1997 + }, + { + "epoch": 0.2374331550802139, + "grad_norm": 0.626979342437152, + "learning_rate": 1.9631567512736067e-05, + "loss": 0.9369, + "num_tokens": 8350785077.0, + "step": 1998 + }, + { + "epoch": 0.23755199049316697, + "grad_norm": 0.6299021751418925, + "learning_rate": 1.963107687313596e-05, + "loss": 0.9099, + "num_tokens": 8354973348.0, + "step": 1999 + }, + { + "epoch": 0.23767082590612001, + "grad_norm": 0.5957353477585431, + "learning_rate": 1.963058591389789e-05, + "loss": 0.9282, + "num_tokens": 8359162072.0, + "step": 2000 + }, + { + "epoch": 0.2377896613190731, + "grad_norm": 0.5970621295025091, + "learning_rate": 1.9630094635040046e-05, + "loss": 0.9012, + "num_tokens": 8363320724.0, + "step": 2001 + }, + { + "epoch": 0.23790849673202613, + "grad_norm": 0.6479239036752051, + "learning_rate": 1.9629603036580616e-05, + "loss": 0.9507, + "num_tokens": 8367485708.0, + "step": 2002 + }, + { + "epoch": 0.2380273321449792, + "grad_norm": 0.6033696711522546, + "learning_rate": 1.962911111853781e-05, + "loss": 0.8994, + "num_tokens": 8371635152.0, + "step": 2003 + }, + { + "epoch": 0.23814616755793228, + "grad_norm": 0.599602550098261, + "learning_rate": 1.9628618880929847e-05, + "loss": 0.9575, + "num_tokens": 8375824210.0, + "step": 2004 + }, + { + "epoch": 0.23826500297088532, + "grad_norm": 0.6247701355369774, + "learning_rate": 1.9628126323774946e-05, + "loss": 0.9629, + "num_tokens": 8380004288.0, + "step": 2005 + }, + { + "epoch": 0.2383838383838384, + "grad_norm": 0.6568627677146686, + "learning_rate": 1.962763344709136e-05, + "loss": 0.8938, + "num_tokens": 8384194328.0, + "step": 2006 + }, + { + "epoch": 0.23850267379679144, + "grad_norm": 0.6901212791448965, + "learning_rate": 1.9627140250897335e-05, + "loss": 0.8961, + "num_tokens": 8388383808.0, + "step": 2007 + }, + { + "epoch": 0.2386215092097445, + "grad_norm": 0.5959260280452081, + "learning_rate": 1.962664673521114e-05, + "loss": 0.9168, + "num_tokens": 8392572773.0, + "step": 2008 + }, + { + "epoch": 0.23874034462269755, + "grad_norm": 0.4665051790783217, + "learning_rate": 1.9626152900051044e-05, + "loss": 0.8965, + "num_tokens": 8396761819.0, + "step": 2009 + }, + { + "epoch": 0.23885918003565063, + "grad_norm": 0.7354197114997956, + "learning_rate": 1.9625658745435342e-05, + "loss": 0.9505, + "num_tokens": 8400951002.0, + "step": 2010 + }, + { + "epoch": 0.23897801544860367, + "grad_norm": 0.506707093155729, + "learning_rate": 1.9625164271382334e-05, + "loss": 0.93, + "num_tokens": 8405102951.0, + "step": 2011 + }, + { + "epoch": 0.23909685086155674, + "grad_norm": 0.8813441867375629, + "learning_rate": 1.962466947791033e-05, + "loss": 0.9279, + "num_tokens": 8409293078.0, + "step": 2012 + }, + { + "epoch": 0.23921568627450981, + "grad_norm": 0.5715287431351708, + "learning_rate": 1.9624174365037658e-05, + "loss": 0.9192, + "num_tokens": 8413481346.0, + "step": 2013 + }, + { + "epoch": 0.23933452168746286, + "grad_norm": 0.6322201390526515, + "learning_rate": 1.9623678932782642e-05, + "loss": 0.8865, + "num_tokens": 8417668788.0, + "step": 2014 + }, + { + "epoch": 0.23945335710041593, + "grad_norm": 0.6766554769190544, + "learning_rate": 1.9623183181163646e-05, + "loss": 0.9129, + "num_tokens": 8421857426.0, + "step": 2015 + }, + { + "epoch": 0.23957219251336898, + "grad_norm": 0.7188992258530712, + "learning_rate": 1.9622687110199016e-05, + "loss": 0.8772, + "num_tokens": 8426032397.0, + "step": 2016 + }, + { + "epoch": 0.23969102792632205, + "grad_norm": 0.5336239086257851, + "learning_rate": 1.962219071990713e-05, + "loss": 0.9337, + "num_tokens": 8430167938.0, + "step": 2017 + }, + { + "epoch": 0.2398098633392751, + "grad_norm": 0.7078016711919874, + "learning_rate": 1.962169401030637e-05, + "loss": 0.8741, + "num_tokens": 8434357166.0, + "step": 2018 + }, + { + "epoch": 0.23992869875222816, + "grad_norm": 0.5784377332354209, + "learning_rate": 1.9621196981415128e-05, + "loss": 0.9069, + "num_tokens": 8438545394.0, + "step": 2019 + }, + { + "epoch": 0.24004753416518124, + "grad_norm": 0.660191954898099, + "learning_rate": 1.9620699633251817e-05, + "loss": 0.8988, + "num_tokens": 8442734445.0, + "step": 2020 + }, + { + "epoch": 0.24016636957813428, + "grad_norm": 0.5437654259169087, + "learning_rate": 1.962020196583485e-05, + "loss": 0.8577, + "num_tokens": 8446925844.0, + "step": 2021 + }, + { + "epoch": 0.24028520499108735, + "grad_norm": 0.6687936075130444, + "learning_rate": 1.9619703979182656e-05, + "loss": 0.8815, + "num_tokens": 8451115351.0, + "step": 2022 + }, + { + "epoch": 0.2404040404040404, + "grad_norm": 0.5523131187385417, + "learning_rate": 1.961920567331368e-05, + "loss": 0.92, + "num_tokens": 8455305798.0, + "step": 2023 + }, + { + "epoch": 0.24052287581699347, + "grad_norm": 0.7272268855818718, + "learning_rate": 1.9618707048246378e-05, + "loss": 0.922, + "num_tokens": 8459445625.0, + "step": 2024 + }, + { + "epoch": 0.24064171122994651, + "grad_norm": 0.549191158280294, + "learning_rate": 1.9618208103999213e-05, + "loss": 0.9254, + "num_tokens": 8463614709.0, + "step": 2025 + }, + { + "epoch": 0.2407605466428996, + "grad_norm": 0.6666560017159132, + "learning_rate": 1.961770884059066e-05, + "loss": 0.9704, + "num_tokens": 8467803392.0, + "step": 2026 + }, + { + "epoch": 0.24087938205585263, + "grad_norm": 0.5523115412411744, + "learning_rate": 1.9617209258039216e-05, + "loss": 0.9008, + "num_tokens": 8471991940.0, + "step": 2027 + }, + { + "epoch": 0.2409982174688057, + "grad_norm": 0.6147253645783477, + "learning_rate": 1.9616709356363376e-05, + "loss": 0.9226, + "num_tokens": 8476180954.0, + "step": 2028 + }, + { + "epoch": 0.24111705288175878, + "grad_norm": 0.5405060568301726, + "learning_rate": 1.9616209135581652e-05, + "loss": 0.9515, + "num_tokens": 8480368088.0, + "step": 2029 + }, + { + "epoch": 0.24123588829471182, + "grad_norm": 0.6553671459277425, + "learning_rate": 1.9615708595712574e-05, + "loss": 0.9213, + "num_tokens": 8484558012.0, + "step": 2030 + }, + { + "epoch": 0.2413547237076649, + "grad_norm": 0.6427445669995109, + "learning_rate": 1.9615207736774676e-05, + "loss": 0.8852, + "num_tokens": 8488747140.0, + "step": 2031 + }, + { + "epoch": 0.24147355912061794, + "grad_norm": 0.6011025402144657, + "learning_rate": 1.9614706558786507e-05, + "loss": 0.9382, + "num_tokens": 8492936295.0, + "step": 2032 + }, + { + "epoch": 0.241592394533571, + "grad_norm": 0.6292951331196334, + "learning_rate": 1.9614205061766628e-05, + "loss": 0.9161, + "num_tokens": 8497125734.0, + "step": 2033 + }, + { + "epoch": 0.24171122994652405, + "grad_norm": 0.6193950430494845, + "learning_rate": 1.961370324573361e-05, + "loss": 0.9565, + "num_tokens": 8501310646.0, + "step": 2034 + }, + { + "epoch": 0.24183006535947713, + "grad_norm": 0.5863747820129906, + "learning_rate": 1.9613201110706034e-05, + "loss": 0.9221, + "num_tokens": 8505479026.0, + "step": 2035 + }, + { + "epoch": 0.2419489007724302, + "grad_norm": 0.6175381699779272, + "learning_rate": 1.96126986567025e-05, + "loss": 0.8983, + "num_tokens": 8509667450.0, + "step": 2036 + }, + { + "epoch": 0.24206773618538324, + "grad_norm": 0.5020637911980712, + "learning_rate": 1.961219588374162e-05, + "loss": 0.9418, + "num_tokens": 8513825953.0, + "step": 2037 + }, + { + "epoch": 0.24218657159833631, + "grad_norm": 0.6389926409690448, + "learning_rate": 1.9611692791842e-05, + "loss": 0.9313, + "num_tokens": 8518016102.0, + "step": 2038 + }, + { + "epoch": 0.24230540701128936, + "grad_norm": 0.6198672531222552, + "learning_rate": 1.9611189381022284e-05, + "loss": 0.8989, + "num_tokens": 8522157459.0, + "step": 2039 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.6465567416533001, + "learning_rate": 1.961068565130111e-05, + "loss": 0.9209, + "num_tokens": 8526306528.0, + "step": 2040 + }, + { + "epoch": 0.24254307783719548, + "grad_norm": 0.683358282539833, + "learning_rate": 1.9610181602697134e-05, + "loss": 0.894, + "num_tokens": 8530492556.0, + "step": 2041 + }, + { + "epoch": 0.24266191325014855, + "grad_norm": 0.5405633290079376, + "learning_rate": 1.960967723522902e-05, + "loss": 0.9223, + "num_tokens": 8534657634.0, + "step": 2042 + }, + { + "epoch": 0.2427807486631016, + "grad_norm": 0.6946218342198125, + "learning_rate": 1.9609172548915445e-05, + "loss": 0.9037, + "num_tokens": 8538847220.0, + "step": 2043 + }, + { + "epoch": 0.24289958407605466, + "grad_norm": 0.5427604816586977, + "learning_rate": 1.9608667543775105e-05, + "loss": 0.9042, + "num_tokens": 8543037150.0, + "step": 2044 + }, + { + "epoch": 0.24301841948900774, + "grad_norm": 0.6001010664942084, + "learning_rate": 1.96081622198267e-05, + "loss": 0.9453, + "num_tokens": 8547202762.0, + "step": 2045 + }, + { + "epoch": 0.24313725490196078, + "grad_norm": 0.7282149300008394, + "learning_rate": 1.960765657708894e-05, + "loss": 0.9255, + "num_tokens": 8551391959.0, + "step": 2046 + }, + { + "epoch": 0.24325609031491385, + "grad_norm": 0.5898094592707538, + "learning_rate": 1.9607150615580558e-05, + "loss": 0.8999, + "num_tokens": 8555563443.0, + "step": 2047 + }, + { + "epoch": 0.2433749257278669, + "grad_norm": 0.712979793587065, + "learning_rate": 1.9606644335320283e-05, + "loss": 0.8927, + "num_tokens": 8559752336.0, + "step": 2048 + }, + { + "epoch": 0.24349376114081997, + "grad_norm": 0.6176488162620116, + "learning_rate": 1.9606137736326872e-05, + "loss": 0.9195, + "num_tokens": 8563940926.0, + "step": 2049 + }, + { + "epoch": 0.24361259655377301, + "grad_norm": 0.5312705636872752, + "learning_rate": 1.960563081861908e-05, + "loss": 0.9012, + "num_tokens": 8568129629.0, + "step": 2050 + }, + { + "epoch": 0.2437314319667261, + "grad_norm": 0.6497510106801733, + "learning_rate": 1.9605123582215685e-05, + "loss": 0.8798, + "num_tokens": 8572317994.0, + "step": 2051 + }, + { + "epoch": 0.24385026737967913, + "grad_norm": 0.5194648246441538, + "learning_rate": 1.960461602713547e-05, + "loss": 0.9257, + "num_tokens": 8576509335.0, + "step": 2052 + }, + { + "epoch": 0.2439691027926322, + "grad_norm": 0.6435061701536439, + "learning_rate": 1.9604108153397226e-05, + "loss": 0.9155, + "num_tokens": 8580682749.0, + "step": 2053 + }, + { + "epoch": 0.24408793820558528, + "grad_norm": 0.48020168504608157, + "learning_rate": 1.9603599961019767e-05, + "loss": 0.8997, + "num_tokens": 8584851239.0, + "step": 2054 + }, + { + "epoch": 0.24420677361853832, + "grad_norm": 0.6106172045751073, + "learning_rate": 1.9603091450021914e-05, + "loss": 0.9172, + "num_tokens": 8589025677.0, + "step": 2055 + }, + { + "epoch": 0.2443256090314914, + "grad_norm": 0.5900552364506273, + "learning_rate": 1.9602582620422494e-05, + "loss": 0.9159, + "num_tokens": 8593213868.0, + "step": 2056 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.5784700259175389, + "learning_rate": 1.9602073472240356e-05, + "loss": 0.9024, + "num_tokens": 8597403285.0, + "step": 2057 + }, + { + "epoch": 0.2445632798573975, + "grad_norm": 0.732123589731466, + "learning_rate": 1.9601564005494354e-05, + "loss": 0.9089, + "num_tokens": 8601567373.0, + "step": 2058 + }, + { + "epoch": 0.24468211527035055, + "grad_norm": 0.5843023052560419, + "learning_rate": 1.9601054220203354e-05, + "loss": 0.9216, + "num_tokens": 8605756378.0, + "step": 2059 + }, + { + "epoch": 0.24480095068330363, + "grad_norm": 0.5623003934196175, + "learning_rate": 1.9600544116386237e-05, + "loss": 0.8953, + "num_tokens": 8609918802.0, + "step": 2060 + }, + { + "epoch": 0.2449197860962567, + "grad_norm": 0.7187823053228104, + "learning_rate": 1.9600033694061886e-05, + "loss": 0.9065, + "num_tokens": 8614081312.0, + "step": 2061 + }, + { + "epoch": 0.24503862150920974, + "grad_norm": 0.6105892297572432, + "learning_rate": 1.9599522953249213e-05, + "loss": 0.9277, + "num_tokens": 8618268869.0, + "step": 2062 + }, + { + "epoch": 0.24515745692216281, + "grad_norm": 0.7726648987839467, + "learning_rate": 1.959901189396713e-05, + "loss": 0.9509, + "num_tokens": 8622457252.0, + "step": 2063 + }, + { + "epoch": 0.24527629233511586, + "grad_norm": 0.518839963175573, + "learning_rate": 1.9598500516234562e-05, + "loss": 0.912, + "num_tokens": 8626640723.0, + "step": 2064 + }, + { + "epoch": 0.24539512774806893, + "grad_norm": 0.7242661274216058, + "learning_rate": 1.9597988820070447e-05, + "loss": 0.9172, + "num_tokens": 8630831365.0, + "step": 2065 + }, + { + "epoch": 0.24551396316102198, + "grad_norm": 0.5232520980514248, + "learning_rate": 1.9597476805493738e-05, + "loss": 0.9385, + "num_tokens": 8635018200.0, + "step": 2066 + }, + { + "epoch": 0.24563279857397505, + "grad_norm": 0.7611204962458531, + "learning_rate": 1.959696447252339e-05, + "loss": 0.9209, + "num_tokens": 8639204992.0, + "step": 2067 + }, + { + "epoch": 0.2457516339869281, + "grad_norm": 0.500435511926882, + "learning_rate": 1.959645182117839e-05, + "loss": 0.93, + "num_tokens": 8643395675.0, + "step": 2068 + }, + { + "epoch": 0.24587046939988116, + "grad_norm": 0.7688569103260725, + "learning_rate": 1.9595938851477702e-05, + "loss": 0.9159, + "num_tokens": 8647584086.0, + "step": 2069 + }, + { + "epoch": 0.24598930481283424, + "grad_norm": 0.5644027888587121, + "learning_rate": 1.9595425563440344e-05, + "loss": 0.9634, + "num_tokens": 8651753192.0, + "step": 2070 + }, + { + "epoch": 0.24610814022578728, + "grad_norm": 0.8104442583398821, + "learning_rate": 1.9594911957085307e-05, + "loss": 0.9076, + "num_tokens": 8655943623.0, + "step": 2071 + }, + { + "epoch": 0.24622697563874035, + "grad_norm": 0.6466072813647749, + "learning_rate": 1.9594398032431626e-05, + "loss": 0.9081, + "num_tokens": 8660131966.0, + "step": 2072 + }, + { + "epoch": 0.2463458110516934, + "grad_norm": 0.7052214867074862, + "learning_rate": 1.9593883789498328e-05, + "loss": 0.9369, + "num_tokens": 8664315532.0, + "step": 2073 + }, + { + "epoch": 0.24646464646464647, + "grad_norm": 0.677324754569631, + "learning_rate": 1.9593369228304453e-05, + "loss": 0.9084, + "num_tokens": 8668503359.0, + "step": 2074 + }, + { + "epoch": 0.24658348187759951, + "grad_norm": 0.5472534156423191, + "learning_rate": 1.9592854348869063e-05, + "loss": 0.9105, + "num_tokens": 8672691776.0, + "step": 2075 + }, + { + "epoch": 0.2467023172905526, + "grad_norm": 0.8015976262484883, + "learning_rate": 1.9592339151211223e-05, + "loss": 0.9535, + "num_tokens": 8676882551.0, + "step": 2076 + }, + { + "epoch": 0.24682115270350566, + "grad_norm": 0.6245482801085782, + "learning_rate": 1.9591823635350012e-05, + "loss": 0.9174, + "num_tokens": 8681045834.0, + "step": 2077 + }, + { + "epoch": 0.2469399881164587, + "grad_norm": 0.5887245093958424, + "learning_rate": 1.959130780130452e-05, + "loss": 0.9546, + "num_tokens": 8685224075.0, + "step": 2078 + }, + { + "epoch": 0.24705882352941178, + "grad_norm": 0.6416906863885863, + "learning_rate": 1.9590791649093857e-05, + "loss": 0.9249, + "num_tokens": 8689386722.0, + "step": 2079 + }, + { + "epoch": 0.24717765894236482, + "grad_norm": 0.6455147163996605, + "learning_rate": 1.959027517873713e-05, + "loss": 0.9282, + "num_tokens": 8693576207.0, + "step": 2080 + }, + { + "epoch": 0.2472964943553179, + "grad_norm": 0.5597372163357518, + "learning_rate": 1.958975839025347e-05, + "loss": 0.8698, + "num_tokens": 8697734807.0, + "step": 2081 + }, + { + "epoch": 0.24741532976827094, + "grad_norm": 0.6624789653554196, + "learning_rate": 1.9589241283662014e-05, + "loss": 0.9164, + "num_tokens": 8701920783.0, + "step": 2082 + }, + { + "epoch": 0.247534165181224, + "grad_norm": 0.5875428728281241, + "learning_rate": 1.9588723858981913e-05, + "loss": 0.9449, + "num_tokens": 8706108899.0, + "step": 2083 + }, + { + "epoch": 0.24765300059417705, + "grad_norm": 0.6971933930186699, + "learning_rate": 1.9588206116232327e-05, + "loss": 0.8936, + "num_tokens": 8710294610.0, + "step": 2084 + }, + { + "epoch": 0.24777183600713013, + "grad_norm": 0.6179060224175442, + "learning_rate": 1.958768805543243e-05, + "loss": 0.923, + "num_tokens": 8714454430.0, + "step": 2085 + }, + { + "epoch": 0.2478906714200832, + "grad_norm": 0.6193416054662851, + "learning_rate": 1.958716967660141e-05, + "loss": 0.9395, + "num_tokens": 8718642649.0, + "step": 2086 + }, + { + "epoch": 0.24800950683303624, + "grad_norm": 0.6188709036127071, + "learning_rate": 1.9586650979758467e-05, + "loss": 0.9144, + "num_tokens": 8722832589.0, + "step": 2087 + }, + { + "epoch": 0.24812834224598931, + "grad_norm": 0.6092085993451662, + "learning_rate": 1.95861319649228e-05, + "loss": 0.8955, + "num_tokens": 8727021125.0, + "step": 2088 + }, + { + "epoch": 0.24824717765894236, + "grad_norm": 0.6443021983084369, + "learning_rate": 1.9585612632113638e-05, + "loss": 0.9176, + "num_tokens": 8731206326.0, + "step": 2089 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 0.5078669944962266, + "learning_rate": 1.9585092981350212e-05, + "loss": 0.9159, + "num_tokens": 8735394276.0, + "step": 2090 + }, + { + "epoch": 0.24848484848484848, + "grad_norm": 0.526898302663527, + "learning_rate": 1.9584573012651767e-05, + "loss": 0.9184, + "num_tokens": 8739582181.0, + "step": 2091 + }, + { + "epoch": 0.24860368389780155, + "grad_norm": 0.7091518218024064, + "learning_rate": 1.958405272603756e-05, + "loss": 0.91, + "num_tokens": 8743753379.0, + "step": 2092 + }, + { + "epoch": 0.2487225193107546, + "grad_norm": 0.6437303878450223, + "learning_rate": 1.958353212152685e-05, + "loss": 0.9367, + "num_tokens": 8747943712.0, + "step": 2093 + }, + { + "epoch": 0.24884135472370766, + "grad_norm": 0.5987495344478964, + "learning_rate": 1.9583011199138932e-05, + "loss": 0.9267, + "num_tokens": 8752134226.0, + "step": 2094 + }, + { + "epoch": 0.24896019013666074, + "grad_norm": 0.6370365768492379, + "learning_rate": 1.958248995889308e-05, + "loss": 0.9341, + "num_tokens": 8756322227.0, + "step": 2095 + }, + { + "epoch": 0.24907902554961378, + "grad_norm": 0.6170368016582937, + "learning_rate": 1.9581968400808614e-05, + "loss": 0.8912, + "num_tokens": 8760512192.0, + "step": 2096 + }, + { + "epoch": 0.24919786096256685, + "grad_norm": 0.6445938486018444, + "learning_rate": 1.958144652490484e-05, + "loss": 0.9383, + "num_tokens": 8764698766.0, + "step": 2097 + }, + { + "epoch": 0.2493166963755199, + "grad_norm": 0.6300448861937261, + "learning_rate": 1.9580924331201087e-05, + "loss": 0.9217, + "num_tokens": 8768888927.0, + "step": 2098 + }, + { + "epoch": 0.24943553178847297, + "grad_norm": 0.49749678954736576, + "learning_rate": 1.9580401819716696e-05, + "loss": 0.9048, + "num_tokens": 8773073171.0, + "step": 2099 + }, + { + "epoch": 0.24955436720142601, + "grad_norm": 0.6315872169585827, + "learning_rate": 1.957987899047101e-05, + "loss": 0.9389, + "num_tokens": 8777245572.0, + "step": 2100 + }, + { + "epoch": 0.2496732026143791, + "grad_norm": 0.662774246255135, + "learning_rate": 1.9579355843483396e-05, + "loss": 0.9293, + "num_tokens": 8781434972.0, + "step": 2101 + }, + { + "epoch": 0.24979203802733216, + "grad_norm": 0.6052630188364317, + "learning_rate": 1.9578832378773233e-05, + "loss": 0.9108, + "num_tokens": 8785623388.0, + "step": 2102 + }, + { + "epoch": 0.2499108734402852, + "grad_norm": 0.619097282365348, + "learning_rate": 1.9578308596359893e-05, + "loss": 0.9334, + "num_tokens": 8789811430.0, + "step": 2103 + }, + { + "epoch": 0.2500297088532383, + "grad_norm": 0.5283952609167156, + "learning_rate": 1.9577784496262787e-05, + "loss": 0.9109, + "num_tokens": 8793991535.0, + "step": 2104 + }, + { + "epoch": 0.2501485442661913, + "grad_norm": 0.6008488545122064, + "learning_rate": 1.957726007850132e-05, + "loss": 0.9059, + "num_tokens": 8798180645.0, + "step": 2105 + }, + { + "epoch": 0.25026737967914436, + "grad_norm": 0.6371199528654519, + "learning_rate": 1.9576735343094908e-05, + "loss": 0.924, + "num_tokens": 8802339830.0, + "step": 2106 + }, + { + "epoch": 0.25038621509209746, + "grad_norm": 0.602062934391445, + "learning_rate": 1.957621029006299e-05, + "loss": 0.8848, + "num_tokens": 8806502513.0, + "step": 2107 + }, + { + "epoch": 0.2505050505050505, + "grad_norm": 0.6332374726632523, + "learning_rate": 1.9575684919425007e-05, + "loss": 0.929, + "num_tokens": 8810690668.0, + "step": 2108 + }, + { + "epoch": 0.25062388591800355, + "grad_norm": 0.6142004915448466, + "learning_rate": 1.9575159231200414e-05, + "loss": 0.9414, + "num_tokens": 8814879421.0, + "step": 2109 + }, + { + "epoch": 0.25074272133095665, + "grad_norm": 0.6067402401688823, + "learning_rate": 1.9574633225408685e-05, + "loss": 0.9004, + "num_tokens": 8819069594.0, + "step": 2110 + }, + { + "epoch": 0.2508615567439097, + "grad_norm": 0.5944873772118676, + "learning_rate": 1.9574106902069295e-05, + "loss": 0.9068, + "num_tokens": 8823258773.0, + "step": 2111 + }, + { + "epoch": 0.25098039215686274, + "grad_norm": 0.5427809187055517, + "learning_rate": 1.957358026120174e-05, + "loss": 0.9317, + "num_tokens": 8827447479.0, + "step": 2112 + }, + { + "epoch": 0.2510992275698158, + "grad_norm": 0.7156554461735385, + "learning_rate": 1.9573053302825516e-05, + "loss": 0.9184, + "num_tokens": 8831636599.0, + "step": 2113 + }, + { + "epoch": 0.2512180629827689, + "grad_norm": 0.5742876065176744, + "learning_rate": 1.9572526026960144e-05, + "loss": 0.929, + "num_tokens": 8835801482.0, + "step": 2114 + }, + { + "epoch": 0.25133689839572193, + "grad_norm": 0.7122211054510876, + "learning_rate": 1.9571998433625147e-05, + "loss": 0.8966, + "num_tokens": 8839989358.0, + "step": 2115 + }, + { + "epoch": 0.251455733808675, + "grad_norm": 0.5456767447428924, + "learning_rate": 1.9571470522840067e-05, + "loss": 0.8885, + "num_tokens": 8844179165.0, + "step": 2116 + }, + { + "epoch": 0.251574569221628, + "grad_norm": 0.571443448972696, + "learning_rate": 1.957094229462445e-05, + "loss": 0.9444, + "num_tokens": 8848360401.0, + "step": 2117 + }, + { + "epoch": 0.2516934046345811, + "grad_norm": 0.7898479152564425, + "learning_rate": 1.9570413748997866e-05, + "loss": 0.9311, + "num_tokens": 8852497297.0, + "step": 2118 + }, + { + "epoch": 0.25181224004753416, + "grad_norm": 0.5640947251530195, + "learning_rate": 1.9569884885979878e-05, + "loss": 0.9196, + "num_tokens": 8856681229.0, + "step": 2119 + }, + { + "epoch": 0.2519310754604872, + "grad_norm": 0.8663724811215896, + "learning_rate": 1.956935570559008e-05, + "loss": 0.9031, + "num_tokens": 8860870059.0, + "step": 2120 + }, + { + "epoch": 0.2520499108734403, + "grad_norm": 0.5524026151751816, + "learning_rate": 1.9568826207848068e-05, + "loss": 0.9037, + "num_tokens": 8865035491.0, + "step": 2121 + }, + { + "epoch": 0.25216874628639335, + "grad_norm": 0.786045795559992, + "learning_rate": 1.9568296392773446e-05, + "loss": 0.8951, + "num_tokens": 8869226574.0, + "step": 2122 + }, + { + "epoch": 0.2522875816993464, + "grad_norm": 0.6256488602932632, + "learning_rate": 1.9567766260385842e-05, + "loss": 0.9287, + "num_tokens": 8873392654.0, + "step": 2123 + }, + { + "epoch": 0.25240641711229944, + "grad_norm": 0.6796091597405992, + "learning_rate": 1.956723581070488e-05, + "loss": 0.8849, + "num_tokens": 8877565341.0, + "step": 2124 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 0.6164213619431138, + "learning_rate": 1.9566705043750215e-05, + "loss": 0.8948, + "num_tokens": 8881755340.0, + "step": 2125 + }, + { + "epoch": 0.2526440879382056, + "grad_norm": 0.6347919235356595, + "learning_rate": 1.9566173959541496e-05, + "loss": 0.9095, + "num_tokens": 8885943428.0, + "step": 2126 + }, + { + "epoch": 0.25276292335115863, + "grad_norm": 0.5304595702327399, + "learning_rate": 1.956564255809839e-05, + "loss": 0.9091, + "num_tokens": 8890133124.0, + "step": 2127 + }, + { + "epoch": 0.25288175876411173, + "grad_norm": 0.7066075713558777, + "learning_rate": 1.956511083944058e-05, + "loss": 0.9247, + "num_tokens": 8894322346.0, + "step": 2128 + }, + { + "epoch": 0.2530005941770648, + "grad_norm": 0.5763152572047694, + "learning_rate": 1.9564578803587755e-05, + "loss": 0.9217, + "num_tokens": 8898512349.0, + "step": 2129 + }, + { + "epoch": 0.2531194295900178, + "grad_norm": 0.6029112799658258, + "learning_rate": 1.9564046450559624e-05, + "loss": 0.9541, + "num_tokens": 8902689646.0, + "step": 2130 + }, + { + "epoch": 0.25323826500297086, + "grad_norm": 0.5673903657946455, + "learning_rate": 1.9563513780375895e-05, + "loss": 0.9459, + "num_tokens": 8906879759.0, + "step": 2131 + }, + { + "epoch": 0.25335710041592396, + "grad_norm": 0.7146150772314115, + "learning_rate": 1.9562980793056295e-05, + "loss": 0.9286, + "num_tokens": 8911067869.0, + "step": 2132 + }, + { + "epoch": 0.253475935828877, + "grad_norm": 0.6399882141300854, + "learning_rate": 1.9562447488620564e-05, + "loss": 0.9206, + "num_tokens": 8915257858.0, + "step": 2133 + }, + { + "epoch": 0.25359477124183005, + "grad_norm": 0.482890767888067, + "learning_rate": 1.9561913867088457e-05, + "loss": 0.9444, + "num_tokens": 8919445927.0, + "step": 2134 + }, + { + "epoch": 0.25371360665478315, + "grad_norm": 0.7668996214219214, + "learning_rate": 1.9561379928479724e-05, + "loss": 0.8757, + "num_tokens": 8923605573.0, + "step": 2135 + }, + { + "epoch": 0.2538324420677362, + "grad_norm": 0.5440900944558852, + "learning_rate": 1.956084567281415e-05, + "loss": 0.9721, + "num_tokens": 8927794075.0, + "step": 2136 + }, + { + "epoch": 0.25395127748068924, + "grad_norm": 0.8337171744772404, + "learning_rate": 1.9560311100111515e-05, + "loss": 0.8766, + "num_tokens": 8931973094.0, + "step": 2137 + }, + { + "epoch": 0.2540701128936423, + "grad_norm": 0.58472723848558, + "learning_rate": 1.9559776210391617e-05, + "loss": 0.8901, + "num_tokens": 8936162644.0, + "step": 2138 + }, + { + "epoch": 0.2541889483065954, + "grad_norm": 0.650715319102484, + "learning_rate": 1.9559241003674262e-05, + "loss": 0.9263, + "num_tokens": 8940345011.0, + "step": 2139 + }, + { + "epoch": 0.25430778371954843, + "grad_norm": 0.6535335280639109, + "learning_rate": 1.955870547997928e-05, + "loss": 0.9374, + "num_tokens": 8944533788.0, + "step": 2140 + }, + { + "epoch": 0.2544266191325015, + "grad_norm": 0.49663977489245126, + "learning_rate": 1.955816963932649e-05, + "loss": 0.8831, + "num_tokens": 8948725453.0, + "step": 2141 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 0.6443472563896451, + "learning_rate": 1.9557633481735746e-05, + "loss": 0.9334, + "num_tokens": 8952885378.0, + "step": 2142 + }, + { + "epoch": 0.2546642899584076, + "grad_norm": 0.6151756575301346, + "learning_rate": 1.9557097007226897e-05, + "loss": 0.936, + "num_tokens": 8957073874.0, + "step": 2143 + }, + { + "epoch": 0.25478312537136066, + "grad_norm": 0.6467930450295116, + "learning_rate": 1.9556560215819813e-05, + "loss": 0.8854, + "num_tokens": 8961263815.0, + "step": 2144 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 0.5158547453821346, + "learning_rate": 1.9556023107534377e-05, + "loss": 0.9397, + "num_tokens": 8965435711.0, + "step": 2145 + }, + { + "epoch": 0.2550207961972668, + "grad_norm": 0.6949733263674305, + "learning_rate": 1.9555485682390476e-05, + "loss": 0.9305, + "num_tokens": 8969624303.0, + "step": 2146 + }, + { + "epoch": 0.25513963161021985, + "grad_norm": 0.5002481116499738, + "learning_rate": 1.9554947940408013e-05, + "loss": 0.8995, + "num_tokens": 8973812826.0, + "step": 2147 + }, + { + "epoch": 0.2552584670231729, + "grad_norm": 0.8043949583109589, + "learning_rate": 1.9554409881606903e-05, + "loss": 0.9304, + "num_tokens": 8978003194.0, + "step": 2148 + }, + { + "epoch": 0.25537730243612594, + "grad_norm": 0.5640158670377349, + "learning_rate": 1.955387150600707e-05, + "loss": 0.9083, + "num_tokens": 8982192727.0, + "step": 2149 + }, + { + "epoch": 0.25549613784907904, + "grad_norm": 0.6183771267897533, + "learning_rate": 1.9553332813628457e-05, + "loss": 0.8986, + "num_tokens": 8986381226.0, + "step": 2150 + }, + { + "epoch": 0.2556149732620321, + "grad_norm": 0.5765467352245482, + "learning_rate": 1.9552793804491007e-05, + "loss": 0.9452, + "num_tokens": 8990542414.0, + "step": 2151 + }, + { + "epoch": 0.25573380867498513, + "grad_norm": 0.5904752759146655, + "learning_rate": 1.955225447861469e-05, + "loss": 0.9496, + "num_tokens": 8994731437.0, + "step": 2152 + }, + { + "epoch": 0.25585264408793823, + "grad_norm": 0.5370206615147061, + "learning_rate": 1.9551714836019465e-05, + "loss": 0.9282, + "num_tokens": 8998922171.0, + "step": 2153 + }, + { + "epoch": 0.2559714795008913, + "grad_norm": 0.6005659803192327, + "learning_rate": 1.9551174876725334e-05, + "loss": 0.9423, + "num_tokens": 9003111067.0, + "step": 2154 + }, + { + "epoch": 0.2560903149138443, + "grad_norm": 0.5306926580971916, + "learning_rate": 1.9550634600752282e-05, + "loss": 0.9124, + "num_tokens": 9007299082.0, + "step": 2155 + }, + { + "epoch": 0.25620915032679736, + "grad_norm": 0.7352602636413033, + "learning_rate": 1.955009400812032e-05, + "loss": 0.9432, + "num_tokens": 9011487128.0, + "step": 2156 + }, + { + "epoch": 0.25632798573975046, + "grad_norm": 0.6691286957630442, + "learning_rate": 1.9549553098849468e-05, + "loss": 0.907, + "num_tokens": 9015677542.0, + "step": 2157 + }, + { + "epoch": 0.2564468211527035, + "grad_norm": 0.7200961396670024, + "learning_rate": 1.954901187295976e-05, + "loss": 0.9292, + "num_tokens": 9019866990.0, + "step": 2158 + }, + { + "epoch": 0.25656565656565655, + "grad_norm": 0.5453686668633745, + "learning_rate": 1.9548470330471235e-05, + "loss": 0.942, + "num_tokens": 9024057836.0, + "step": 2159 + }, + { + "epoch": 0.25668449197860965, + "grad_norm": 0.6121973562492916, + "learning_rate": 1.9547928471403955e-05, + "loss": 0.9387, + "num_tokens": 9028219162.0, + "step": 2160 + }, + { + "epoch": 0.2568033273915627, + "grad_norm": 0.6769728578596385, + "learning_rate": 1.9547386295777976e-05, + "loss": 0.9613, + "num_tokens": 9032408799.0, + "step": 2161 + }, + { + "epoch": 0.25692216280451574, + "grad_norm": 0.651823537377718, + "learning_rate": 1.9546843803613386e-05, + "loss": 0.927, + "num_tokens": 9036597469.0, + "step": 2162 + }, + { + "epoch": 0.2570409982174688, + "grad_norm": 0.562418620720278, + "learning_rate": 1.9546300994930273e-05, + "loss": 0.9293, + "num_tokens": 9040787193.0, + "step": 2163 + }, + { + "epoch": 0.2571598336304219, + "grad_norm": 0.5911102147503411, + "learning_rate": 1.9545757869748742e-05, + "loss": 0.8915, + "num_tokens": 9044967971.0, + "step": 2164 + }, + { + "epoch": 0.25727866904337493, + "grad_norm": 0.5994852942505396, + "learning_rate": 1.9545214428088895e-05, + "loss": 0.9335, + "num_tokens": 9049133063.0, + "step": 2165 + }, + { + "epoch": 0.257397504456328, + "grad_norm": 0.6341696148797252, + "learning_rate": 1.9544670669970875e-05, + "loss": 0.9233, + "num_tokens": 9053319707.0, + "step": 2166 + }, + { + "epoch": 0.257516339869281, + "grad_norm": 0.5912001457596177, + "learning_rate": 1.9544126595414803e-05, + "loss": 0.9374, + "num_tokens": 9057508198.0, + "step": 2167 + }, + { + "epoch": 0.2576351752822341, + "grad_norm": 0.49479169367683823, + "learning_rate": 1.9543582204440836e-05, + "loss": 0.8992, + "num_tokens": 9061688234.0, + "step": 2168 + }, + { + "epoch": 0.25775401069518716, + "grad_norm": 0.6943227815992496, + "learning_rate": 1.9543037497069136e-05, + "loss": 0.9457, + "num_tokens": 9065877645.0, + "step": 2169 + }, + { + "epoch": 0.2578728461081402, + "grad_norm": 0.5455606859642769, + "learning_rate": 1.9542492473319872e-05, + "loss": 0.8841, + "num_tokens": 9069979917.0, + "step": 2170 + }, + { + "epoch": 0.2579916815210933, + "grad_norm": 0.6425990218794239, + "learning_rate": 1.9541947133213226e-05, + "loss": 0.9206, + "num_tokens": 9074168649.0, + "step": 2171 + }, + { + "epoch": 0.25811051693404635, + "grad_norm": 0.6527887176895997, + "learning_rate": 1.9541401476769396e-05, + "loss": 0.9084, + "num_tokens": 9078337909.0, + "step": 2172 + }, + { + "epoch": 0.2582293523469994, + "grad_norm": 0.5273262743952191, + "learning_rate": 1.9540855504008593e-05, + "loss": 0.8896, + "num_tokens": 9082526894.0, + "step": 2173 + }, + { + "epoch": 0.25834818775995244, + "grad_norm": 0.6734689626003136, + "learning_rate": 1.9540309214951034e-05, + "loss": 0.921, + "num_tokens": 9086664963.0, + "step": 2174 + }, + { + "epoch": 0.25846702317290554, + "grad_norm": 0.6174228536863346, + "learning_rate": 1.953976260961695e-05, + "loss": 0.9304, + "num_tokens": 9090853302.0, + "step": 2175 + }, + { + "epoch": 0.2585858585858586, + "grad_norm": 0.5813611575947957, + "learning_rate": 1.953921568802658e-05, + "loss": 0.9068, + "num_tokens": 9095043083.0, + "step": 2176 + }, + { + "epoch": 0.25870469399881163, + "grad_norm": 0.5572746644542338, + "learning_rate": 1.9538668450200182e-05, + "loss": 0.8937, + "num_tokens": 9099231749.0, + "step": 2177 + }, + { + "epoch": 0.25882352941176473, + "grad_norm": 0.5757232045958313, + "learning_rate": 1.9538120896158022e-05, + "loss": 0.9115, + "num_tokens": 9103393037.0, + "step": 2178 + }, + { + "epoch": 0.2589423648247178, + "grad_norm": 0.5635172741490695, + "learning_rate": 1.9537573025920374e-05, + "loss": 0.9165, + "num_tokens": 9107550559.0, + "step": 2179 + }, + { + "epoch": 0.2590612002376708, + "grad_norm": 0.5542439754250561, + "learning_rate": 1.9537024839507536e-05, + "loss": 0.9676, + "num_tokens": 9111718505.0, + "step": 2180 + }, + { + "epoch": 0.25918003565062386, + "grad_norm": 0.6725192584700224, + "learning_rate": 1.9536476336939802e-05, + "loss": 0.9139, + "num_tokens": 9115905934.0, + "step": 2181 + }, + { + "epoch": 0.25929887106357696, + "grad_norm": 0.5832372890624496, + "learning_rate": 1.953592751823749e-05, + "loss": 0.9167, + "num_tokens": 9120094115.0, + "step": 2182 + }, + { + "epoch": 0.25941770647653, + "grad_norm": 0.6284188479398504, + "learning_rate": 1.9535378383420915e-05, + "loss": 0.9407, + "num_tokens": 9124252841.0, + "step": 2183 + }, + { + "epoch": 0.25953654188948305, + "grad_norm": 0.6192384372021157, + "learning_rate": 1.9534828932510425e-05, + "loss": 0.9184, + "num_tokens": 9128441407.0, + "step": 2184 + }, + { + "epoch": 0.25965537730243615, + "grad_norm": 0.5809724132596882, + "learning_rate": 1.9534279165526358e-05, + "loss": 0.9401, + "num_tokens": 9132579004.0, + "step": 2185 + }, + { + "epoch": 0.2597742127153892, + "grad_norm": 0.6379632249528439, + "learning_rate": 1.9533729082489078e-05, + "loss": 0.9317, + "num_tokens": 9136769404.0, + "step": 2186 + }, + { + "epoch": 0.25989304812834224, + "grad_norm": 0.5915712072295053, + "learning_rate": 1.953317868341896e-05, + "loss": 0.8896, + "num_tokens": 9140959612.0, + "step": 2187 + }, + { + "epoch": 0.2600118835412953, + "grad_norm": 0.5109722974356226, + "learning_rate": 1.9532627968336384e-05, + "loss": 0.9503, + "num_tokens": 9145096219.0, + "step": 2188 + }, + { + "epoch": 0.2601307189542484, + "grad_norm": 0.6495642516143342, + "learning_rate": 1.9532076937261743e-05, + "loss": 0.9217, + "num_tokens": 9149279780.0, + "step": 2189 + }, + { + "epoch": 0.26024955436720143, + "grad_norm": 0.5269335568134449, + "learning_rate": 1.9531525590215447e-05, + "loss": 0.9134, + "num_tokens": 9153470166.0, + "step": 2190 + }, + { + "epoch": 0.2603683897801545, + "grad_norm": 0.7274471721935067, + "learning_rate": 1.9530973927217913e-05, + "loss": 0.9502, + "num_tokens": 9157602065.0, + "step": 2191 + }, + { + "epoch": 0.2604872251931075, + "grad_norm": 0.5287450498511609, + "learning_rate": 1.953042194828957e-05, + "loss": 0.9448, + "num_tokens": 9161766462.0, + "step": 2192 + }, + { + "epoch": 0.2606060606060606, + "grad_norm": 0.6061610576707108, + "learning_rate": 1.9529869653450858e-05, + "loss": 0.907, + "num_tokens": 9165955833.0, + "step": 2193 + }, + { + "epoch": 0.26072489601901366, + "grad_norm": 0.5807662011308741, + "learning_rate": 1.9529317042722237e-05, + "loss": 0.9418, + "num_tokens": 9170145064.0, + "step": 2194 + }, + { + "epoch": 0.2608437314319667, + "grad_norm": 0.6121379180397349, + "learning_rate": 1.952876411612416e-05, + "loss": 0.9284, + "num_tokens": 9174333811.0, + "step": 2195 + }, + { + "epoch": 0.2609625668449198, + "grad_norm": 0.5756250946058699, + "learning_rate": 1.952821087367712e-05, + "loss": 0.9312, + "num_tokens": 9178522904.0, + "step": 2196 + }, + { + "epoch": 0.26108140225787285, + "grad_norm": 0.6234359256553589, + "learning_rate": 1.9527657315401593e-05, + "loss": 0.9132, + "num_tokens": 9182682930.0, + "step": 2197 + }, + { + "epoch": 0.2612002376708259, + "grad_norm": 0.5009720237001936, + "learning_rate": 1.9527103441318084e-05, + "loss": 0.9133, + "num_tokens": 9186839206.0, + "step": 2198 + }, + { + "epoch": 0.26131907308377894, + "grad_norm": 0.7643610736370572, + "learning_rate": 1.95265492514471e-05, + "loss": 0.9132, + "num_tokens": 9191027076.0, + "step": 2199 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 0.6500570714732978, + "learning_rate": 1.9525994745809172e-05, + "loss": 0.912, + "num_tokens": 9195187716.0, + "step": 2200 + }, + { + "epoch": 0.2615567439096851, + "grad_norm": 0.6022309362799054, + "learning_rate": 1.952543992442483e-05, + "loss": 0.8853, + "num_tokens": 9199375995.0, + "step": 2201 + }, + { + "epoch": 0.26167557932263813, + "grad_norm": 0.6215039398148146, + "learning_rate": 1.9524884787314624e-05, + "loss": 0.9109, + "num_tokens": 9203549632.0, + "step": 2202 + }, + { + "epoch": 0.26179441473559123, + "grad_norm": 0.617627711216749, + "learning_rate": 1.952432933449911e-05, + "loss": 0.9233, + "num_tokens": 9207739916.0, + "step": 2203 + }, + { + "epoch": 0.2619132501485443, + "grad_norm": 0.6866195284409099, + "learning_rate": 1.952377356599886e-05, + "loss": 0.9387, + "num_tokens": 9211929127.0, + "step": 2204 + }, + { + "epoch": 0.2620320855614973, + "grad_norm": 0.5269217333339725, + "learning_rate": 1.9523217481834454e-05, + "loss": 0.9365, + "num_tokens": 9216113246.0, + "step": 2205 + }, + { + "epoch": 0.26215092097445036, + "grad_norm": 0.6788765352079273, + "learning_rate": 1.9522661082026486e-05, + "loss": 0.9557, + "num_tokens": 9220302223.0, + "step": 2206 + }, + { + "epoch": 0.26226975638740346, + "grad_norm": 0.5515344974835737, + "learning_rate": 1.9522104366595563e-05, + "loss": 0.9488, + "num_tokens": 9224489498.0, + "step": 2207 + }, + { + "epoch": 0.2623885918003565, + "grad_norm": 0.6620075031943555, + "learning_rate": 1.95215473355623e-05, + "loss": 0.9336, + "num_tokens": 9228677963.0, + "step": 2208 + }, + { + "epoch": 0.26250742721330955, + "grad_norm": 0.6507962753081858, + "learning_rate": 1.9520989988947336e-05, + "loss": 0.9365, + "num_tokens": 9232868276.0, + "step": 2209 + }, + { + "epoch": 0.26262626262626265, + "grad_norm": 0.5744366701264428, + "learning_rate": 1.9520432326771292e-05, + "loss": 0.9012, + "num_tokens": 9237057779.0, + "step": 2210 + }, + { + "epoch": 0.2627450980392157, + "grad_norm": 0.6607521263650329, + "learning_rate": 1.9519874349054835e-05, + "loss": 0.9054, + "num_tokens": 9241234372.0, + "step": 2211 + }, + { + "epoch": 0.26286393345216874, + "grad_norm": 0.532315372174596, + "learning_rate": 1.9519316055818626e-05, + "loss": 0.9242, + "num_tokens": 9245422196.0, + "step": 2212 + }, + { + "epoch": 0.2629827688651218, + "grad_norm": 0.6264726064254571, + "learning_rate": 1.9518757447083335e-05, + "loss": 0.9133, + "num_tokens": 9249602166.0, + "step": 2213 + }, + { + "epoch": 0.2631016042780749, + "grad_norm": 0.6739249240188959, + "learning_rate": 1.9518198522869657e-05, + "loss": 0.8863, + "num_tokens": 9253791756.0, + "step": 2214 + }, + { + "epoch": 0.26322043969102793, + "grad_norm": 0.5790435530211565, + "learning_rate": 1.9517639283198284e-05, + "loss": 0.9273, + "num_tokens": 9257978936.0, + "step": 2215 + }, + { + "epoch": 0.263339275103981, + "grad_norm": 0.5890881561970603, + "learning_rate": 1.951707972808993e-05, + "loss": 0.9267, + "num_tokens": 9262169216.0, + "step": 2216 + }, + { + "epoch": 0.263458110516934, + "grad_norm": 0.6976375358839565, + "learning_rate": 1.9516519857565318e-05, + "loss": 0.8789, + "num_tokens": 9266359239.0, + "step": 2217 + }, + { + "epoch": 0.2635769459298871, + "grad_norm": 0.5358410923040999, + "learning_rate": 1.9515959671645182e-05, + "loss": 0.9372, + "num_tokens": 9270539722.0, + "step": 2218 + }, + { + "epoch": 0.26369578134284016, + "grad_norm": 0.6775249311178785, + "learning_rate": 1.9515399170350266e-05, + "loss": 0.8779, + "num_tokens": 9274710394.0, + "step": 2219 + }, + { + "epoch": 0.2638146167557932, + "grad_norm": 0.5359290250176787, + "learning_rate": 1.9514838353701323e-05, + "loss": 0.9424, + "num_tokens": 9278875478.0, + "step": 2220 + }, + { + "epoch": 0.2639334521687463, + "grad_norm": 0.6316680979691246, + "learning_rate": 1.951427722171913e-05, + "loss": 0.9471, + "num_tokens": 9283046546.0, + "step": 2221 + }, + { + "epoch": 0.26405228758169935, + "grad_norm": 0.5669366725049299, + "learning_rate": 1.9513715774424463e-05, + "loss": 0.9404, + "num_tokens": 9287214204.0, + "step": 2222 + }, + { + "epoch": 0.2641711229946524, + "grad_norm": 0.6039980581870489, + "learning_rate": 1.9513154011838114e-05, + "loss": 0.9487, + "num_tokens": 9291402894.0, + "step": 2223 + }, + { + "epoch": 0.26428995840760544, + "grad_norm": 0.6169005017813964, + "learning_rate": 1.951259193398089e-05, + "loss": 0.9078, + "num_tokens": 9295591133.0, + "step": 2224 + }, + { + "epoch": 0.26440879382055854, + "grad_norm": 0.5648716407832859, + "learning_rate": 1.9512029540873605e-05, + "loss": 0.9323, + "num_tokens": 9299781308.0, + "step": 2225 + }, + { + "epoch": 0.2645276292335116, + "grad_norm": 0.6579733417901859, + "learning_rate": 1.9511466832537088e-05, + "loss": 0.9075, + "num_tokens": 9303970849.0, + "step": 2226 + }, + { + "epoch": 0.26464646464646463, + "grad_norm": 0.607160477135259, + "learning_rate": 1.951090380899217e-05, + "loss": 0.9036, + "num_tokens": 9308159751.0, + "step": 2227 + }, + { + "epoch": 0.26476530005941773, + "grad_norm": 0.5132198349287392, + "learning_rate": 1.9510340470259714e-05, + "loss": 0.896, + "num_tokens": 9312348763.0, + "step": 2228 + }, + { + "epoch": 0.2648841354723708, + "grad_norm": 0.6830872640011204, + "learning_rate": 1.9509776816360575e-05, + "loss": 0.8866, + "num_tokens": 9316537617.0, + "step": 2229 + }, + { + "epoch": 0.2650029708853238, + "grad_norm": 0.5788519188202913, + "learning_rate": 1.9509212847315625e-05, + "loss": 0.9065, + "num_tokens": 9320720232.0, + "step": 2230 + }, + { + "epoch": 0.26512180629827686, + "grad_norm": 0.6097982286203518, + "learning_rate": 1.9508648563145755e-05, + "loss": 0.8846, + "num_tokens": 9324910241.0, + "step": 2231 + }, + { + "epoch": 0.26524064171122996, + "grad_norm": 0.6194498209113867, + "learning_rate": 1.9508083963871863e-05, + "loss": 0.9459, + "num_tokens": 9329087602.0, + "step": 2232 + }, + { + "epoch": 0.265359477124183, + "grad_norm": 0.6438238891714714, + "learning_rate": 1.9507519049514854e-05, + "loss": 0.9152, + "num_tokens": 9333254237.0, + "step": 2233 + }, + { + "epoch": 0.26547831253713605, + "grad_norm": 0.5808826889334107, + "learning_rate": 1.950695382009565e-05, + "loss": 0.9188, + "num_tokens": 9337442918.0, + "step": 2234 + }, + { + "epoch": 0.26559714795008915, + "grad_norm": 0.6383807122868539, + "learning_rate": 1.950638827563518e-05, + "loss": 0.9695, + "num_tokens": 9341631694.0, + "step": 2235 + }, + { + "epoch": 0.2657159833630422, + "grad_norm": 0.5180931475416347, + "learning_rate": 1.9505822416154394e-05, + "loss": 0.884, + "num_tokens": 9345820118.0, + "step": 2236 + }, + { + "epoch": 0.26583481877599524, + "grad_norm": 0.5160225664189393, + "learning_rate": 1.9505256241674245e-05, + "loss": 0.9138, + "num_tokens": 9350008219.0, + "step": 2237 + }, + { + "epoch": 0.2659536541889483, + "grad_norm": 0.6094388079515508, + "learning_rate": 1.9504689752215703e-05, + "loss": 0.8949, + "num_tokens": 9354198210.0, + "step": 2238 + }, + { + "epoch": 0.2660724896019014, + "grad_norm": 0.5787557761295888, + "learning_rate": 1.950412294779974e-05, + "loss": 0.9012, + "num_tokens": 9358386880.0, + "step": 2239 + }, + { + "epoch": 0.26619132501485443, + "grad_norm": 0.6925900438398293, + "learning_rate": 1.950355582844735e-05, + "loss": 0.9218, + "num_tokens": 9362576402.0, + "step": 2240 + }, + { + "epoch": 0.2663101604278075, + "grad_norm": 0.6324516293622225, + "learning_rate": 1.950298839417954e-05, + "loss": 0.8832, + "num_tokens": 9366765190.0, + "step": 2241 + }, + { + "epoch": 0.2664289958407606, + "grad_norm": 0.568664355490055, + "learning_rate": 1.9502420645017324e-05, + "loss": 0.9058, + "num_tokens": 9370955126.0, + "step": 2242 + }, + { + "epoch": 0.2665478312537136, + "grad_norm": 0.5213066886661651, + "learning_rate": 1.9501852580981717e-05, + "loss": 0.9196, + "num_tokens": 9375143820.0, + "step": 2243 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.6485663172458738, + "learning_rate": 1.950128420209377e-05, + "loss": 0.895, + "num_tokens": 9379333421.0, + "step": 2244 + }, + { + "epoch": 0.2667855020796197, + "grad_norm": 0.5283804297696185, + "learning_rate": 1.950071550837452e-05, + "loss": 0.9171, + "num_tokens": 9383499705.0, + "step": 2245 + }, + { + "epoch": 0.2669043374925728, + "grad_norm": 0.5940172163747525, + "learning_rate": 1.9500146499845034e-05, + "loss": 0.901, + "num_tokens": 9387688617.0, + "step": 2246 + }, + { + "epoch": 0.26702317290552585, + "grad_norm": 0.6177012218259036, + "learning_rate": 1.9499577176526385e-05, + "loss": 0.9297, + "num_tokens": 9391863460.0, + "step": 2247 + }, + { + "epoch": 0.2671420083184789, + "grad_norm": 0.5612094743653608, + "learning_rate": 1.9499007538439654e-05, + "loss": 0.896, + "num_tokens": 9396051309.0, + "step": 2248 + }, + { + "epoch": 0.26726084373143194, + "grad_norm": 0.5597159216303973, + "learning_rate": 1.949843758560594e-05, + "loss": 0.8923, + "num_tokens": 9400226157.0, + "step": 2249 + }, + { + "epoch": 0.26737967914438504, + "grad_norm": 0.6003472930812966, + "learning_rate": 1.9497867318046345e-05, + "loss": 0.925, + "num_tokens": 9404415189.0, + "step": 2250 + }, + { + "epoch": 0.2674985145573381, + "grad_norm": 0.6451515738781002, + "learning_rate": 1.9497296735781996e-05, + "loss": 0.9314, + "num_tokens": 9408603659.0, + "step": 2251 + }, + { + "epoch": 0.26761734997029113, + "grad_norm": 0.6134492227913381, + "learning_rate": 1.9496725838834015e-05, + "loss": 0.9209, + "num_tokens": 9412775966.0, + "step": 2252 + }, + { + "epoch": 0.26773618538324423, + "grad_norm": 0.6270921876439055, + "learning_rate": 1.9496154627223553e-05, + "loss": 0.9113, + "num_tokens": 9416965372.0, + "step": 2253 + }, + { + "epoch": 0.2678550207961973, + "grad_norm": 0.5436845510771152, + "learning_rate": 1.9495583100971752e-05, + "loss": 0.8763, + "num_tokens": 9421154187.0, + "step": 2254 + }, + { + "epoch": 0.2679738562091503, + "grad_norm": 0.6190025869945265, + "learning_rate": 1.949501126009979e-05, + "loss": 0.9241, + "num_tokens": 9425311529.0, + "step": 2255 + }, + { + "epoch": 0.26809269162210336, + "grad_norm": 0.5812305022613999, + "learning_rate": 1.9494439104628834e-05, + "loss": 0.8829, + "num_tokens": 9429500495.0, + "step": 2256 + }, + { + "epoch": 0.26821152703505646, + "grad_norm": 0.5812111454659026, + "learning_rate": 1.949386663458008e-05, + "loss": 0.9043, + "num_tokens": 9433674370.0, + "step": 2257 + }, + { + "epoch": 0.2683303624480095, + "grad_norm": 0.5708659174489654, + "learning_rate": 1.9493293849974726e-05, + "loss": 0.9051, + "num_tokens": 9437864065.0, + "step": 2258 + }, + { + "epoch": 0.26844919786096255, + "grad_norm": 0.5682260352848474, + "learning_rate": 1.9492720750833988e-05, + "loss": 0.9374, + "num_tokens": 9442049196.0, + "step": 2259 + }, + { + "epoch": 0.26856803327391565, + "grad_norm": 0.5781509296301868, + "learning_rate": 1.949214733717908e-05, + "loss": 0.9413, + "num_tokens": 9446237632.0, + "step": 2260 + }, + { + "epoch": 0.2686868686868687, + "grad_norm": 0.6237233996185737, + "learning_rate": 1.9491573609031246e-05, + "loss": 0.8733, + "num_tokens": 9450425838.0, + "step": 2261 + }, + { + "epoch": 0.26880570409982174, + "grad_norm": 0.5883462790478227, + "learning_rate": 1.949099956641173e-05, + "loss": 0.9529, + "num_tokens": 9454593873.0, + "step": 2262 + }, + { + "epoch": 0.2689245395127748, + "grad_norm": 0.5866900011796361, + "learning_rate": 1.949042520934179e-05, + "loss": 0.9211, + "num_tokens": 9458746877.0, + "step": 2263 + }, + { + "epoch": 0.2690433749257279, + "grad_norm": 0.652603877350035, + "learning_rate": 1.9489850537842704e-05, + "loss": 0.9283, + "num_tokens": 9462915851.0, + "step": 2264 + }, + { + "epoch": 0.26916221033868093, + "grad_norm": 0.6011200667286848, + "learning_rate": 1.948927555193574e-05, + "loss": 0.9148, + "num_tokens": 9467089231.0, + "step": 2265 + }, + { + "epoch": 0.269281045751634, + "grad_norm": 0.5750037772493861, + "learning_rate": 1.9488700251642205e-05, + "loss": 0.8451, + "num_tokens": 9471279890.0, + "step": 2266 + }, + { + "epoch": 0.2693998811645871, + "grad_norm": 0.5882691120592529, + "learning_rate": 1.9488124636983396e-05, + "loss": 0.9061, + "num_tokens": 9475439023.0, + "step": 2267 + }, + { + "epoch": 0.2695187165775401, + "grad_norm": 0.5412940040870893, + "learning_rate": 1.948754870798063e-05, + "loss": 0.9176, + "num_tokens": 9479626930.0, + "step": 2268 + }, + { + "epoch": 0.26963755199049316, + "grad_norm": 0.5356145845483046, + "learning_rate": 1.9486972464655243e-05, + "loss": 0.9308, + "num_tokens": 9483784933.0, + "step": 2269 + }, + { + "epoch": 0.2697563874034462, + "grad_norm": 0.5320512036615107, + "learning_rate": 1.948639590702857e-05, + "loss": 0.9063, + "num_tokens": 9487975993.0, + "step": 2270 + }, + { + "epoch": 0.2698752228163993, + "grad_norm": 0.66005042569645, + "learning_rate": 1.948581903512196e-05, + "loss": 0.9176, + "num_tokens": 9492164774.0, + "step": 2271 + }, + { + "epoch": 0.26999405822935235, + "grad_norm": 0.5442677724015411, + "learning_rate": 1.9485241848956783e-05, + "loss": 0.9111, + "num_tokens": 9496353763.0, + "step": 2272 + }, + { + "epoch": 0.2701128936423054, + "grad_norm": 0.5591160319432397, + "learning_rate": 1.948466434855441e-05, + "loss": 0.9413, + "num_tokens": 9500543063.0, + "step": 2273 + }, + { + "epoch": 0.27023172905525844, + "grad_norm": 0.573981142455423, + "learning_rate": 1.948408653393623e-05, + "loss": 0.9158, + "num_tokens": 9504719708.0, + "step": 2274 + }, + { + "epoch": 0.27035056446821154, + "grad_norm": 0.515333478208716, + "learning_rate": 1.948350840512364e-05, + "loss": 0.8951, + "num_tokens": 9508910355.0, + "step": 2275 + }, + { + "epoch": 0.2704693998811646, + "grad_norm": 0.6077902370389877, + "learning_rate": 1.948292996213805e-05, + "loss": 0.9022, + "num_tokens": 9513101011.0, + "step": 2276 + }, + { + "epoch": 0.27058823529411763, + "grad_norm": 0.6506210959997356, + "learning_rate": 1.9482351205000877e-05, + "loss": 0.9352, + "num_tokens": 9517289417.0, + "step": 2277 + }, + { + "epoch": 0.27070707070707073, + "grad_norm": 0.5395574799138666, + "learning_rate": 1.9481772133733564e-05, + "loss": 0.925, + "num_tokens": 9521478297.0, + "step": 2278 + }, + { + "epoch": 0.2708259061200238, + "grad_norm": 0.7026117405301122, + "learning_rate": 1.9481192748357553e-05, + "loss": 0.9305, + "num_tokens": 9525667224.0, + "step": 2279 + }, + { + "epoch": 0.2709447415329768, + "grad_norm": 0.5455482965841217, + "learning_rate": 1.9480613048894296e-05, + "loss": 0.8992, + "num_tokens": 9529855703.0, + "step": 2280 + }, + { + "epoch": 0.27106357694592986, + "grad_norm": 0.618700944291872, + "learning_rate": 1.9480033035365266e-05, + "loss": 0.9271, + "num_tokens": 9534026839.0, + "step": 2281 + }, + { + "epoch": 0.27118241235888296, + "grad_norm": 0.5334555252953829, + "learning_rate": 1.9479452707791943e-05, + "loss": 0.9337, + "num_tokens": 9538217235.0, + "step": 2282 + }, + { + "epoch": 0.271301247771836, + "grad_norm": 0.6600665234962421, + "learning_rate": 1.9478872066195812e-05, + "loss": 0.9143, + "num_tokens": 9542405497.0, + "step": 2283 + }, + { + "epoch": 0.27142008318478905, + "grad_norm": 0.5729838053896589, + "learning_rate": 1.9478291110598384e-05, + "loss": 0.9133, + "num_tokens": 9546595491.0, + "step": 2284 + }, + { + "epoch": 0.27153891859774215, + "grad_norm": 0.5786968696125442, + "learning_rate": 1.9477709841021168e-05, + "loss": 0.877, + "num_tokens": 9550784294.0, + "step": 2285 + }, + { + "epoch": 0.2716577540106952, + "grad_norm": 0.5575431232264451, + "learning_rate": 1.9477128257485694e-05, + "loss": 0.8708, + "num_tokens": 9554941289.0, + "step": 2286 + }, + { + "epoch": 0.27177658942364824, + "grad_norm": 0.5468554429788293, + "learning_rate": 1.94765463600135e-05, + "loss": 0.9487, + "num_tokens": 9559111929.0, + "step": 2287 + }, + { + "epoch": 0.2718954248366013, + "grad_norm": 0.5936523748558382, + "learning_rate": 1.9475964148626133e-05, + "loss": 0.865, + "num_tokens": 9563258061.0, + "step": 2288 + }, + { + "epoch": 0.2720142602495544, + "grad_norm": 0.5210725711174026, + "learning_rate": 1.9475381623345154e-05, + "loss": 0.9371, + "num_tokens": 9567446425.0, + "step": 2289 + }, + { + "epoch": 0.27213309566250743, + "grad_norm": 0.581505038979266, + "learning_rate": 1.947479878419214e-05, + "loss": 0.8981, + "num_tokens": 9571635703.0, + "step": 2290 + }, + { + "epoch": 0.2722519310754605, + "grad_norm": 0.5751911047706753, + "learning_rate": 1.9474215631188673e-05, + "loss": 0.9172, + "num_tokens": 9575824261.0, + "step": 2291 + }, + { + "epoch": 0.2723707664884136, + "grad_norm": 0.5944030458551114, + "learning_rate": 1.947363216435635e-05, + "loss": 0.9202, + "num_tokens": 9580009241.0, + "step": 2292 + }, + { + "epoch": 0.2724896019013666, + "grad_norm": 0.5323616521100957, + "learning_rate": 1.9473048383716773e-05, + "loss": 0.9505, + "num_tokens": 9584197775.0, + "step": 2293 + }, + { + "epoch": 0.27260843731431966, + "grad_norm": 0.5880542405979234, + "learning_rate": 1.947246428929157e-05, + "loss": 0.8938, + "num_tokens": 9588368901.0, + "step": 2294 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 0.6039220688943, + "learning_rate": 1.9471879881102366e-05, + "loss": 0.9077, + "num_tokens": 9592558005.0, + "step": 2295 + }, + { + "epoch": 0.2728461081402258, + "grad_norm": 0.6268452854754222, + "learning_rate": 1.947129515917081e-05, + "loss": 0.9444, + "num_tokens": 9596746161.0, + "step": 2296 + }, + { + "epoch": 0.27296494355317885, + "grad_norm": 0.5241581388117575, + "learning_rate": 1.947071012351855e-05, + "loss": 0.9, + "num_tokens": 9600934718.0, + "step": 2297 + }, + { + "epoch": 0.2730837789661319, + "grad_norm": 0.5399315881548512, + "learning_rate": 1.9470124774167254e-05, + "loss": 0.9022, + "num_tokens": 9605106918.0, + "step": 2298 + }, + { + "epoch": 0.27320261437908494, + "grad_norm": 0.5728028714063439, + "learning_rate": 1.94695391111386e-05, + "loss": 0.9415, + "num_tokens": 9609296136.0, + "step": 2299 + }, + { + "epoch": 0.27332144979203804, + "grad_norm": 0.5752751830707885, + "learning_rate": 1.9468953134454277e-05, + "loss": 0.9275, + "num_tokens": 9613483168.0, + "step": 2300 + }, + { + "epoch": 0.2734402852049911, + "grad_norm": 0.5558026155617841, + "learning_rate": 1.946836684413598e-05, + "loss": 0.932, + "num_tokens": 9617671467.0, + "step": 2301 + }, + { + "epoch": 0.27355912061794413, + "grad_norm": 0.6168228694919712, + "learning_rate": 1.946778024020543e-05, + "loss": 0.9157, + "num_tokens": 9621860618.0, + "step": 2302 + }, + { + "epoch": 0.27367795603089723, + "grad_norm": 0.588243599349337, + "learning_rate": 1.946719332268435e-05, + "loss": 0.8954, + "num_tokens": 9626019442.0, + "step": 2303 + }, + { + "epoch": 0.2737967914438503, + "grad_norm": 0.6462759647237293, + "learning_rate": 1.946660609159447e-05, + "loss": 0.9204, + "num_tokens": 9630209291.0, + "step": 2304 + }, + { + "epoch": 0.2739156268568033, + "grad_norm": 0.5652567997666581, + "learning_rate": 1.9466018546957542e-05, + "loss": 0.8892, + "num_tokens": 9634399530.0, + "step": 2305 + }, + { + "epoch": 0.27403446226975636, + "grad_norm": 0.67931722528057, + "learning_rate": 1.946543068879532e-05, + "loss": 0.8925, + "num_tokens": 9638589605.0, + "step": 2306 + }, + { + "epoch": 0.27415329768270946, + "grad_norm": 0.5988591347031502, + "learning_rate": 1.946484251712958e-05, + "loss": 0.9701, + "num_tokens": 9642778624.0, + "step": 2307 + }, + { + "epoch": 0.2742721330956625, + "grad_norm": 0.6490100163921442, + "learning_rate": 1.9464254031982104e-05, + "loss": 0.938, + "num_tokens": 9646945711.0, + "step": 2308 + }, + { + "epoch": 0.27439096850861555, + "grad_norm": 0.5343932905311125, + "learning_rate": 1.946366523337468e-05, + "loss": 0.9005, + "num_tokens": 9651133851.0, + "step": 2309 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.5803080162781835, + "learning_rate": 1.9463076121329112e-05, + "loss": 0.8912, + "num_tokens": 9655309355.0, + "step": 2310 + }, + { + "epoch": 0.2746286393345217, + "grad_norm": 0.6174597663420632, + "learning_rate": 1.9462486695867226e-05, + "loss": 0.9438, + "num_tokens": 9659452905.0, + "step": 2311 + }, + { + "epoch": 0.27474747474747474, + "grad_norm": 0.5453817808673476, + "learning_rate": 1.9461896957010844e-05, + "loss": 0.8861, + "num_tokens": 9663643580.0, + "step": 2312 + }, + { + "epoch": 0.2748663101604278, + "grad_norm": 0.5839917514032433, + "learning_rate": 1.9461306904781806e-05, + "loss": 0.9332, + "num_tokens": 9667804489.0, + "step": 2313 + }, + { + "epoch": 0.2749851455733809, + "grad_norm": 0.570254110516627, + "learning_rate": 1.9460716539201966e-05, + "loss": 0.9123, + "num_tokens": 9671992508.0, + "step": 2314 + }, + { + "epoch": 0.27510398098633393, + "grad_norm": 0.5634229040703077, + "learning_rate": 1.946012586029319e-05, + "loss": 0.9122, + "num_tokens": 9676183406.0, + "step": 2315 + }, + { + "epoch": 0.275222816399287, + "grad_norm": 0.5121797185958076, + "learning_rate": 1.9459534868077344e-05, + "loss": 0.9294, + "num_tokens": 9680371640.0, + "step": 2316 + }, + { + "epoch": 0.2753416518122401, + "grad_norm": 0.5996377003418119, + "learning_rate": 1.945894356257632e-05, + "loss": 0.9047, + "num_tokens": 9684561494.0, + "step": 2317 + }, + { + "epoch": 0.2754604872251931, + "grad_norm": 0.6439855098929609, + "learning_rate": 1.9458351943812017e-05, + "loss": 0.9289, + "num_tokens": 9688726482.0, + "step": 2318 + }, + { + "epoch": 0.27557932263814616, + "grad_norm": 0.6013560527281085, + "learning_rate": 1.945776001180634e-05, + "loss": 0.8937, + "num_tokens": 9692916526.0, + "step": 2319 + }, + { + "epoch": 0.2756981580510992, + "grad_norm": 0.6837558545624418, + "learning_rate": 1.9457167766581216e-05, + "loss": 0.9215, + "num_tokens": 9697107748.0, + "step": 2320 + }, + { + "epoch": 0.2758169934640523, + "grad_norm": 0.5022569392605406, + "learning_rate": 1.9456575208158578e-05, + "loss": 0.9157, + "num_tokens": 9701296056.0, + "step": 2321 + }, + { + "epoch": 0.27593582887700535, + "grad_norm": 0.7603455061656841, + "learning_rate": 1.9455982336560365e-05, + "loss": 0.9002, + "num_tokens": 9705486326.0, + "step": 2322 + }, + { + "epoch": 0.2760546642899584, + "grad_norm": 0.5113592698721658, + "learning_rate": 1.9455389151808535e-05, + "loss": 0.9458, + "num_tokens": 9709652981.0, + "step": 2323 + }, + { + "epoch": 0.27617349970291144, + "grad_norm": 0.5978043135504189, + "learning_rate": 1.9454795653925056e-05, + "loss": 0.9152, + "num_tokens": 9713841792.0, + "step": 2324 + }, + { + "epoch": 0.27629233511586454, + "grad_norm": 0.637648721914358, + "learning_rate": 1.945420184293191e-05, + "loss": 0.904, + "num_tokens": 9718030614.0, + "step": 2325 + }, + { + "epoch": 0.2764111705288176, + "grad_norm": 0.5511408086246421, + "learning_rate": 1.945360771885108e-05, + "loss": 0.9454, + "num_tokens": 9722172365.0, + "step": 2326 + }, + { + "epoch": 0.27653000594177063, + "grad_norm": 0.5429108777783517, + "learning_rate": 1.945301328170458e-05, + "loss": 0.9364, + "num_tokens": 9726354039.0, + "step": 2327 + }, + { + "epoch": 0.27664884135472373, + "grad_norm": 0.6596889548042008, + "learning_rate": 1.9452418531514417e-05, + "loss": 0.9189, + "num_tokens": 9730486075.0, + "step": 2328 + }, + { + "epoch": 0.2767676767676768, + "grad_norm": 0.5798307489232968, + "learning_rate": 1.9451823468302618e-05, + "loss": 0.9059, + "num_tokens": 9734640624.0, + "step": 2329 + }, + { + "epoch": 0.2768865121806298, + "grad_norm": 0.5923586165571064, + "learning_rate": 1.9451228092091215e-05, + "loss": 0.9135, + "num_tokens": 9738829821.0, + "step": 2330 + }, + { + "epoch": 0.27700534759358286, + "grad_norm": 0.6186427695753577, + "learning_rate": 1.9450632402902264e-05, + "loss": 0.9473, + "num_tokens": 9743019828.0, + "step": 2331 + }, + { + "epoch": 0.27712418300653596, + "grad_norm": 0.5606019566599751, + "learning_rate": 1.9450036400757823e-05, + "loss": 0.8736, + "num_tokens": 9747209539.0, + "step": 2332 + }, + { + "epoch": 0.277243018419489, + "grad_norm": 0.6606185037755359, + "learning_rate": 1.944944008567996e-05, + "loss": 0.9728, + "num_tokens": 9751400339.0, + "step": 2333 + }, + { + "epoch": 0.27736185383244205, + "grad_norm": 0.5253862772904494, + "learning_rate": 1.944884345769077e-05, + "loss": 0.9183, + "num_tokens": 9755590033.0, + "step": 2334 + }, + { + "epoch": 0.27748068924539515, + "grad_norm": 0.5213731715659183, + "learning_rate": 1.9448246516812335e-05, + "loss": 0.9329, + "num_tokens": 9759770369.0, + "step": 2335 + }, + { + "epoch": 0.2775995246583482, + "grad_norm": 0.5269673228679257, + "learning_rate": 1.9447649263066774e-05, + "loss": 0.912, + "num_tokens": 9763947967.0, + "step": 2336 + }, + { + "epoch": 0.27771836007130124, + "grad_norm": 0.6136649670509581, + "learning_rate": 1.944705169647619e-05, + "loss": 0.9318, + "num_tokens": 9768137826.0, + "step": 2337 + }, + { + "epoch": 0.2778371954842543, + "grad_norm": 0.6012627292006569, + "learning_rate": 1.944645381706273e-05, + "loss": 0.9078, + "num_tokens": 9772294654.0, + "step": 2338 + }, + { + "epoch": 0.2779560308972074, + "grad_norm": 0.4497022313779991, + "learning_rate": 1.944585562484852e-05, + "loss": 0.9434, + "num_tokens": 9776465993.0, + "step": 2339 + }, + { + "epoch": 0.27807486631016043, + "grad_norm": 0.5656538934790313, + "learning_rate": 1.9445257119855722e-05, + "loss": 0.9048, + "num_tokens": 9780655421.0, + "step": 2340 + }, + { + "epoch": 0.2781937017231135, + "grad_norm": 0.5947240041842254, + "learning_rate": 1.9444658302106497e-05, + "loss": 0.945, + "num_tokens": 9784844265.0, + "step": 2341 + }, + { + "epoch": 0.2783125371360666, + "grad_norm": 0.6937632928289514, + "learning_rate": 1.9444059171623028e-05, + "loss": 0.8853, + "num_tokens": 9789024274.0, + "step": 2342 + }, + { + "epoch": 0.2784313725490196, + "grad_norm": 0.5946797769829888, + "learning_rate": 1.9443459728427493e-05, + "loss": 0.9152, + "num_tokens": 9793212838.0, + "step": 2343 + }, + { + "epoch": 0.27855020796197266, + "grad_norm": 0.5322537365954145, + "learning_rate": 1.94428599725421e-05, + "loss": 0.9152, + "num_tokens": 9797400905.0, + "step": 2344 + }, + { + "epoch": 0.2786690433749257, + "grad_norm": 0.6253162693231316, + "learning_rate": 1.9442259903989055e-05, + "loss": 0.9087, + "num_tokens": 9801589908.0, + "step": 2345 + }, + { + "epoch": 0.2787878787878788, + "grad_norm": 0.5639768074646198, + "learning_rate": 1.944165952279058e-05, + "loss": 0.9044, + "num_tokens": 9805712285.0, + "step": 2346 + }, + { + "epoch": 0.27890671420083185, + "grad_norm": 0.5547879451114768, + "learning_rate": 1.9441058828968914e-05, + "loss": 0.9303, + "num_tokens": 9809900764.0, + "step": 2347 + }, + { + "epoch": 0.2790255496137849, + "grad_norm": 0.7542594452584174, + "learning_rate": 1.9440457822546296e-05, + "loss": 0.9077, + "num_tokens": 9814074930.0, + "step": 2348 + }, + { + "epoch": 0.279144385026738, + "grad_norm": 0.5157951238773656, + "learning_rate": 1.9439856503544986e-05, + "loss": 0.9081, + "num_tokens": 9818263517.0, + "step": 2349 + }, + { + "epoch": 0.27926322043969104, + "grad_norm": 0.6002399982190864, + "learning_rate": 1.943925487198726e-05, + "loss": 0.8736, + "num_tokens": 9822431099.0, + "step": 2350 + }, + { + "epoch": 0.2793820558526441, + "grad_norm": 0.5695762148780952, + "learning_rate": 1.9438652927895387e-05, + "loss": 0.948, + "num_tokens": 9826620671.0, + "step": 2351 + }, + { + "epoch": 0.27950089126559713, + "grad_norm": 0.5668034888153429, + "learning_rate": 1.943805067129167e-05, + "loss": 0.8778, + "num_tokens": 9830811267.0, + "step": 2352 + }, + { + "epoch": 0.27961972667855023, + "grad_norm": 0.6335770646644532, + "learning_rate": 1.94374481021984e-05, + "loss": 0.9002, + "num_tokens": 9835001293.0, + "step": 2353 + }, + { + "epoch": 0.2797385620915033, + "grad_norm": 0.6067917558732545, + "learning_rate": 1.9436845220637903e-05, + "loss": 0.9157, + "num_tokens": 9839190693.0, + "step": 2354 + }, + { + "epoch": 0.2798573975044563, + "grad_norm": 0.45933760861465517, + "learning_rate": 1.94362420266325e-05, + "loss": 0.9222, + "num_tokens": 9843366932.0, + "step": 2355 + }, + { + "epoch": 0.27997623291740936, + "grad_norm": 0.6035076362309068, + "learning_rate": 1.9435638520204534e-05, + "loss": 0.9303, + "num_tokens": 9847556188.0, + "step": 2356 + }, + { + "epoch": 0.28009506833036246, + "grad_norm": 0.6286462262702828, + "learning_rate": 1.943503470137635e-05, + "loss": 0.8984, + "num_tokens": 9851746309.0, + "step": 2357 + }, + { + "epoch": 0.2802139037433155, + "grad_norm": 0.5865697620537622, + "learning_rate": 1.943443057017031e-05, + "loss": 0.9167, + "num_tokens": 9855936005.0, + "step": 2358 + }, + { + "epoch": 0.28033273915626855, + "grad_norm": 0.5415656127795447, + "learning_rate": 1.943382612660879e-05, + "loss": 0.9165, + "num_tokens": 9860115074.0, + "step": 2359 + }, + { + "epoch": 0.28045157456922165, + "grad_norm": 0.651849582659351, + "learning_rate": 1.9433221370714174e-05, + "loss": 0.9311, + "num_tokens": 9864280427.0, + "step": 2360 + }, + { + "epoch": 0.2805704099821747, + "grad_norm": 0.5202659532628104, + "learning_rate": 1.9432616302508857e-05, + "loss": 0.9294, + "num_tokens": 9868458287.0, + "step": 2361 + }, + { + "epoch": 0.28068924539512774, + "grad_norm": 0.5770026601965685, + "learning_rate": 1.943201092201525e-05, + "loss": 0.8839, + "num_tokens": 9872644216.0, + "step": 2362 + }, + { + "epoch": 0.2808080808080808, + "grad_norm": 0.724829878593457, + "learning_rate": 1.943140522925576e-05, + "loss": 0.8789, + "num_tokens": 9876833525.0, + "step": 2363 + }, + { + "epoch": 0.2809269162210339, + "grad_norm": 0.536271484753607, + "learning_rate": 1.9430799224252834e-05, + "loss": 0.9271, + "num_tokens": 9881009581.0, + "step": 2364 + }, + { + "epoch": 0.28104575163398693, + "grad_norm": 0.6152564620235895, + "learning_rate": 1.9430192907028905e-05, + "loss": 0.8973, + "num_tokens": 9885161208.0, + "step": 2365 + }, + { + "epoch": 0.28116458704694, + "grad_norm": 0.6426861275648013, + "learning_rate": 1.9429586277606435e-05, + "loss": 0.9621, + "num_tokens": 9889320793.0, + "step": 2366 + }, + { + "epoch": 0.2812834224598931, + "grad_norm": 0.6268880104418965, + "learning_rate": 1.942897933600788e-05, + "loss": 0.8884, + "num_tokens": 9893456620.0, + "step": 2367 + }, + { + "epoch": 0.2814022578728461, + "grad_norm": 0.5319276263685184, + "learning_rate": 1.942837208225572e-05, + "loss": 0.8974, + "num_tokens": 9897618476.0, + "step": 2368 + }, + { + "epoch": 0.28152109328579916, + "grad_norm": 0.6325128326072101, + "learning_rate": 1.9427764516372446e-05, + "loss": 0.8704, + "num_tokens": 9901807900.0, + "step": 2369 + }, + { + "epoch": 0.2816399286987522, + "grad_norm": 0.5288658792366477, + "learning_rate": 1.942715663838056e-05, + "loss": 0.8947, + "num_tokens": 9905973105.0, + "step": 2370 + }, + { + "epoch": 0.2817587641117053, + "grad_norm": 0.5462931848168381, + "learning_rate": 1.9426548448302564e-05, + "loss": 0.9417, + "num_tokens": 9910133143.0, + "step": 2371 + }, + { + "epoch": 0.28187759952465835, + "grad_norm": 0.5995389482517564, + "learning_rate": 1.9425939946160993e-05, + "loss": 0.9183, + "num_tokens": 9914311480.0, + "step": 2372 + }, + { + "epoch": 0.2819964349376114, + "grad_norm": 0.4851988118602142, + "learning_rate": 1.942533113197838e-05, + "loss": 0.9237, + "num_tokens": 9918501336.0, + "step": 2373 + }, + { + "epoch": 0.2821152703505645, + "grad_norm": 0.5578063694743337, + "learning_rate": 1.942472200577726e-05, + "loss": 0.9308, + "num_tokens": 9922633674.0, + "step": 2374 + }, + { + "epoch": 0.28223410576351754, + "grad_norm": 0.6650172336688152, + "learning_rate": 1.9424112567580205e-05, + "loss": 0.9182, + "num_tokens": 9926822799.0, + "step": 2375 + }, + { + "epoch": 0.2823529411764706, + "grad_norm": 0.5831768696477403, + "learning_rate": 1.9423502817409776e-05, + "loss": 0.9251, + "num_tokens": 9931013686.0, + "step": 2376 + }, + { + "epoch": 0.28247177658942363, + "grad_norm": 0.6380519442833257, + "learning_rate": 1.9422892755288557e-05, + "loss": 0.9381, + "num_tokens": 9935203955.0, + "step": 2377 + }, + { + "epoch": 0.28259061200237673, + "grad_norm": 0.5756940086464541, + "learning_rate": 1.9422282381239144e-05, + "loss": 0.8773, + "num_tokens": 9939377593.0, + "step": 2378 + }, + { + "epoch": 0.2827094474153298, + "grad_norm": 0.6856855194558251, + "learning_rate": 1.942167169528414e-05, + "loss": 0.8905, + "num_tokens": 9943566889.0, + "step": 2379 + }, + { + "epoch": 0.2828282828282828, + "grad_norm": 0.5345819228509507, + "learning_rate": 1.9421060697446156e-05, + "loss": 0.9123, + "num_tokens": 9947756187.0, + "step": 2380 + }, + { + "epoch": 0.28294711824123586, + "grad_norm": 0.5181970464441518, + "learning_rate": 1.942044938774782e-05, + "loss": 0.888, + "num_tokens": 9951946389.0, + "step": 2381 + }, + { + "epoch": 0.28306595365418896, + "grad_norm": 0.5435948027468884, + "learning_rate": 1.9419837766211772e-05, + "loss": 0.9122, + "num_tokens": 9956135018.0, + "step": 2382 + }, + { + "epoch": 0.283184789067142, + "grad_norm": 0.5887836443746911, + "learning_rate": 1.941922583286067e-05, + "loss": 0.8846, + "num_tokens": 9960325299.0, + "step": 2383 + }, + { + "epoch": 0.28330362448009505, + "grad_norm": 0.6074918240201526, + "learning_rate": 1.9418613587717164e-05, + "loss": 0.946, + "num_tokens": 9964513597.0, + "step": 2384 + }, + { + "epoch": 0.28342245989304815, + "grad_norm": 0.5550889115291149, + "learning_rate": 1.941800103080393e-05, + "loss": 0.918, + "num_tokens": 9968700802.0, + "step": 2385 + }, + { + "epoch": 0.2835412953060012, + "grad_norm": 0.5507170952894391, + "learning_rate": 1.9417388162143663e-05, + "loss": 0.934, + "num_tokens": 9972884401.0, + "step": 2386 + }, + { + "epoch": 0.28366013071895424, + "grad_norm": 0.5538839168462105, + "learning_rate": 1.941677498175905e-05, + "loss": 0.9734, + "num_tokens": 9977074464.0, + "step": 2387 + }, + { + "epoch": 0.2837789661319073, + "grad_norm": 0.7440893204418567, + "learning_rate": 1.9416161489672802e-05, + "loss": 0.9257, + "num_tokens": 9981259165.0, + "step": 2388 + }, + { + "epoch": 0.2838978015448604, + "grad_norm": 0.522304796197485, + "learning_rate": 1.9415547685907636e-05, + "loss": 0.9484, + "num_tokens": 9985445747.0, + "step": 2389 + }, + { + "epoch": 0.28401663695781343, + "grad_norm": 0.6640792984294646, + "learning_rate": 1.9414933570486288e-05, + "loss": 0.9445, + "num_tokens": 9989597118.0, + "step": 2390 + }, + { + "epoch": 0.2841354723707665, + "grad_norm": 0.4999900410320351, + "learning_rate": 1.9414319143431498e-05, + "loss": 0.9339, + "num_tokens": 9993785294.0, + "step": 2391 + }, + { + "epoch": 0.2842543077837196, + "grad_norm": 0.5615558015158467, + "learning_rate": 1.9413704404766018e-05, + "loss": 0.9216, + "num_tokens": 9997975113.0, + "step": 2392 + }, + { + "epoch": 0.2843731431966726, + "grad_norm": 0.6306818564859762, + "learning_rate": 1.9413089354512618e-05, + "loss": 0.8807, + "num_tokens": 10002151476.0, + "step": 2393 + }, + { + "epoch": 0.28449197860962566, + "grad_norm": 0.5738286025368237, + "learning_rate": 1.9412473992694073e-05, + "loss": 0.9012, + "num_tokens": 10006341198.0, + "step": 2394 + }, + { + "epoch": 0.2846108140225787, + "grad_norm": 0.5733638343701835, + "learning_rate": 1.9411858319333178e-05, + "loss": 0.9384, + "num_tokens": 10010529522.0, + "step": 2395 + }, + { + "epoch": 0.2847296494355318, + "grad_norm": 0.5327724255001719, + "learning_rate": 1.941124233445272e-05, + "loss": 0.9107, + "num_tokens": 10014718906.0, + "step": 2396 + }, + { + "epoch": 0.28484848484848485, + "grad_norm": 0.6666731093894481, + "learning_rate": 1.9410626038075525e-05, + "loss": 0.9385, + "num_tokens": 10018897788.0, + "step": 2397 + }, + { + "epoch": 0.2849673202614379, + "grad_norm": 0.5038619204690906, + "learning_rate": 1.941000943022441e-05, + "loss": 0.8653, + "num_tokens": 10023061050.0, + "step": 2398 + }, + { + "epoch": 0.285086155674391, + "grad_norm": 0.6308704895855844, + "learning_rate": 1.9409392510922212e-05, + "loss": 0.9408, + "num_tokens": 10027250314.0, + "step": 2399 + }, + { + "epoch": 0.28520499108734404, + "grad_norm": 0.5205955943293993, + "learning_rate": 1.9408775280191775e-05, + "loss": 0.9145, + "num_tokens": 10031439481.0, + "step": 2400 + }, + { + "epoch": 0.2853238265002971, + "grad_norm": 0.6138543840201632, + "learning_rate": 1.9408157738055956e-05, + "loss": 0.9498, + "num_tokens": 10035628392.0, + "step": 2401 + }, + { + "epoch": 0.28544266191325013, + "grad_norm": 0.5390025364979628, + "learning_rate": 1.9407539884537628e-05, + "loss": 0.9435, + "num_tokens": 10039803060.0, + "step": 2402 + }, + { + "epoch": 0.28556149732620323, + "grad_norm": 0.632424576032908, + "learning_rate": 1.9406921719659676e-05, + "loss": 0.8872, + "num_tokens": 10043993011.0, + "step": 2403 + }, + { + "epoch": 0.2856803327391563, + "grad_norm": 0.656225168644369, + "learning_rate": 1.9406303243444986e-05, + "loss": 0.9022, + "num_tokens": 10048181053.0, + "step": 2404 + }, + { + "epoch": 0.2857991681521093, + "grad_norm": 0.5448861486608042, + "learning_rate": 1.9405684455916466e-05, + "loss": 0.9153, + "num_tokens": 10052346994.0, + "step": 2405 + }, + { + "epoch": 0.28591800356506236, + "grad_norm": 0.5152489734980695, + "learning_rate": 1.9405065357097025e-05, + "loss": 0.8882, + "num_tokens": 10056534397.0, + "step": 2406 + }, + { + "epoch": 0.28603683897801546, + "grad_norm": 0.6977174159403836, + "learning_rate": 1.94044459470096e-05, + "loss": 0.9197, + "num_tokens": 10060722992.0, + "step": 2407 + }, + { + "epoch": 0.2861556743909685, + "grad_norm": 0.43097706158116295, + "learning_rate": 1.9403826225677126e-05, + "loss": 0.887, + "num_tokens": 10064886697.0, + "step": 2408 + }, + { + "epoch": 0.28627450980392155, + "grad_norm": 0.7946952806625233, + "learning_rate": 1.940320619312255e-05, + "loss": 0.8923, + "num_tokens": 10069072930.0, + "step": 2409 + }, + { + "epoch": 0.28639334521687465, + "grad_norm": 0.5466096913439437, + "learning_rate": 1.940258584936884e-05, + "loss": 0.9133, + "num_tokens": 10073249453.0, + "step": 2410 + }, + { + "epoch": 0.2865121806298277, + "grad_norm": 0.6843198483437047, + "learning_rate": 1.9401965194438963e-05, + "loss": 0.9034, + "num_tokens": 10077438507.0, + "step": 2411 + }, + { + "epoch": 0.28663101604278074, + "grad_norm": 0.5002729353125986, + "learning_rate": 1.940134422835591e-05, + "loss": 0.8828, + "num_tokens": 10081627436.0, + "step": 2412 + }, + { + "epoch": 0.2867498514557338, + "grad_norm": 0.5849396535721417, + "learning_rate": 1.940072295114267e-05, + "loss": 0.8972, + "num_tokens": 10085789867.0, + "step": 2413 + }, + { + "epoch": 0.2868686868686869, + "grad_norm": 0.5824123074506251, + "learning_rate": 1.940010136282226e-05, + "loss": 0.9258, + "num_tokens": 10089975937.0, + "step": 2414 + }, + { + "epoch": 0.28698752228163993, + "grad_norm": 0.5972309972180022, + "learning_rate": 1.9399479463417694e-05, + "loss": 0.9013, + "num_tokens": 10094155826.0, + "step": 2415 + }, + { + "epoch": 0.287106357694593, + "grad_norm": 0.6402682844145421, + "learning_rate": 1.9398857252952004e-05, + "loss": 0.9133, + "num_tokens": 10098320491.0, + "step": 2416 + }, + { + "epoch": 0.2872251931075461, + "grad_norm": 0.6181138109666255, + "learning_rate": 1.939823473144823e-05, + "loss": 0.8918, + "num_tokens": 10102508892.0, + "step": 2417 + }, + { + "epoch": 0.2873440285204991, + "grad_norm": 0.6641166923547015, + "learning_rate": 1.9397611898929435e-05, + "loss": 0.8938, + "num_tokens": 10106699665.0, + "step": 2418 + }, + { + "epoch": 0.28746286393345216, + "grad_norm": 0.47049921112000875, + "learning_rate": 1.939698875541867e-05, + "loss": 0.9329, + "num_tokens": 10110887634.0, + "step": 2419 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 0.6102180462184794, + "learning_rate": 1.939636530093903e-05, + "loss": 0.8967, + "num_tokens": 10115076491.0, + "step": 2420 + }, + { + "epoch": 0.2877005347593583, + "grad_norm": 0.5330431553372121, + "learning_rate": 1.939574153551359e-05, + "loss": 0.9049, + "num_tokens": 10119266188.0, + "step": 2421 + }, + { + "epoch": 0.28781937017231135, + "grad_norm": 0.5587189053713396, + "learning_rate": 1.9395117459165447e-05, + "loss": 0.8682, + "num_tokens": 10123433724.0, + "step": 2422 + }, + { + "epoch": 0.2879382055852644, + "grad_norm": 0.5287184795056293, + "learning_rate": 1.9394493071917732e-05, + "loss": 0.9137, + "num_tokens": 10127592344.0, + "step": 2423 + }, + { + "epoch": 0.2880570409982175, + "grad_norm": 0.5725547179340318, + "learning_rate": 1.939386837379355e-05, + "loss": 0.8798, + "num_tokens": 10131782236.0, + "step": 2424 + }, + { + "epoch": 0.28817587641117054, + "grad_norm": 0.6110797533936577, + "learning_rate": 1.939324336481604e-05, + "loss": 0.8966, + "num_tokens": 10135972441.0, + "step": 2425 + }, + { + "epoch": 0.2882947118241236, + "grad_norm": 0.5045243186989619, + "learning_rate": 1.9392618045008353e-05, + "loss": 0.9112, + "num_tokens": 10140161254.0, + "step": 2426 + }, + { + "epoch": 0.28841354723707663, + "grad_norm": 0.5181908677209917, + "learning_rate": 1.9391992414393642e-05, + "loss": 0.9292, + "num_tokens": 10144325660.0, + "step": 2427 + }, + { + "epoch": 0.28853238265002973, + "grad_norm": 0.6078690584520431, + "learning_rate": 1.939136647299508e-05, + "loss": 0.9071, + "num_tokens": 10148515624.0, + "step": 2428 + }, + { + "epoch": 0.2886512180629828, + "grad_norm": 0.5527438118459989, + "learning_rate": 1.9390740220835845e-05, + "loss": 0.9204, + "num_tokens": 10152687654.0, + "step": 2429 + }, + { + "epoch": 0.2887700534759358, + "grad_norm": 0.7242738186823073, + "learning_rate": 1.939011365793913e-05, + "loss": 0.9054, + "num_tokens": 10156876571.0, + "step": 2430 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.5453894257709186, + "learning_rate": 1.9389486784328138e-05, + "loss": 0.9081, + "num_tokens": 10161024241.0, + "step": 2431 + }, + { + "epoch": 0.28900772430184196, + "grad_norm": 0.6467147580128527, + "learning_rate": 1.9388859600026083e-05, + "loss": 0.8737, + "num_tokens": 10165213127.0, + "step": 2432 + }, + { + "epoch": 0.289126559714795, + "grad_norm": 0.6402794777070456, + "learning_rate": 1.9388232105056195e-05, + "loss": 0.9281, + "num_tokens": 10169375701.0, + "step": 2433 + }, + { + "epoch": 0.28924539512774805, + "grad_norm": 0.6296695928788533, + "learning_rate": 1.9387604299441713e-05, + "loss": 0.8919, + "num_tokens": 10173561575.0, + "step": 2434 + }, + { + "epoch": 0.28936423054070115, + "grad_norm": 0.48080973481289485, + "learning_rate": 1.938697618320588e-05, + "loss": 0.8883, + "num_tokens": 10177751904.0, + "step": 2435 + }, + { + "epoch": 0.2894830659536542, + "grad_norm": 0.6305774578340372, + "learning_rate": 1.9386347756371965e-05, + "loss": 0.9202, + "num_tokens": 10181941828.0, + "step": 2436 + }, + { + "epoch": 0.28960190136660724, + "grad_norm": 0.6559522886116742, + "learning_rate": 1.938571901896324e-05, + "loss": 0.8909, + "num_tokens": 10186110305.0, + "step": 2437 + }, + { + "epoch": 0.2897207367795603, + "grad_norm": 0.5574662879520824, + "learning_rate": 1.9385089971002987e-05, + "loss": 0.8895, + "num_tokens": 10190300497.0, + "step": 2438 + }, + { + "epoch": 0.2898395721925134, + "grad_norm": 0.6003132969249334, + "learning_rate": 1.9384460612514496e-05, + "loss": 0.8892, + "num_tokens": 10194466939.0, + "step": 2439 + }, + { + "epoch": 0.28995840760546643, + "grad_norm": 0.559101067985961, + "learning_rate": 1.9383830943521084e-05, + "loss": 0.9075, + "num_tokens": 10198626814.0, + "step": 2440 + }, + { + "epoch": 0.2900772430184195, + "grad_norm": 0.5731615764296815, + "learning_rate": 1.9383200964046064e-05, + "loss": 0.9153, + "num_tokens": 10202764364.0, + "step": 2441 + }, + { + "epoch": 0.2901960784313726, + "grad_norm": 0.6318196827014804, + "learning_rate": 1.938257067411277e-05, + "loss": 0.8972, + "num_tokens": 10206923832.0, + "step": 2442 + }, + { + "epoch": 0.2903149138443256, + "grad_norm": 0.5702559004395896, + "learning_rate": 1.938194007374454e-05, + "loss": 0.8955, + "num_tokens": 10211114023.0, + "step": 2443 + }, + { + "epoch": 0.29043374925727866, + "grad_norm": 0.6477520920755481, + "learning_rate": 1.938130916296473e-05, + "loss": 0.8837, + "num_tokens": 10215304273.0, + "step": 2444 + }, + { + "epoch": 0.2905525846702317, + "grad_norm": 0.5964009774233007, + "learning_rate": 1.9380677941796706e-05, + "loss": 0.915, + "num_tokens": 10219494443.0, + "step": 2445 + }, + { + "epoch": 0.2906714200831848, + "grad_norm": 0.5048589644200572, + "learning_rate": 1.9380046410263837e-05, + "loss": 0.9205, + "num_tokens": 10223684324.0, + "step": 2446 + }, + { + "epoch": 0.29079025549613785, + "grad_norm": 0.5252315951548567, + "learning_rate": 1.9379414568389517e-05, + "loss": 0.8956, + "num_tokens": 10227872433.0, + "step": 2447 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 0.5950911065596313, + "learning_rate": 1.9378782416197146e-05, + "loss": 0.8697, + "num_tokens": 10232040295.0, + "step": 2448 + }, + { + "epoch": 0.291027926322044, + "grad_norm": 0.5875514939238984, + "learning_rate": 1.9378149953710134e-05, + "loss": 0.9358, + "num_tokens": 10236211307.0, + "step": 2449 + }, + { + "epoch": 0.29114676173499704, + "grad_norm": 0.6161429352571229, + "learning_rate": 1.93775171809519e-05, + "loss": 0.8982, + "num_tokens": 10240367365.0, + "step": 2450 + }, + { + "epoch": 0.2912655971479501, + "grad_norm": 0.4888226405530671, + "learning_rate": 1.937688409794588e-05, + "loss": 0.875, + "num_tokens": 10244557472.0, + "step": 2451 + }, + { + "epoch": 0.29138443256090313, + "grad_norm": 0.6618723674267406, + "learning_rate": 1.937625070471552e-05, + "loss": 0.9162, + "num_tokens": 10248747274.0, + "step": 2452 + }, + { + "epoch": 0.29150326797385623, + "grad_norm": 0.5361184014926035, + "learning_rate": 1.9375617001284276e-05, + "loss": 0.9441, + "num_tokens": 10252911089.0, + "step": 2453 + }, + { + "epoch": 0.2916221033868093, + "grad_norm": 0.5770802313905803, + "learning_rate": 1.9374982987675613e-05, + "loss": 0.894, + "num_tokens": 10257080018.0, + "step": 2454 + }, + { + "epoch": 0.2917409387997623, + "grad_norm": 0.5908094940181462, + "learning_rate": 1.9374348663913013e-05, + "loss": 0.8985, + "num_tokens": 10261270130.0, + "step": 2455 + }, + { + "epoch": 0.29185977421271536, + "grad_norm": 0.573942548524021, + "learning_rate": 1.937371403001997e-05, + "loss": 0.8836, + "num_tokens": 10265459777.0, + "step": 2456 + }, + { + "epoch": 0.29197860962566846, + "grad_norm": 0.6145560767036476, + "learning_rate": 1.9373079086019986e-05, + "loss": 0.939, + "num_tokens": 10269628718.0, + "step": 2457 + }, + { + "epoch": 0.2920974450386215, + "grad_norm": 0.5580567248171163, + "learning_rate": 1.9372443831936572e-05, + "loss": 0.9381, + "num_tokens": 10273818241.0, + "step": 2458 + }, + { + "epoch": 0.29221628045157455, + "grad_norm": 0.49444747555978974, + "learning_rate": 1.9371808267793252e-05, + "loss": 0.9003, + "num_tokens": 10277997086.0, + "step": 2459 + }, + { + "epoch": 0.29233511586452765, + "grad_norm": 0.6257037222558832, + "learning_rate": 1.937117239361357e-05, + "loss": 0.8987, + "num_tokens": 10282175735.0, + "step": 2460 + }, + { + "epoch": 0.2924539512774807, + "grad_norm": 0.6425476955499453, + "learning_rate": 1.937053620942107e-05, + "loss": 0.9367, + "num_tokens": 10286358673.0, + "step": 2461 + }, + { + "epoch": 0.29257278669043374, + "grad_norm": 0.47983905808076005, + "learning_rate": 1.936989971523931e-05, + "loss": 0.8843, + "num_tokens": 10290549567.0, + "step": 2462 + }, + { + "epoch": 0.2926916221033868, + "grad_norm": 0.6784714589244152, + "learning_rate": 1.9369262911091868e-05, + "loss": 0.9197, + "num_tokens": 10294736944.0, + "step": 2463 + }, + { + "epoch": 0.2928104575163399, + "grad_norm": 0.572705001695848, + "learning_rate": 1.936862579700232e-05, + "loss": 0.937, + "num_tokens": 10298916506.0, + "step": 2464 + }, + { + "epoch": 0.29292929292929293, + "grad_norm": 0.5447800371099082, + "learning_rate": 1.9367988372994264e-05, + "loss": 0.8901, + "num_tokens": 10303105553.0, + "step": 2465 + }, + { + "epoch": 0.293048128342246, + "grad_norm": 0.7205234186231925, + "learning_rate": 1.9367350639091306e-05, + "loss": 0.9091, + "num_tokens": 10307293888.0, + "step": 2466 + }, + { + "epoch": 0.2931669637551991, + "grad_norm": 0.4993323579893363, + "learning_rate": 1.9366712595317064e-05, + "loss": 0.933, + "num_tokens": 10311456369.0, + "step": 2467 + }, + { + "epoch": 0.2932857991681521, + "grad_norm": 0.712823372442568, + "learning_rate": 1.9366074241695166e-05, + "loss": 0.891, + "num_tokens": 10315645384.0, + "step": 2468 + }, + { + "epoch": 0.29340463458110516, + "grad_norm": 0.523054181228716, + "learning_rate": 1.9365435578249252e-05, + "loss": 0.9088, + "num_tokens": 10319834643.0, + "step": 2469 + }, + { + "epoch": 0.2935234699940582, + "grad_norm": 0.6367610454330865, + "learning_rate": 1.9364796605002976e-05, + "loss": 0.9068, + "num_tokens": 10323989927.0, + "step": 2470 + }, + { + "epoch": 0.2936423054070113, + "grad_norm": 0.5277938482818769, + "learning_rate": 1.936415732198e-05, + "loss": 0.9486, + "num_tokens": 10328179089.0, + "step": 2471 + }, + { + "epoch": 0.29376114081996435, + "grad_norm": 0.6714768960744151, + "learning_rate": 1.9363517729203995e-05, + "loss": 0.9097, + "num_tokens": 10332368128.0, + "step": 2472 + }, + { + "epoch": 0.2938799762329174, + "grad_norm": 0.6456640923702556, + "learning_rate": 1.9362877826698655e-05, + "loss": 0.8873, + "num_tokens": 10336492541.0, + "step": 2473 + }, + { + "epoch": 0.2939988116458705, + "grad_norm": 0.5186373327993737, + "learning_rate": 1.936223761448767e-05, + "loss": 0.9401, + "num_tokens": 10340649438.0, + "step": 2474 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.5790461843124732, + "learning_rate": 1.936159709259475e-05, + "loss": 0.8729, + "num_tokens": 10344830730.0, + "step": 2475 + }, + { + "epoch": 0.2942364824717766, + "grad_norm": 0.4940261497524641, + "learning_rate": 1.9360956261043624e-05, + "loss": 0.9096, + "num_tokens": 10349021036.0, + "step": 2476 + }, + { + "epoch": 0.29435531788472963, + "grad_norm": 0.568831697587722, + "learning_rate": 1.9360315119858018e-05, + "loss": 0.9362, + "num_tokens": 10353183580.0, + "step": 2477 + }, + { + "epoch": 0.29447415329768273, + "grad_norm": 0.6628846411893861, + "learning_rate": 1.9359673669061675e-05, + "loss": 0.9356, + "num_tokens": 10357372821.0, + "step": 2478 + }, + { + "epoch": 0.2945929887106358, + "grad_norm": 0.4889363906181328, + "learning_rate": 1.935903190867835e-05, + "loss": 0.8846, + "num_tokens": 10361563722.0, + "step": 2479 + }, + { + "epoch": 0.2947118241235888, + "grad_norm": 0.544107849275185, + "learning_rate": 1.9358389838731812e-05, + "loss": 0.9478, + "num_tokens": 10365752872.0, + "step": 2480 + }, + { + "epoch": 0.2948306595365419, + "grad_norm": 0.6347084690106534, + "learning_rate": 1.935774745924584e-05, + "loss": 0.91, + "num_tokens": 10369941611.0, + "step": 2481 + }, + { + "epoch": 0.29494949494949496, + "grad_norm": 0.6076875776244245, + "learning_rate": 1.9357104770244216e-05, + "loss": 0.8734, + "num_tokens": 10374131507.0, + "step": 2482 + }, + { + "epoch": 0.295068330362448, + "grad_norm": 0.6674846251122484, + "learning_rate": 1.9356461771750753e-05, + "loss": 0.9288, + "num_tokens": 10378321007.0, + "step": 2483 + }, + { + "epoch": 0.29518716577540105, + "grad_norm": 0.5399751894796788, + "learning_rate": 1.9355818463789253e-05, + "loss": 0.943, + "num_tokens": 10382510348.0, + "step": 2484 + }, + { + "epoch": 0.29530600118835415, + "grad_norm": 0.6373366737535765, + "learning_rate": 1.9355174846383544e-05, + "loss": 0.8817, + "num_tokens": 10386699667.0, + "step": 2485 + }, + { + "epoch": 0.2954248366013072, + "grad_norm": 0.5491296063655028, + "learning_rate": 1.935453091955746e-05, + "loss": 0.8987, + "num_tokens": 10390889039.0, + "step": 2486 + }, + { + "epoch": 0.29554367201426024, + "grad_norm": 0.6793201061099128, + "learning_rate": 1.935388668333485e-05, + "loss": 0.941, + "num_tokens": 10395070822.0, + "step": 2487 + }, + { + "epoch": 0.2956625074272133, + "grad_norm": 0.5693932296798228, + "learning_rate": 1.9353242137739573e-05, + "loss": 0.8813, + "num_tokens": 10399211111.0, + "step": 2488 + }, + { + "epoch": 0.2957813428401664, + "grad_norm": 0.5622101913745062, + "learning_rate": 1.9352597282795492e-05, + "loss": 0.8985, + "num_tokens": 10403400106.0, + "step": 2489 + }, + { + "epoch": 0.29590017825311943, + "grad_norm": 0.6108103094802569, + "learning_rate": 1.9351952118526495e-05, + "loss": 0.9003, + "num_tokens": 10407591102.0, + "step": 2490 + }, + { + "epoch": 0.2960190136660725, + "grad_norm": 0.525207894667025, + "learning_rate": 1.9351306644956472e-05, + "loss": 0.93, + "num_tokens": 10411779325.0, + "step": 2491 + }, + { + "epoch": 0.2961378490790256, + "grad_norm": 0.6968176399146863, + "learning_rate": 1.9350660862109332e-05, + "loss": 0.9159, + "num_tokens": 10415966495.0, + "step": 2492 + }, + { + "epoch": 0.2962566844919786, + "grad_norm": 0.48006756390040767, + "learning_rate": 1.935001477000898e-05, + "loss": 0.9141, + "num_tokens": 10420155969.0, + "step": 2493 + }, + { + "epoch": 0.29637551990493166, + "grad_norm": 0.5041912743461172, + "learning_rate": 1.9349368368679355e-05, + "loss": 0.908, + "num_tokens": 10424345548.0, + "step": 2494 + }, + { + "epoch": 0.2964943553178847, + "grad_norm": 0.5979275405310503, + "learning_rate": 1.934872165814438e-05, + "loss": 0.877, + "num_tokens": 10428534389.0, + "step": 2495 + }, + { + "epoch": 0.2966131907308378, + "grad_norm": 0.6474065326442425, + "learning_rate": 1.9348074638428022e-05, + "loss": 0.8727, + "num_tokens": 10432707556.0, + "step": 2496 + }, + { + "epoch": 0.29673202614379085, + "grad_norm": 0.5311312214015549, + "learning_rate": 1.9347427309554233e-05, + "loss": 0.8845, + "num_tokens": 10436866318.0, + "step": 2497 + }, + { + "epoch": 0.2968508615567439, + "grad_norm": 0.635918386997025, + "learning_rate": 1.9346779671546987e-05, + "loss": 0.9074, + "num_tokens": 10441023165.0, + "step": 2498 + }, + { + "epoch": 0.296969696969697, + "grad_norm": 0.5618948611795564, + "learning_rate": 1.9346131724430267e-05, + "loss": 0.8919, + "num_tokens": 10445193421.0, + "step": 2499 + }, + { + "epoch": 0.29708853238265004, + "grad_norm": 0.595282576232755, + "learning_rate": 1.9345483468228073e-05, + "loss": 0.9112, + "num_tokens": 10449352140.0, + "step": 2500 + }, + { + "epoch": 0.2972073677956031, + "grad_norm": 0.5891038183792684, + "learning_rate": 1.9344834902964408e-05, + "loss": 0.9173, + "num_tokens": 10453541127.0, + "step": 2501 + }, + { + "epoch": 0.29732620320855613, + "grad_norm": 0.5605422487663534, + "learning_rate": 1.9344186028663295e-05, + "loss": 0.8874, + "num_tokens": 10457729731.0, + "step": 2502 + }, + { + "epoch": 0.29744503862150923, + "grad_norm": 0.6149500958979608, + "learning_rate": 1.9343536845348758e-05, + "loss": 0.9161, + "num_tokens": 10461902681.0, + "step": 2503 + }, + { + "epoch": 0.2975638740344623, + "grad_norm": 0.6186293747471876, + "learning_rate": 1.934288735304484e-05, + "loss": 0.9115, + "num_tokens": 10466092876.0, + "step": 2504 + }, + { + "epoch": 0.2976827094474153, + "grad_norm": 0.49107028791105684, + "learning_rate": 1.93422375517756e-05, + "loss": 0.9239, + "num_tokens": 10470266067.0, + "step": 2505 + }, + { + "epoch": 0.2978015448603684, + "grad_norm": 0.5851095795828678, + "learning_rate": 1.934158744156509e-05, + "loss": 0.9186, + "num_tokens": 10474455689.0, + "step": 2506 + }, + { + "epoch": 0.29792038027332146, + "grad_norm": 0.5620738211425116, + "learning_rate": 1.93409370224374e-05, + "loss": 0.9172, + "num_tokens": 10478644562.0, + "step": 2507 + }, + { + "epoch": 0.2980392156862745, + "grad_norm": 0.6409404031298493, + "learning_rate": 1.9340286294416608e-05, + "loss": 0.8727, + "num_tokens": 10482804404.0, + "step": 2508 + }, + { + "epoch": 0.29815805109922755, + "grad_norm": 0.6078092580081466, + "learning_rate": 1.9339635257526818e-05, + "loss": 0.9167, + "num_tokens": 10486993588.0, + "step": 2509 + }, + { + "epoch": 0.29827688651218065, + "grad_norm": 0.49791732599515093, + "learning_rate": 1.9338983911792132e-05, + "loss": 0.8752, + "num_tokens": 10491175329.0, + "step": 2510 + }, + { + "epoch": 0.2983957219251337, + "grad_norm": 0.7211560800609602, + "learning_rate": 1.9338332257236684e-05, + "loss": 0.9569, + "num_tokens": 10495340134.0, + "step": 2511 + }, + { + "epoch": 0.29851455733808674, + "grad_norm": 0.48261276686766513, + "learning_rate": 1.9337680293884595e-05, + "loss": 0.922, + "num_tokens": 10499528370.0, + "step": 2512 + }, + { + "epoch": 0.2986333927510398, + "grad_norm": 0.6607001712387749, + "learning_rate": 1.9337028021760017e-05, + "loss": 0.9031, + "num_tokens": 10503691799.0, + "step": 2513 + }, + { + "epoch": 0.2987522281639929, + "grad_norm": 0.5529672968086428, + "learning_rate": 1.9336375440887102e-05, + "loss": 0.9025, + "num_tokens": 10507880394.0, + "step": 2514 + }, + { + "epoch": 0.29887106357694593, + "grad_norm": 0.6550246918559728, + "learning_rate": 1.9335722551290017e-05, + "loss": 0.9633, + "num_tokens": 10512069266.0, + "step": 2515 + }, + { + "epoch": 0.298989898989899, + "grad_norm": 0.532145885708116, + "learning_rate": 1.9335069352992947e-05, + "loss": 0.9077, + "num_tokens": 10516236113.0, + "step": 2516 + }, + { + "epoch": 0.2991087344028521, + "grad_norm": 0.5211927952149753, + "learning_rate": 1.9334415846020072e-05, + "loss": 0.9337, + "num_tokens": 10520424510.0, + "step": 2517 + }, + { + "epoch": 0.2992275698158051, + "grad_norm": 0.749180309788141, + "learning_rate": 1.93337620303956e-05, + "loss": 0.9052, + "num_tokens": 10524612687.0, + "step": 2518 + }, + { + "epoch": 0.29934640522875816, + "grad_norm": 0.4828646817933909, + "learning_rate": 1.9333107906143743e-05, + "loss": 0.9173, + "num_tokens": 10528803645.0, + "step": 2519 + }, + { + "epoch": 0.2994652406417112, + "grad_norm": 0.6464356360258247, + "learning_rate": 1.933245347328873e-05, + "loss": 0.9068, + "num_tokens": 10532985561.0, + "step": 2520 + }, + { + "epoch": 0.2995840760546643, + "grad_norm": 0.5501999059918846, + "learning_rate": 1.9331798731854787e-05, + "loss": 0.9271, + "num_tokens": 10537173859.0, + "step": 2521 + }, + { + "epoch": 0.29970291146761735, + "grad_norm": 0.6785430041413649, + "learning_rate": 1.933114368186617e-05, + "loss": 0.9127, + "num_tokens": 10541363059.0, + "step": 2522 + }, + { + "epoch": 0.2998217468805704, + "grad_norm": 0.5339263562812002, + "learning_rate": 1.933048832334713e-05, + "loss": 0.9513, + "num_tokens": 10545551486.0, + "step": 2523 + }, + { + "epoch": 0.2999405822935235, + "grad_norm": 0.790442326497857, + "learning_rate": 1.9329832656321944e-05, + "loss": 0.9456, + "num_tokens": 10549740213.0, + "step": 2524 + }, + { + "epoch": 0.30005941770647654, + "grad_norm": 0.5544127599553562, + "learning_rate": 1.932917668081489e-05, + "loss": 0.942, + "num_tokens": 10553896618.0, + "step": 2525 + }, + { + "epoch": 0.3001782531194296, + "grad_norm": 0.5841571537977787, + "learning_rate": 1.9328520396850265e-05, + "loss": 0.882, + "num_tokens": 10558086573.0, + "step": 2526 + }, + { + "epoch": 0.30029708853238263, + "grad_norm": 0.6820808058250559, + "learning_rate": 1.9327863804452364e-05, + "loss": 0.9192, + "num_tokens": 10562275358.0, + "step": 2527 + }, + { + "epoch": 0.3004159239453357, + "grad_norm": 0.5110392729628013, + "learning_rate": 1.9327206903645516e-05, + "loss": 0.9377, + "num_tokens": 10566461278.0, + "step": 2528 + }, + { + "epoch": 0.30053475935828877, + "grad_norm": 0.6736258961036522, + "learning_rate": 1.9326549694454036e-05, + "loss": 0.9195, + "num_tokens": 10570615077.0, + "step": 2529 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 0.5463348559923559, + "learning_rate": 1.932589217690227e-05, + "loss": 0.9084, + "num_tokens": 10574803677.0, + "step": 2530 + }, + { + "epoch": 0.3007724301841949, + "grad_norm": 0.6168850756721415, + "learning_rate": 1.9325234351014565e-05, + "loss": 0.9434, + "num_tokens": 10578991977.0, + "step": 2531 + }, + { + "epoch": 0.30089126559714796, + "grad_norm": 0.5803495000353959, + "learning_rate": 1.9324576216815287e-05, + "loss": 0.8968, + "num_tokens": 10583180603.0, + "step": 2532 + }, + { + "epoch": 0.301010101010101, + "grad_norm": 0.5614455808475942, + "learning_rate": 1.9323917774328802e-05, + "loss": 0.9255, + "num_tokens": 10587370259.0, + "step": 2533 + }, + { + "epoch": 0.30112893642305405, + "grad_norm": 0.4767534176068326, + "learning_rate": 1.93232590235795e-05, + "loss": 0.9415, + "num_tokens": 10591541292.0, + "step": 2534 + }, + { + "epoch": 0.30124777183600715, + "grad_norm": 0.7430956849729125, + "learning_rate": 1.9322599964591773e-05, + "loss": 0.9264, + "num_tokens": 10595731386.0, + "step": 2535 + }, + { + "epoch": 0.3013666072489602, + "grad_norm": 0.6108195955770406, + "learning_rate": 1.9321940597390035e-05, + "loss": 0.8817, + "num_tokens": 10599920216.0, + "step": 2536 + }, + { + "epoch": 0.30148544266191324, + "grad_norm": 0.7048923406792369, + "learning_rate": 1.9321280921998693e-05, + "loss": 0.8895, + "num_tokens": 10604108943.0, + "step": 2537 + }, + { + "epoch": 0.3016042780748663, + "grad_norm": 0.5499932831820241, + "learning_rate": 1.9320620938442186e-05, + "loss": 0.9128, + "num_tokens": 10608295382.0, + "step": 2538 + }, + { + "epoch": 0.3017231134878194, + "grad_norm": 0.6552923672745739, + "learning_rate": 1.9319960646744958e-05, + "loss": 0.9026, + "num_tokens": 10612477576.0, + "step": 2539 + }, + { + "epoch": 0.3018419489007724, + "grad_norm": 0.5642572554383019, + "learning_rate": 1.931930004693145e-05, + "loss": 0.8918, + "num_tokens": 10616610887.0, + "step": 2540 + }, + { + "epoch": 0.30196078431372547, + "grad_norm": 0.6240399584151249, + "learning_rate": 1.9318639139026135e-05, + "loss": 0.9197, + "num_tokens": 10620799083.0, + "step": 2541 + }, + { + "epoch": 0.30207961972667857, + "grad_norm": 0.5640610293553756, + "learning_rate": 1.9317977923053488e-05, + "loss": 0.932, + "num_tokens": 10624987881.0, + "step": 2542 + }, + { + "epoch": 0.3021984551396316, + "grad_norm": 0.6144154690745794, + "learning_rate": 1.9317316399037993e-05, + "loss": 0.8971, + "num_tokens": 10629177774.0, + "step": 2543 + }, + { + "epoch": 0.30231729055258466, + "grad_norm": 0.544165488406227, + "learning_rate": 1.9316654567004148e-05, + "loss": 0.9066, + "num_tokens": 10633360482.0, + "step": 2544 + }, + { + "epoch": 0.3024361259655377, + "grad_norm": 0.6107375459666237, + "learning_rate": 1.9315992426976472e-05, + "loss": 0.9186, + "num_tokens": 10637544679.0, + "step": 2545 + }, + { + "epoch": 0.3025549613784908, + "grad_norm": 0.4393979949776145, + "learning_rate": 1.9315329978979475e-05, + "loss": 0.9067, + "num_tokens": 10641724272.0, + "step": 2546 + }, + { + "epoch": 0.30267379679144385, + "grad_norm": 0.5574088821226509, + "learning_rate": 1.9314667223037693e-05, + "loss": 0.88, + "num_tokens": 10645913041.0, + "step": 2547 + }, + { + "epoch": 0.3027926322043969, + "grad_norm": 0.584499308481123, + "learning_rate": 1.931400415917567e-05, + "loss": 0.9043, + "num_tokens": 10650086947.0, + "step": 2548 + }, + { + "epoch": 0.30291146761735, + "grad_norm": 0.5861680601057552, + "learning_rate": 1.931334078741797e-05, + "loss": 0.911, + "num_tokens": 10654275660.0, + "step": 2549 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.4866726703667211, + "learning_rate": 1.9312677107789143e-05, + "loss": 0.9132, + "num_tokens": 10658465089.0, + "step": 2550 + }, + { + "epoch": 0.3031491384432561, + "grad_norm": 0.739115392333312, + "learning_rate": 1.9312013120313786e-05, + "loss": 0.8988, + "num_tokens": 10662603163.0, + "step": 2551 + }, + { + "epoch": 0.3032679738562091, + "grad_norm": 0.5104293841533375, + "learning_rate": 1.9311348825016474e-05, + "loss": 0.8797, + "num_tokens": 10666793225.0, + "step": 2552 + }, + { + "epoch": 0.3033868092691622, + "grad_norm": 0.5205049128811535, + "learning_rate": 1.9310684221921813e-05, + "loss": 0.8932, + "num_tokens": 10670969081.0, + "step": 2553 + }, + { + "epoch": 0.30350564468211527, + "grad_norm": 0.6271753865749996, + "learning_rate": 1.9310019311054424e-05, + "loss": 0.9283, + "num_tokens": 10675156163.0, + "step": 2554 + }, + { + "epoch": 0.3036244800950683, + "grad_norm": 0.5085783810131829, + "learning_rate": 1.9309354092438914e-05, + "loss": 0.9119, + "num_tokens": 10679345877.0, + "step": 2555 + }, + { + "epoch": 0.3037433155080214, + "grad_norm": 0.6081050842743033, + "learning_rate": 1.9308688566099933e-05, + "loss": 0.8702, + "num_tokens": 10683535839.0, + "step": 2556 + }, + { + "epoch": 0.30386215092097446, + "grad_norm": 0.5521554127125357, + "learning_rate": 1.930802273206212e-05, + "loss": 0.8996, + "num_tokens": 10687712600.0, + "step": 2557 + }, + { + "epoch": 0.3039809863339275, + "grad_norm": 0.583131331526859, + "learning_rate": 1.9307356590350133e-05, + "loss": 0.929, + "num_tokens": 10691876798.0, + "step": 2558 + }, + { + "epoch": 0.30409982174688055, + "grad_norm": 0.6242067704153966, + "learning_rate": 1.9306690140988643e-05, + "loss": 0.9135, + "num_tokens": 10696064363.0, + "step": 2559 + }, + { + "epoch": 0.30421865715983365, + "grad_norm": 0.5103014030014185, + "learning_rate": 1.9306023384002333e-05, + "loss": 0.9004, + "num_tokens": 10700254286.0, + "step": 2560 + }, + { + "epoch": 0.3043374925727867, + "grad_norm": 0.4867682818914083, + "learning_rate": 1.9305356319415897e-05, + "loss": 0.8959, + "num_tokens": 10704443894.0, + "step": 2561 + }, + { + "epoch": 0.30445632798573974, + "grad_norm": 0.6507989191370601, + "learning_rate": 1.9304688947254033e-05, + "loss": 0.8859, + "num_tokens": 10708632892.0, + "step": 2562 + }, + { + "epoch": 0.3045751633986928, + "grad_norm": 0.5217529438155983, + "learning_rate": 1.9304021267541456e-05, + "loss": 0.8673, + "num_tokens": 10712822175.0, + "step": 2563 + }, + { + "epoch": 0.3046939988116459, + "grad_norm": 0.6255553932530105, + "learning_rate": 1.9303353280302898e-05, + "loss": 0.8978, + "num_tokens": 10716979097.0, + "step": 2564 + }, + { + "epoch": 0.3048128342245989, + "grad_norm": 0.5385409941717547, + "learning_rate": 1.9302684985563096e-05, + "loss": 0.9099, + "num_tokens": 10721167181.0, + "step": 2565 + }, + { + "epoch": 0.30493166963755197, + "grad_norm": 0.5872351267838807, + "learning_rate": 1.9302016383346793e-05, + "loss": 0.9039, + "num_tokens": 10725356579.0, + "step": 2566 + }, + { + "epoch": 0.30505050505050507, + "grad_norm": 0.5946121226431673, + "learning_rate": 1.9301347473678757e-05, + "loss": 0.9386, + "num_tokens": 10729516905.0, + "step": 2567 + }, + { + "epoch": 0.3051693404634581, + "grad_norm": 0.58966098668216, + "learning_rate": 1.9300678256583755e-05, + "loss": 0.9098, + "num_tokens": 10733706342.0, + "step": 2568 + }, + { + "epoch": 0.30528817587641116, + "grad_norm": 0.5908607086941166, + "learning_rate": 1.930000873208657e-05, + "loss": 0.9022, + "num_tokens": 10737885953.0, + "step": 2569 + }, + { + "epoch": 0.3054070112893642, + "grad_norm": 0.5716965621346136, + "learning_rate": 1.9299338900212e-05, + "loss": 0.904, + "num_tokens": 10742061546.0, + "step": 2570 + }, + { + "epoch": 0.3055258467023173, + "grad_norm": 0.5374276394461234, + "learning_rate": 1.929866876098485e-05, + "loss": 0.8871, + "num_tokens": 10746251002.0, + "step": 2571 + }, + { + "epoch": 0.30564468211527035, + "grad_norm": 0.6554314983119116, + "learning_rate": 1.9297998314429935e-05, + "loss": 0.9355, + "num_tokens": 10750440319.0, + "step": 2572 + }, + { + "epoch": 0.3057635175282234, + "grad_norm": 0.44222282516952177, + "learning_rate": 1.929732756057209e-05, + "loss": 0.8905, + "num_tokens": 10754603758.0, + "step": 2573 + }, + { + "epoch": 0.3058823529411765, + "grad_norm": 0.6136115755200882, + "learning_rate": 1.929665649943615e-05, + "loss": 0.9063, + "num_tokens": 10758794172.0, + "step": 2574 + }, + { + "epoch": 0.30600118835412954, + "grad_norm": 0.6758297485832171, + "learning_rate": 1.9295985131046966e-05, + "loss": 0.8945, + "num_tokens": 10762958431.0, + "step": 2575 + }, + { + "epoch": 0.3061200237670826, + "grad_norm": 0.5583040493431014, + "learning_rate": 1.9295313455429402e-05, + "loss": 0.9233, + "num_tokens": 10767108939.0, + "step": 2576 + }, + { + "epoch": 0.3062388591800356, + "grad_norm": 0.5476573411180063, + "learning_rate": 1.929464147260834e-05, + "loss": 0.8854, + "num_tokens": 10771299181.0, + "step": 2577 + }, + { + "epoch": 0.3063576945929887, + "grad_norm": 0.7513986922015069, + "learning_rate": 1.9293969182608655e-05, + "loss": 0.9195, + "num_tokens": 10775443170.0, + "step": 2578 + }, + { + "epoch": 0.30647653000594177, + "grad_norm": 0.43198733532156214, + "learning_rate": 1.929329658545525e-05, + "loss": 0.9147, + "num_tokens": 10779632316.0, + "step": 2579 + }, + { + "epoch": 0.3065953654188948, + "grad_norm": 0.6568300227719539, + "learning_rate": 1.9292623681173032e-05, + "loss": 0.8758, + "num_tokens": 10783811019.0, + "step": 2580 + }, + { + "epoch": 0.3067142008318479, + "grad_norm": 0.5914249992983828, + "learning_rate": 1.929195046978692e-05, + "loss": 0.9433, + "num_tokens": 10787989197.0, + "step": 2581 + }, + { + "epoch": 0.30683303624480096, + "grad_norm": 0.5525584184729522, + "learning_rate": 1.9291276951321846e-05, + "loss": 0.9006, + "num_tokens": 10792140217.0, + "step": 2582 + }, + { + "epoch": 0.306951871657754, + "grad_norm": 0.5384409889569234, + "learning_rate": 1.9290603125802756e-05, + "loss": 0.894, + "num_tokens": 10796300709.0, + "step": 2583 + }, + { + "epoch": 0.30707070707070705, + "grad_norm": 0.5514317716261332, + "learning_rate": 1.9289928993254598e-05, + "loss": 0.9097, + "num_tokens": 10800491445.0, + "step": 2584 + }, + { + "epoch": 0.30718954248366015, + "grad_norm": 0.5481453787231526, + "learning_rate": 1.9289254553702344e-05, + "loss": 0.911, + "num_tokens": 10804654716.0, + "step": 2585 + }, + { + "epoch": 0.3073083778966132, + "grad_norm": 0.604177295688281, + "learning_rate": 1.9288579807170963e-05, + "loss": 0.8793, + "num_tokens": 10808844246.0, + "step": 2586 + }, + { + "epoch": 0.30742721330956624, + "grad_norm": 0.5325665854085815, + "learning_rate": 1.928790475368545e-05, + "loss": 0.904, + "num_tokens": 10813033084.0, + "step": 2587 + }, + { + "epoch": 0.30754604872251934, + "grad_norm": 0.6330198555084869, + "learning_rate": 1.92872293932708e-05, + "loss": 0.8951, + "num_tokens": 10817220701.0, + "step": 2588 + }, + { + "epoch": 0.3076648841354724, + "grad_norm": 0.4757867840106233, + "learning_rate": 1.9286553725952028e-05, + "loss": 0.9011, + "num_tokens": 10821409543.0, + "step": 2589 + }, + { + "epoch": 0.3077837195484254, + "grad_norm": 0.5700080131005988, + "learning_rate": 1.9285877751754153e-05, + "loss": 0.9018, + "num_tokens": 10825575129.0, + "step": 2590 + }, + { + "epoch": 0.30790255496137847, + "grad_norm": 0.6301476520765427, + "learning_rate": 1.928520147070221e-05, + "loss": 0.9003, + "num_tokens": 10829732340.0, + "step": 2591 + }, + { + "epoch": 0.30802139037433157, + "grad_norm": 0.5007575503677059, + "learning_rate": 1.928452488282124e-05, + "loss": 0.9099, + "num_tokens": 10833922841.0, + "step": 2592 + }, + { + "epoch": 0.3081402257872846, + "grad_norm": 0.48104374890282625, + "learning_rate": 1.9283847988136312e-05, + "loss": 0.9161, + "num_tokens": 10838111872.0, + "step": 2593 + }, + { + "epoch": 0.30825906120023766, + "grad_norm": 0.6573685390264747, + "learning_rate": 1.9283170786672475e-05, + "loss": 0.9158, + "num_tokens": 10842281815.0, + "step": 2594 + }, + { + "epoch": 0.3083778966131907, + "grad_norm": 0.6339183645803278, + "learning_rate": 1.9282493278454822e-05, + "loss": 0.9228, + "num_tokens": 10846470923.0, + "step": 2595 + }, + { + "epoch": 0.3084967320261438, + "grad_norm": 0.6430715537574867, + "learning_rate": 1.9281815463508436e-05, + "loss": 0.9276, + "num_tokens": 10850658049.0, + "step": 2596 + }, + { + "epoch": 0.30861556743909685, + "grad_norm": 0.5398096797837615, + "learning_rate": 1.928113734185843e-05, + "loss": 0.9014, + "num_tokens": 10854809095.0, + "step": 2597 + }, + { + "epoch": 0.3087344028520499, + "grad_norm": 0.6776818589631132, + "learning_rate": 1.9280458913529904e-05, + "loss": 0.9093, + "num_tokens": 10858971024.0, + "step": 2598 + }, + { + "epoch": 0.308853238265003, + "grad_norm": 0.6030748216728108, + "learning_rate": 1.9279780178547986e-05, + "loss": 0.9067, + "num_tokens": 10863160547.0, + "step": 2599 + }, + { + "epoch": 0.30897207367795604, + "grad_norm": 0.5123867387939542, + "learning_rate": 1.9279101136937813e-05, + "loss": 0.8986, + "num_tokens": 10867349974.0, + "step": 2600 + }, + { + "epoch": 0.3090909090909091, + "grad_norm": 0.5605954208692528, + "learning_rate": 1.9278421788724537e-05, + "loss": 0.9029, + "num_tokens": 10871520507.0, + "step": 2601 + }, + { + "epoch": 0.3092097445038621, + "grad_norm": 0.5050189829407776, + "learning_rate": 1.927774213393331e-05, + "loss": 0.888, + "num_tokens": 10875709500.0, + "step": 2602 + }, + { + "epoch": 0.3093285799168152, + "grad_norm": 0.5890397612654619, + "learning_rate": 1.9277062172589304e-05, + "loss": 0.9218, + "num_tokens": 10879888333.0, + "step": 2603 + }, + { + "epoch": 0.30944741532976827, + "grad_norm": 0.600331366731954, + "learning_rate": 1.9276381904717695e-05, + "loss": 0.8787, + "num_tokens": 10884077899.0, + "step": 2604 + }, + { + "epoch": 0.3095662507427213, + "grad_norm": 0.5784173264956277, + "learning_rate": 1.9275701330343688e-05, + "loss": 0.8996, + "num_tokens": 10888253616.0, + "step": 2605 + }, + { + "epoch": 0.3096850861556744, + "grad_norm": 0.6975660073578589, + "learning_rate": 1.9275020449492476e-05, + "loss": 0.9169, + "num_tokens": 10892423673.0, + "step": 2606 + }, + { + "epoch": 0.30980392156862746, + "grad_norm": 0.5066424688166827, + "learning_rate": 1.927433926218928e-05, + "loss": 0.935, + "num_tokens": 10896611118.0, + "step": 2607 + }, + { + "epoch": 0.3099227569815805, + "grad_norm": 0.5974662846959404, + "learning_rate": 1.9273657768459326e-05, + "loss": 0.8841, + "num_tokens": 10900800147.0, + "step": 2608 + }, + { + "epoch": 0.31004159239453355, + "grad_norm": 0.5560518302324056, + "learning_rate": 1.927297596832785e-05, + "loss": 0.9058, + "num_tokens": 10904986945.0, + "step": 2609 + }, + { + "epoch": 0.31016042780748665, + "grad_norm": 0.559920274335001, + "learning_rate": 1.9272293861820102e-05, + "loss": 0.9042, + "num_tokens": 10909176255.0, + "step": 2610 + }, + { + "epoch": 0.3102792632204397, + "grad_norm": 0.6076298202397794, + "learning_rate": 1.9271611448961344e-05, + "loss": 0.8977, + "num_tokens": 10913365176.0, + "step": 2611 + }, + { + "epoch": 0.31039809863339274, + "grad_norm": 0.5663473091145642, + "learning_rate": 1.9270928729776847e-05, + "loss": 0.8963, + "num_tokens": 10917554246.0, + "step": 2612 + }, + { + "epoch": 0.31051693404634584, + "grad_norm": 0.5739095881161751, + "learning_rate": 1.9270245704291894e-05, + "loss": 0.9037, + "num_tokens": 10921722245.0, + "step": 2613 + }, + { + "epoch": 0.3106357694592989, + "grad_norm": 0.6034434839456033, + "learning_rate": 1.9269562372531777e-05, + "loss": 0.8824, + "num_tokens": 10925911545.0, + "step": 2614 + }, + { + "epoch": 0.3107546048722519, + "grad_norm": 0.5868277665862341, + "learning_rate": 1.926887873452181e-05, + "loss": 0.9147, + "num_tokens": 10930070843.0, + "step": 2615 + }, + { + "epoch": 0.31087344028520497, + "grad_norm": 0.49061209326242, + "learning_rate": 1.9268194790287303e-05, + "loss": 0.8994, + "num_tokens": 10934242224.0, + "step": 2616 + }, + { + "epoch": 0.31099227569815807, + "grad_norm": 0.6035162430938723, + "learning_rate": 1.9267510539853588e-05, + "loss": 0.8992, + "num_tokens": 10938429849.0, + "step": 2617 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.5667964528412964, + "learning_rate": 1.9266825983246004e-05, + "loss": 0.8953, + "num_tokens": 10942588208.0, + "step": 2618 + }, + { + "epoch": 0.31122994652406416, + "grad_norm": 0.557788937482023, + "learning_rate": 1.9266141120489905e-05, + "loss": 0.9393, + "num_tokens": 10946762051.0, + "step": 2619 + }, + { + "epoch": 0.3113487819370172, + "grad_norm": 0.48136189560532616, + "learning_rate": 1.926545595161065e-05, + "loss": 0.8976, + "num_tokens": 10950930773.0, + "step": 2620 + }, + { + "epoch": 0.3114676173499703, + "grad_norm": 0.6074901913288999, + "learning_rate": 1.9264770476633615e-05, + "loss": 0.8908, + "num_tokens": 10955120716.0, + "step": 2621 + }, + { + "epoch": 0.31158645276292335, + "grad_norm": 0.5913173003115562, + "learning_rate": 1.9264084695584185e-05, + "loss": 0.9264, + "num_tokens": 10959282671.0, + "step": 2622 + }, + { + "epoch": 0.3117052881758764, + "grad_norm": 0.5959275095923476, + "learning_rate": 1.9263398608487758e-05, + "loss": 0.8978, + "num_tokens": 10963471596.0, + "step": 2623 + }, + { + "epoch": 0.3118241235888295, + "grad_norm": 0.5296810507128633, + "learning_rate": 1.9262712215369742e-05, + "loss": 0.8983, + "num_tokens": 10967620692.0, + "step": 2624 + }, + { + "epoch": 0.31194295900178254, + "grad_norm": 0.5710909951903036, + "learning_rate": 1.9262025516255558e-05, + "loss": 0.903, + "num_tokens": 10971808967.0, + "step": 2625 + }, + { + "epoch": 0.3120617944147356, + "grad_norm": 0.623476327098478, + "learning_rate": 1.926133851117063e-05, + "loss": 0.9199, + "num_tokens": 10975964803.0, + "step": 2626 + }, + { + "epoch": 0.3121806298276886, + "grad_norm": 0.5416485799562137, + "learning_rate": 1.926065120014041e-05, + "loss": 0.8927, + "num_tokens": 10980154780.0, + "step": 2627 + }, + { + "epoch": 0.3122994652406417, + "grad_norm": 0.557910143769856, + "learning_rate": 1.925996358319034e-05, + "loss": 0.8944, + "num_tokens": 10984345736.0, + "step": 2628 + }, + { + "epoch": 0.31241830065359477, + "grad_norm": 0.6014859578818186, + "learning_rate": 1.925927566034589e-05, + "loss": 0.9056, + "num_tokens": 10988512441.0, + "step": 2629 + }, + { + "epoch": 0.3125371360665478, + "grad_norm": 0.5273007510876588, + "learning_rate": 1.9258587431632543e-05, + "loss": 0.9224, + "num_tokens": 10992676038.0, + "step": 2630 + }, + { + "epoch": 0.3126559714795009, + "grad_norm": 0.5589348346287725, + "learning_rate": 1.925789889707578e-05, + "loss": 0.9153, + "num_tokens": 10996863114.0, + "step": 2631 + }, + { + "epoch": 0.31277480689245396, + "grad_norm": 0.5452527557217964, + "learning_rate": 1.92572100567011e-05, + "loss": 0.9161, + "num_tokens": 11001020964.0, + "step": 2632 + }, + { + "epoch": 0.312893642305407, + "grad_norm": 0.517000196176641, + "learning_rate": 1.925652091053401e-05, + "loss": 0.904, + "num_tokens": 11005210155.0, + "step": 2633 + }, + { + "epoch": 0.31301247771836005, + "grad_norm": 0.6976428078294873, + "learning_rate": 1.9255831458600037e-05, + "loss": 0.9368, + "num_tokens": 11009380403.0, + "step": 2634 + }, + { + "epoch": 0.31313131313131315, + "grad_norm": 0.4906154296392283, + "learning_rate": 1.925514170092471e-05, + "loss": 0.8855, + "num_tokens": 11013570062.0, + "step": 2635 + }, + { + "epoch": 0.3132501485442662, + "grad_norm": 0.6245577748019635, + "learning_rate": 1.9254451637533574e-05, + "loss": 0.8819, + "num_tokens": 11017759479.0, + "step": 2636 + }, + { + "epoch": 0.31336898395721924, + "grad_norm": 0.5489589394067668, + "learning_rate": 1.925376126845219e-05, + "loss": 0.8679, + "num_tokens": 11021947280.0, + "step": 2637 + }, + { + "epoch": 0.31348781937017234, + "grad_norm": 0.5441421765471358, + "learning_rate": 1.9253070593706113e-05, + "loss": 0.9332, + "num_tokens": 11026121711.0, + "step": 2638 + }, + { + "epoch": 0.3136066547831254, + "grad_norm": 0.547871801850694, + "learning_rate": 1.925237961332093e-05, + "loss": 0.8777, + "num_tokens": 11030290760.0, + "step": 2639 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.5489112467103361, + "learning_rate": 1.925168832732223e-05, + "loss": 0.9207, + "num_tokens": 11034465072.0, + "step": 2640 + }, + { + "epoch": 0.31384432560903147, + "grad_norm": 0.5352330368938183, + "learning_rate": 1.925099673573561e-05, + "loss": 0.9166, + "num_tokens": 11038653884.0, + "step": 2641 + }, + { + "epoch": 0.31396316102198457, + "grad_norm": 0.704278202078987, + "learning_rate": 1.9250304838586686e-05, + "loss": 0.8991, + "num_tokens": 11042829560.0, + "step": 2642 + }, + { + "epoch": 0.3140819964349376, + "grad_norm": 0.4774956918698066, + "learning_rate": 1.9249612635901077e-05, + "loss": 0.9202, + "num_tokens": 11047018463.0, + "step": 2643 + }, + { + "epoch": 0.31420083184789066, + "grad_norm": 0.691458775709358, + "learning_rate": 1.9248920127704416e-05, + "loss": 0.8895, + "num_tokens": 11051178069.0, + "step": 2644 + }, + { + "epoch": 0.3143196672608437, + "grad_norm": 0.5489251718660471, + "learning_rate": 1.9248227314022358e-05, + "loss": 0.9268, + "num_tokens": 11055367554.0, + "step": 2645 + }, + { + "epoch": 0.3144385026737968, + "grad_norm": 0.5959001324939877, + "learning_rate": 1.924753419488055e-05, + "loss": 0.9118, + "num_tokens": 11059549800.0, + "step": 2646 + }, + { + "epoch": 0.31455733808674985, + "grad_norm": 0.6468277797744626, + "learning_rate": 1.9246840770304667e-05, + "loss": 0.8662, + "num_tokens": 11063738234.0, + "step": 2647 + }, + { + "epoch": 0.3146761734997029, + "grad_norm": 0.6095348907676008, + "learning_rate": 1.9246147040320385e-05, + "loss": 0.9214, + "num_tokens": 11067900267.0, + "step": 2648 + }, + { + "epoch": 0.314795008912656, + "grad_norm": 0.6295834980547936, + "learning_rate": 1.92454530049534e-05, + "loss": 0.959, + "num_tokens": 11072065541.0, + "step": 2649 + }, + { + "epoch": 0.31491384432560904, + "grad_norm": 0.582559682558764, + "learning_rate": 1.924475866422941e-05, + "loss": 0.9199, + "num_tokens": 11076226319.0, + "step": 2650 + }, + { + "epoch": 0.3150326797385621, + "grad_norm": 0.4819406923917236, + "learning_rate": 1.9244064018174135e-05, + "loss": 0.8543, + "num_tokens": 11080415196.0, + "step": 2651 + }, + { + "epoch": 0.3151515151515151, + "grad_norm": 0.5278956215029339, + "learning_rate": 1.9243369066813293e-05, + "loss": 0.8764, + "num_tokens": 11084603905.0, + "step": 2652 + }, + { + "epoch": 0.3152703505644682, + "grad_norm": 0.6255461531141028, + "learning_rate": 1.9242673810172622e-05, + "loss": 0.8739, + "num_tokens": 11088760876.0, + "step": 2653 + }, + { + "epoch": 0.31538918597742127, + "grad_norm": 0.49227293695993485, + "learning_rate": 1.9241978248277872e-05, + "loss": 0.8843, + "num_tokens": 11092924170.0, + "step": 2654 + }, + { + "epoch": 0.3155080213903743, + "grad_norm": 0.5913352974528178, + "learning_rate": 1.9241282381154796e-05, + "loss": 0.8674, + "num_tokens": 11097113154.0, + "step": 2655 + }, + { + "epoch": 0.3156268568033274, + "grad_norm": 0.4720142916755647, + "learning_rate": 1.924058620882917e-05, + "loss": 0.934, + "num_tokens": 11101271308.0, + "step": 2656 + }, + { + "epoch": 0.31574569221628046, + "grad_norm": 0.5168230993741955, + "learning_rate": 1.923988973132678e-05, + "loss": 0.8957, + "num_tokens": 11105435843.0, + "step": 2657 + }, + { + "epoch": 0.3158645276292335, + "grad_norm": 0.5417176482766919, + "learning_rate": 1.923919294867341e-05, + "loss": 0.9112, + "num_tokens": 11109623160.0, + "step": 2658 + }, + { + "epoch": 0.31598336304218655, + "grad_norm": 0.5478056606813304, + "learning_rate": 1.923849586089487e-05, + "loss": 0.9085, + "num_tokens": 11113813124.0, + "step": 2659 + }, + { + "epoch": 0.31610219845513965, + "grad_norm": 0.6701743231613395, + "learning_rate": 1.9237798468016974e-05, + "loss": 0.8824, + "num_tokens": 11118002005.0, + "step": 2660 + }, + { + "epoch": 0.3162210338680927, + "grad_norm": 0.5111707017707632, + "learning_rate": 1.9237100770065543e-05, + "loss": 0.9004, + "num_tokens": 11122190987.0, + "step": 2661 + }, + { + "epoch": 0.31633986928104574, + "grad_norm": 0.6350848539315525, + "learning_rate": 1.9236402767066423e-05, + "loss": 0.8699, + "num_tokens": 11126355496.0, + "step": 2662 + }, + { + "epoch": 0.31645870469399884, + "grad_norm": 0.5482858030453753, + "learning_rate": 1.923570445904546e-05, + "loss": 0.9371, + "num_tokens": 11130518169.0, + "step": 2663 + }, + { + "epoch": 0.3165775401069519, + "grad_norm": 0.7385904465068607, + "learning_rate": 1.9235005846028517e-05, + "loss": 0.9444, + "num_tokens": 11134691137.0, + "step": 2664 + }, + { + "epoch": 0.3166963755199049, + "grad_norm": 0.5020876216666971, + "learning_rate": 1.9234306928041463e-05, + "loss": 0.9136, + "num_tokens": 11138849502.0, + "step": 2665 + }, + { + "epoch": 0.31681521093285797, + "grad_norm": 0.5574660661380938, + "learning_rate": 1.9233607705110182e-05, + "loss": 0.9201, + "num_tokens": 11143037797.0, + "step": 2666 + }, + { + "epoch": 0.31693404634581107, + "grad_norm": 0.5859770394000703, + "learning_rate": 1.9232908177260572e-05, + "loss": 0.8802, + "num_tokens": 11147227315.0, + "step": 2667 + }, + { + "epoch": 0.3170528817587641, + "grad_norm": 0.5891632525149713, + "learning_rate": 1.9232208344518532e-05, + "loss": 0.9178, + "num_tokens": 11151416711.0, + "step": 2668 + }, + { + "epoch": 0.31717171717171716, + "grad_norm": 0.5301873776238609, + "learning_rate": 1.9231508206909985e-05, + "loss": 0.9062, + "num_tokens": 11155594593.0, + "step": 2669 + }, + { + "epoch": 0.3172905525846702, + "grad_norm": 0.7063334510987552, + "learning_rate": 1.923080776446086e-05, + "loss": 0.9381, + "num_tokens": 11159782877.0, + "step": 2670 + }, + { + "epoch": 0.3174093879976233, + "grad_norm": 0.4720425300715867, + "learning_rate": 1.9230107017197093e-05, + "loss": 0.8557, + "num_tokens": 11163972241.0, + "step": 2671 + }, + { + "epoch": 0.31752822341057635, + "grad_norm": 0.5601148016261708, + "learning_rate": 1.9229405965144637e-05, + "loss": 0.9435, + "num_tokens": 11168162079.0, + "step": 2672 + }, + { + "epoch": 0.3176470588235294, + "grad_norm": 0.4751201321635767, + "learning_rate": 1.9228704608329453e-05, + "loss": 0.9021, + "num_tokens": 11172327711.0, + "step": 2673 + }, + { + "epoch": 0.3177658942364825, + "grad_norm": 0.6566395930374679, + "learning_rate": 1.9228002946777513e-05, + "loss": 0.8998, + "num_tokens": 11176505420.0, + "step": 2674 + }, + { + "epoch": 0.31788472964943554, + "grad_norm": 0.49770458930206, + "learning_rate": 1.9227300980514807e-05, + "loss": 0.8862, + "num_tokens": 11180694237.0, + "step": 2675 + }, + { + "epoch": 0.3180035650623886, + "grad_norm": 0.6458441540943884, + "learning_rate": 1.9226598709567328e-05, + "loss": 0.9362, + "num_tokens": 11184882883.0, + "step": 2676 + }, + { + "epoch": 0.3181224004753416, + "grad_norm": 0.5748446476937586, + "learning_rate": 1.9225896133961087e-05, + "loss": 0.8501, + "num_tokens": 11189051409.0, + "step": 2677 + }, + { + "epoch": 0.3182412358882947, + "grad_norm": 0.6191717932181855, + "learning_rate": 1.9225193253722093e-05, + "loss": 0.9099, + "num_tokens": 11193240311.0, + "step": 2678 + }, + { + "epoch": 0.31836007130124777, + "grad_norm": 0.5464060028429762, + "learning_rate": 1.9224490068876387e-05, + "loss": 0.8632, + "num_tokens": 11197399761.0, + "step": 2679 + }, + { + "epoch": 0.3184789067142008, + "grad_norm": 0.5803302333581158, + "learning_rate": 1.9223786579450007e-05, + "loss": 0.891, + "num_tokens": 11201511044.0, + "step": 2680 + }, + { + "epoch": 0.3185977421271539, + "grad_norm": 0.5561547062212201, + "learning_rate": 1.9223082785469003e-05, + "loss": 0.936, + "num_tokens": 11205700092.0, + "step": 2681 + }, + { + "epoch": 0.31871657754010696, + "grad_norm": 0.6886375860101018, + "learning_rate": 1.922237868695944e-05, + "loss": 0.9052, + "num_tokens": 11209883838.0, + "step": 2682 + }, + { + "epoch": 0.31883541295306, + "grad_norm": 0.5405831380832146, + "learning_rate": 1.9221674283947397e-05, + "loss": 0.9269, + "num_tokens": 11214072459.0, + "step": 2683 + }, + { + "epoch": 0.31895424836601305, + "grad_norm": 0.5688420212197114, + "learning_rate": 1.9220969576458955e-05, + "loss": 0.9286, + "num_tokens": 11218261827.0, + "step": 2684 + }, + { + "epoch": 0.31907308377896615, + "grad_norm": 0.6788348542086905, + "learning_rate": 1.922026456452021e-05, + "loss": 0.8899, + "num_tokens": 11222450651.0, + "step": 2685 + }, + { + "epoch": 0.3191919191919192, + "grad_norm": 0.47203973078362726, + "learning_rate": 1.921955924815728e-05, + "loss": 0.9016, + "num_tokens": 11226616060.0, + "step": 2686 + }, + { + "epoch": 0.31931075460487224, + "grad_norm": 0.6843831568658704, + "learning_rate": 1.921885362739628e-05, + "loss": 0.8891, + "num_tokens": 11230798398.0, + "step": 2687 + }, + { + "epoch": 0.31942959001782534, + "grad_norm": 0.4621209036279371, + "learning_rate": 1.9218147702263338e-05, + "loss": 0.8816, + "num_tokens": 11234982529.0, + "step": 2688 + }, + { + "epoch": 0.3195484254307784, + "grad_norm": 0.6753315174215112, + "learning_rate": 1.9217441472784605e-05, + "loss": 0.8696, + "num_tokens": 11239157946.0, + "step": 2689 + }, + { + "epoch": 0.3196672608437314, + "grad_norm": 0.5668846230184145, + "learning_rate": 1.9216734938986226e-05, + "loss": 0.9154, + "num_tokens": 11243323927.0, + "step": 2690 + }, + { + "epoch": 0.31978609625668447, + "grad_norm": 0.5794696137230839, + "learning_rate": 1.9216028100894374e-05, + "loss": 0.9056, + "num_tokens": 11247512991.0, + "step": 2691 + }, + { + "epoch": 0.31990493166963757, + "grad_norm": 0.6291900107098052, + "learning_rate": 1.921532095853522e-05, + "loss": 0.9127, + "num_tokens": 11251703744.0, + "step": 2692 + }, + { + "epoch": 0.3200237670825906, + "grad_norm": 0.4903048963844836, + "learning_rate": 1.9214613511934953e-05, + "loss": 0.8937, + "num_tokens": 11255893195.0, + "step": 2693 + }, + { + "epoch": 0.32014260249554366, + "grad_norm": 0.655570542155983, + "learning_rate": 1.9213905761119777e-05, + "loss": 0.9209, + "num_tokens": 11260066935.0, + "step": 2694 + }, + { + "epoch": 0.3202614379084967, + "grad_norm": 0.5465882986338835, + "learning_rate": 1.9213197706115897e-05, + "loss": 0.9307, + "num_tokens": 11264240617.0, + "step": 2695 + }, + { + "epoch": 0.3203802733214498, + "grad_norm": 0.5253058338459061, + "learning_rate": 1.9212489346949536e-05, + "loss": 0.901, + "num_tokens": 11268426657.0, + "step": 2696 + }, + { + "epoch": 0.32049910873440285, + "grad_norm": 0.6997821580924634, + "learning_rate": 1.921178068364693e-05, + "loss": 0.8499, + "num_tokens": 11272614421.0, + "step": 2697 + }, + { + "epoch": 0.3206179441473559, + "grad_norm": 0.4925080781475763, + "learning_rate": 1.921107171623432e-05, + "loss": 0.8964, + "num_tokens": 11276779476.0, + "step": 2698 + }, + { + "epoch": 0.320736779560309, + "grad_norm": 0.6234859795782298, + "learning_rate": 1.921036244473796e-05, + "loss": 0.8985, + "num_tokens": 11280968728.0, + "step": 2699 + }, + { + "epoch": 0.32085561497326204, + "grad_norm": 0.5799761036341567, + "learning_rate": 1.9209652869184118e-05, + "loss": 0.883, + "num_tokens": 11285157909.0, + "step": 2700 + }, + { + "epoch": 0.3209744503862151, + "grad_norm": 0.5366545733748836, + "learning_rate": 1.9208942989599075e-05, + "loss": 0.9102, + "num_tokens": 11289302692.0, + "step": 2701 + }, + { + "epoch": 0.3210932857991681, + "grad_norm": 0.5847315800106414, + "learning_rate": 1.920823280600912e-05, + "loss": 0.8757, + "num_tokens": 11293439142.0, + "step": 2702 + }, + { + "epoch": 0.3212121212121212, + "grad_norm": 0.5494932583300616, + "learning_rate": 1.9207522318440548e-05, + "loss": 0.8946, + "num_tokens": 11297625081.0, + "step": 2703 + }, + { + "epoch": 0.32133095662507427, + "grad_norm": 0.5388509509392767, + "learning_rate": 1.9206811526919675e-05, + "loss": 0.894, + "num_tokens": 11301762214.0, + "step": 2704 + }, + { + "epoch": 0.3214497920380273, + "grad_norm": 0.5115843078257534, + "learning_rate": 1.9206100431472822e-05, + "loss": 0.855, + "num_tokens": 11305950153.0, + "step": 2705 + }, + { + "epoch": 0.3215686274509804, + "grad_norm": 0.6354488824054126, + "learning_rate": 1.9205389032126327e-05, + "loss": 0.8792, + "num_tokens": 11310132967.0, + "step": 2706 + }, + { + "epoch": 0.32168746286393346, + "grad_norm": 0.5345837483117888, + "learning_rate": 1.9204677328906532e-05, + "loss": 0.9177, + "num_tokens": 11314322291.0, + "step": 2707 + }, + { + "epoch": 0.3218062982768865, + "grad_norm": 0.6023528887064896, + "learning_rate": 1.9203965321839796e-05, + "loss": 0.938, + "num_tokens": 11318511850.0, + "step": 2708 + }, + { + "epoch": 0.32192513368983955, + "grad_norm": 0.5489727716464149, + "learning_rate": 1.9203253010952483e-05, + "loss": 0.9251, + "num_tokens": 11322701154.0, + "step": 2709 + }, + { + "epoch": 0.32204396910279265, + "grad_norm": 0.6523637208733941, + "learning_rate": 1.9202540396270975e-05, + "loss": 0.9258, + "num_tokens": 11326890343.0, + "step": 2710 + }, + { + "epoch": 0.3221628045157457, + "grad_norm": 0.44016196834190413, + "learning_rate": 1.9201827477821667e-05, + "loss": 0.9258, + "num_tokens": 11331080287.0, + "step": 2711 + }, + { + "epoch": 0.32228163992869874, + "grad_norm": 0.5757695704551878, + "learning_rate": 1.9201114255630955e-05, + "loss": 0.9281, + "num_tokens": 11335265283.0, + "step": 2712 + }, + { + "epoch": 0.32240047534165184, + "grad_norm": 0.5429116064170431, + "learning_rate": 1.920040072972525e-05, + "loss": 0.9207, + "num_tokens": 11339455224.0, + "step": 2713 + }, + { + "epoch": 0.3225193107546049, + "grad_norm": 0.6425300784060443, + "learning_rate": 1.9199686900130987e-05, + "loss": 0.9229, + "num_tokens": 11343620801.0, + "step": 2714 + }, + { + "epoch": 0.3226381461675579, + "grad_norm": 0.5533506985372285, + "learning_rate": 1.9198972766874588e-05, + "loss": 0.8821, + "num_tokens": 11347761387.0, + "step": 2715 + }, + { + "epoch": 0.32275698158051097, + "grad_norm": 0.5018283824950911, + "learning_rate": 1.9198258329982505e-05, + "loss": 0.9246, + "num_tokens": 11351949658.0, + "step": 2716 + }, + { + "epoch": 0.32287581699346407, + "grad_norm": 0.6475624515001049, + "learning_rate": 1.91975435894812e-05, + "loss": 0.9411, + "num_tokens": 11356138336.0, + "step": 2717 + }, + { + "epoch": 0.3229946524064171, + "grad_norm": 0.560988468613617, + "learning_rate": 1.919682854539714e-05, + "loss": 0.9075, + "num_tokens": 11360297510.0, + "step": 2718 + }, + { + "epoch": 0.32311348781937016, + "grad_norm": 0.6036910380674748, + "learning_rate": 1.9196113197756806e-05, + "loss": 0.8723, + "num_tokens": 11364487437.0, + "step": 2719 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 0.5481573628025639, + "learning_rate": 1.9195397546586686e-05, + "loss": 0.9278, + "num_tokens": 11368655136.0, + "step": 2720 + }, + { + "epoch": 0.3233511586452763, + "grad_norm": 0.6106896316199393, + "learning_rate": 1.9194681591913284e-05, + "loss": 0.9002, + "num_tokens": 11372844134.0, + "step": 2721 + }, + { + "epoch": 0.32346999405822935, + "grad_norm": 0.5565612476688667, + "learning_rate": 1.9193965333763118e-05, + "loss": 0.92, + "num_tokens": 11377033094.0, + "step": 2722 + }, + { + "epoch": 0.3235888294711824, + "grad_norm": 0.47318611992230286, + "learning_rate": 1.919324877216271e-05, + "loss": 0.8786, + "num_tokens": 11381222349.0, + "step": 2723 + }, + { + "epoch": 0.3237076648841355, + "grad_norm": 0.6708855894677156, + "learning_rate": 1.91925319071386e-05, + "loss": 0.9058, + "num_tokens": 11385410428.0, + "step": 2724 + }, + { + "epoch": 0.32382650029708854, + "grad_norm": 0.5806235659416852, + "learning_rate": 1.9191814738717335e-05, + "loss": 0.9196, + "num_tokens": 11389600298.0, + "step": 2725 + }, + { + "epoch": 0.3239453357100416, + "grad_norm": 0.5578449511482293, + "learning_rate": 1.919109726692547e-05, + "loss": 0.9799, + "num_tokens": 11393789238.0, + "step": 2726 + }, + { + "epoch": 0.3240641711229946, + "grad_norm": 0.5419634349213053, + "learning_rate": 1.919037949178958e-05, + "loss": 0.9321, + "num_tokens": 11397978316.0, + "step": 2727 + }, + { + "epoch": 0.3241830065359477, + "grad_norm": 0.5436116321685326, + "learning_rate": 1.9189661413336242e-05, + "loss": 0.9298, + "num_tokens": 11402164056.0, + "step": 2728 + }, + { + "epoch": 0.32430184194890077, + "grad_norm": 0.5233958308352855, + "learning_rate": 1.9188943031592055e-05, + "loss": 0.8754, + "num_tokens": 11406353014.0, + "step": 2729 + }, + { + "epoch": 0.3244206773618538, + "grad_norm": 0.6248676461978933, + "learning_rate": 1.9188224346583618e-05, + "loss": 0.9193, + "num_tokens": 11410514088.0, + "step": 2730 + }, + { + "epoch": 0.3245395127748069, + "grad_norm": 0.6363079057377672, + "learning_rate": 1.9187505358337552e-05, + "loss": 0.902, + "num_tokens": 11414687084.0, + "step": 2731 + }, + { + "epoch": 0.32465834818775996, + "grad_norm": 0.591553058723613, + "learning_rate": 1.9186786066880477e-05, + "loss": 0.9162, + "num_tokens": 11418863525.0, + "step": 2732 + }, + { + "epoch": 0.324777183600713, + "grad_norm": 0.5547406665438948, + "learning_rate": 1.9186066472239035e-05, + "loss": 0.9268, + "num_tokens": 11423009836.0, + "step": 2733 + }, + { + "epoch": 0.32489601901366605, + "grad_norm": 0.6163839240953543, + "learning_rate": 1.9185346574439872e-05, + "loss": 0.9192, + "num_tokens": 11427199421.0, + "step": 2734 + }, + { + "epoch": 0.32501485442661915, + "grad_norm": 0.6062095370391157, + "learning_rate": 1.918462637350965e-05, + "loss": 0.8538, + "num_tokens": 11431346613.0, + "step": 2735 + }, + { + "epoch": 0.3251336898395722, + "grad_norm": 0.43862518351508745, + "learning_rate": 1.9183905869475046e-05, + "loss": 0.8834, + "num_tokens": 11435516567.0, + "step": 2736 + }, + { + "epoch": 0.32525252525252524, + "grad_norm": 0.5938395090335608, + "learning_rate": 1.918318506236273e-05, + "loss": 0.8785, + "num_tokens": 11439705091.0, + "step": 2737 + }, + { + "epoch": 0.32537136066547834, + "grad_norm": 0.618937797675182, + "learning_rate": 1.918246395219941e-05, + "loss": 0.9002, + "num_tokens": 11443886521.0, + "step": 2738 + }, + { + "epoch": 0.3254901960784314, + "grad_norm": 0.5889795627735096, + "learning_rate": 1.918174253901178e-05, + "loss": 0.9271, + "num_tokens": 11448075522.0, + "step": 2739 + }, + { + "epoch": 0.3256090314913844, + "grad_norm": 0.5091730317146232, + "learning_rate": 1.9181020822826558e-05, + "loss": 0.9284, + "num_tokens": 11452263752.0, + "step": 2740 + }, + { + "epoch": 0.32572786690433747, + "grad_norm": 0.6056599522835705, + "learning_rate": 1.918029880367048e-05, + "loss": 0.8951, + "num_tokens": 11456453494.0, + "step": 2741 + }, + { + "epoch": 0.32584670231729057, + "grad_norm": 0.5789278064439003, + "learning_rate": 1.9179576481570275e-05, + "loss": 0.9445, + "num_tokens": 11460642711.0, + "step": 2742 + }, + { + "epoch": 0.3259655377302436, + "grad_norm": 0.6271749112154738, + "learning_rate": 1.9178853856552698e-05, + "loss": 0.8906, + "num_tokens": 11464831273.0, + "step": 2743 + }, + { + "epoch": 0.32608437314319666, + "grad_norm": 0.5936467567674809, + "learning_rate": 1.9178130928644508e-05, + "loss": 0.9019, + "num_tokens": 11469009166.0, + "step": 2744 + }, + { + "epoch": 0.32620320855614976, + "grad_norm": 0.5905895152775005, + "learning_rate": 1.9177407697872484e-05, + "loss": 0.8905, + "num_tokens": 11473198170.0, + "step": 2745 + }, + { + "epoch": 0.3263220439691028, + "grad_norm": 0.5365340736844396, + "learning_rate": 1.91766841642634e-05, + "loss": 0.9433, + "num_tokens": 11477332876.0, + "step": 2746 + }, + { + "epoch": 0.32644087938205585, + "grad_norm": 0.6320035464023681, + "learning_rate": 1.917596032784406e-05, + "loss": 0.8813, + "num_tokens": 11481522243.0, + "step": 2747 + }, + { + "epoch": 0.3265597147950089, + "grad_norm": 0.5923803496148293, + "learning_rate": 1.9175236188641262e-05, + "loss": 0.9138, + "num_tokens": 11485712001.0, + "step": 2748 + }, + { + "epoch": 0.326678550207962, + "grad_norm": 0.6640639823590254, + "learning_rate": 1.9174511746681824e-05, + "loss": 0.9161, + "num_tokens": 11489891826.0, + "step": 2749 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 0.5619530697965693, + "learning_rate": 1.9173787001992584e-05, + "loss": 0.8927, + "num_tokens": 11494081080.0, + "step": 2750 + }, + { + "epoch": 0.3269162210338681, + "grad_norm": 0.6075387624275908, + "learning_rate": 1.917306195460037e-05, + "loss": 0.8812, + "num_tokens": 11498269294.0, + "step": 2751 + }, + { + "epoch": 0.3270350564468211, + "grad_norm": 0.5450585994538556, + "learning_rate": 1.9172336604532037e-05, + "loss": 0.9272, + "num_tokens": 11502458960.0, + "step": 2752 + }, + { + "epoch": 0.3271538918597742, + "grad_norm": 0.5896128050229056, + "learning_rate": 1.9171610951814452e-05, + "loss": 0.897, + "num_tokens": 11506621561.0, + "step": 2753 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 0.5375262526132719, + "learning_rate": 1.9170884996474485e-05, + "loss": 0.9053, + "num_tokens": 11510809125.0, + "step": 2754 + }, + { + "epoch": 0.3273915626856803, + "grad_norm": 0.5478216029876362, + "learning_rate": 1.9170158738539018e-05, + "loss": 0.9115, + "num_tokens": 11514976859.0, + "step": 2755 + }, + { + "epoch": 0.3275103980986334, + "grad_norm": 0.5760913433952518, + "learning_rate": 1.916943217803495e-05, + "loss": 0.8786, + "num_tokens": 11519148940.0, + "step": 2756 + }, + { + "epoch": 0.32762923351158646, + "grad_norm": 0.5249453985431666, + "learning_rate": 1.9168705314989183e-05, + "loss": 0.9182, + "num_tokens": 11523293659.0, + "step": 2757 + }, + { + "epoch": 0.3277480689245395, + "grad_norm": 0.5223250073095003, + "learning_rate": 1.9167978149428643e-05, + "loss": 0.916, + "num_tokens": 11527482788.0, + "step": 2758 + }, + { + "epoch": 0.32786690433749255, + "grad_norm": 0.5469979402825311, + "learning_rate": 1.9167250681380254e-05, + "loss": 0.9139, + "num_tokens": 11531668826.0, + "step": 2759 + }, + { + "epoch": 0.32798573975044565, + "grad_norm": 0.5947934782950379, + "learning_rate": 1.9166522910870958e-05, + "loss": 0.8867, + "num_tokens": 11535856989.0, + "step": 2760 + }, + { + "epoch": 0.3281045751633987, + "grad_norm": 0.5425694413294765, + "learning_rate": 1.916579483792771e-05, + "loss": 0.938, + "num_tokens": 11540042740.0, + "step": 2761 + }, + { + "epoch": 0.32822341057635174, + "grad_norm": 0.6684725594173851, + "learning_rate": 1.916506646257747e-05, + "loss": 0.881, + "num_tokens": 11544231720.0, + "step": 2762 + }, + { + "epoch": 0.32834224598930484, + "grad_norm": 0.5824129845926053, + "learning_rate": 1.9164337784847207e-05, + "loss": 0.8988, + "num_tokens": 11548391431.0, + "step": 2763 + }, + { + "epoch": 0.3284610814022579, + "grad_norm": 0.5355255362148549, + "learning_rate": 1.9163608804763915e-05, + "loss": 0.8924, + "num_tokens": 11552550000.0, + "step": 2764 + }, + { + "epoch": 0.3285799168152109, + "grad_norm": 0.5825696610610642, + "learning_rate": 1.9162879522354585e-05, + "loss": 0.9224, + "num_tokens": 11556738596.0, + "step": 2765 + }, + { + "epoch": 0.32869875222816397, + "grad_norm": 0.5603889535885023, + "learning_rate": 1.916214993764623e-05, + "loss": 0.8772, + "num_tokens": 11560864999.0, + "step": 2766 + }, + { + "epoch": 0.32881758764111707, + "grad_norm": 0.6433952784917091, + "learning_rate": 1.9161420050665866e-05, + "loss": 0.9158, + "num_tokens": 11565013272.0, + "step": 2767 + }, + { + "epoch": 0.3289364230540701, + "grad_norm": 0.5152653814372994, + "learning_rate": 1.916068986144052e-05, + "loss": 0.9012, + "num_tokens": 11569202335.0, + "step": 2768 + }, + { + "epoch": 0.32905525846702316, + "grad_norm": 0.5860783725735221, + "learning_rate": 1.915995936999724e-05, + "loss": 0.91, + "num_tokens": 11573379095.0, + "step": 2769 + }, + { + "epoch": 0.32917409387997626, + "grad_norm": 0.5504725452658215, + "learning_rate": 1.9159228576363072e-05, + "loss": 0.8492, + "num_tokens": 11577568135.0, + "step": 2770 + }, + { + "epoch": 0.3292929292929293, + "grad_norm": 0.5556119087894335, + "learning_rate": 1.9158497480565083e-05, + "loss": 0.8852, + "num_tokens": 11581730140.0, + "step": 2771 + }, + { + "epoch": 0.32941176470588235, + "grad_norm": 0.5441900858374995, + "learning_rate": 1.9157766082630353e-05, + "loss": 0.9009, + "num_tokens": 11585920507.0, + "step": 2772 + }, + { + "epoch": 0.3295306001188354, + "grad_norm": 0.5375449297770148, + "learning_rate": 1.9157034382585957e-05, + "loss": 0.9195, + "num_tokens": 11590110771.0, + "step": 2773 + }, + { + "epoch": 0.3296494355317885, + "grad_norm": 0.49203112001666993, + "learning_rate": 1.9156302380459e-05, + "loss": 0.9038, + "num_tokens": 11594263668.0, + "step": 2774 + }, + { + "epoch": 0.32976827094474154, + "grad_norm": 0.5271292420903579, + "learning_rate": 1.9155570076276585e-05, + "loss": 0.8858, + "num_tokens": 11598452045.0, + "step": 2775 + }, + { + "epoch": 0.3298871063576946, + "grad_norm": 0.5360717237907203, + "learning_rate": 1.915483747006584e-05, + "loss": 0.9013, + "num_tokens": 11602611260.0, + "step": 2776 + }, + { + "epoch": 0.3300059417706476, + "grad_norm": 0.5796302358228448, + "learning_rate": 1.9154104561853892e-05, + "loss": 0.9225, + "num_tokens": 11606800610.0, + "step": 2777 + }, + { + "epoch": 0.3301247771836007, + "grad_norm": 0.48379396361640836, + "learning_rate": 1.915337135166788e-05, + "loss": 0.8733, + "num_tokens": 11610982701.0, + "step": 2778 + }, + { + "epoch": 0.33024361259655377, + "grad_norm": 0.6262476300488831, + "learning_rate": 1.915263783953496e-05, + "loss": 0.9027, + "num_tokens": 11615171914.0, + "step": 2779 + }, + { + "epoch": 0.3303624480095068, + "grad_norm": 0.5088336149112113, + "learning_rate": 1.9151904025482298e-05, + "loss": 0.9, + "num_tokens": 11619361225.0, + "step": 2780 + }, + { + "epoch": 0.3304812834224599, + "grad_norm": 0.6225123951503306, + "learning_rate": 1.9151169909537066e-05, + "loss": 0.9307, + "num_tokens": 11623550848.0, + "step": 2781 + }, + { + "epoch": 0.33060011883541296, + "grad_norm": 0.5453252448571351, + "learning_rate": 1.9150435491726452e-05, + "loss": 0.8843, + "num_tokens": 11627739221.0, + "step": 2782 + }, + { + "epoch": 0.330718954248366, + "grad_norm": 0.47655531089422515, + "learning_rate": 1.9149700772077657e-05, + "loss": 0.876, + "num_tokens": 11631928274.0, + "step": 2783 + }, + { + "epoch": 0.33083778966131905, + "grad_norm": 0.6814925658777055, + "learning_rate": 1.9148965750617884e-05, + "loss": 0.8788, + "num_tokens": 11636118175.0, + "step": 2784 + }, + { + "epoch": 0.33095662507427215, + "grad_norm": 0.578014758347293, + "learning_rate": 1.9148230427374363e-05, + "loss": 0.9121, + "num_tokens": 11640307256.0, + "step": 2785 + }, + { + "epoch": 0.3310754604872252, + "grad_norm": 0.4823402927774146, + "learning_rate": 1.914749480237431e-05, + "loss": 0.9019, + "num_tokens": 11644426540.0, + "step": 2786 + }, + { + "epoch": 0.33119429590017824, + "grad_norm": 0.6554675596290386, + "learning_rate": 1.9146758875644984e-05, + "loss": 0.8877, + "num_tokens": 11648616262.0, + "step": 2787 + }, + { + "epoch": 0.33131313131313134, + "grad_norm": 0.546352679269645, + "learning_rate": 1.9146022647213633e-05, + "loss": 0.929, + "num_tokens": 11652780879.0, + "step": 2788 + }, + { + "epoch": 0.3314319667260844, + "grad_norm": 0.515648906300196, + "learning_rate": 1.9145286117107512e-05, + "loss": 0.9185, + "num_tokens": 11656971378.0, + "step": 2789 + }, + { + "epoch": 0.3315508021390374, + "grad_norm": 0.5846820090772696, + "learning_rate": 1.9144549285353918e-05, + "loss": 0.9019, + "num_tokens": 11661161107.0, + "step": 2790 + }, + { + "epoch": 0.33166963755199047, + "grad_norm": 0.5858382216445217, + "learning_rate": 1.914381215198012e-05, + "loss": 0.8907, + "num_tokens": 11665352097.0, + "step": 2791 + }, + { + "epoch": 0.33178847296494357, + "grad_norm": 0.5165190169361191, + "learning_rate": 1.9143074717013422e-05, + "loss": 0.8895, + "num_tokens": 11669491541.0, + "step": 2792 + }, + { + "epoch": 0.3319073083778966, + "grad_norm": 0.49919640768504553, + "learning_rate": 1.914233698048114e-05, + "loss": 0.8963, + "num_tokens": 11673680538.0, + "step": 2793 + }, + { + "epoch": 0.33202614379084966, + "grad_norm": 0.6729747992776159, + "learning_rate": 1.9141598942410586e-05, + "loss": 0.8747, + "num_tokens": 11677866344.0, + "step": 2794 + }, + { + "epoch": 0.33214497920380276, + "grad_norm": 0.5044474191875548, + "learning_rate": 1.9140860602829095e-05, + "loss": 0.887, + "num_tokens": 11682055758.0, + "step": 2795 + }, + { + "epoch": 0.3322638146167558, + "grad_norm": 0.482882499709646, + "learning_rate": 1.9140121961764015e-05, + "loss": 0.9312, + "num_tokens": 11686240018.0, + "step": 2796 + }, + { + "epoch": 0.33238265002970885, + "grad_norm": 0.6199898380928295, + "learning_rate": 1.9139383019242693e-05, + "loss": 0.9073, + "num_tokens": 11690429102.0, + "step": 2797 + }, + { + "epoch": 0.3325014854426619, + "grad_norm": 0.5565700525567479, + "learning_rate": 1.9138643775292497e-05, + "loss": 0.8783, + "num_tokens": 11694616444.0, + "step": 2798 + }, + { + "epoch": 0.332620320855615, + "grad_norm": 0.5341548035333049, + "learning_rate": 1.9137904229940808e-05, + "loss": 0.9067, + "num_tokens": 11698774182.0, + "step": 2799 + }, + { + "epoch": 0.33273915626856804, + "grad_norm": 0.5109299208730725, + "learning_rate": 1.913716438321501e-05, + "loss": 0.8949, + "num_tokens": 11702934527.0, + "step": 2800 + }, + { + "epoch": 0.3328579916815211, + "grad_norm": 0.5323943695488692, + "learning_rate": 1.91364242351425e-05, + "loss": 0.8685, + "num_tokens": 11707122782.0, + "step": 2801 + }, + { + "epoch": 0.3329768270944741, + "grad_norm": 0.6853788845781061, + "learning_rate": 1.913568378575069e-05, + "loss": 0.8887, + "num_tokens": 11711312213.0, + "step": 2802 + }, + { + "epoch": 0.3330956625074272, + "grad_norm": 0.4275429930680734, + "learning_rate": 1.9134943035067008e-05, + "loss": 0.8728, + "num_tokens": 11715502246.0, + "step": 2803 + }, + { + "epoch": 0.33321449792038027, + "grad_norm": 0.5609269302429019, + "learning_rate": 1.9134201983118873e-05, + "loss": 0.9038, + "num_tokens": 11719691656.0, + "step": 2804 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.6609677865743109, + "learning_rate": 1.9133460629933744e-05, + "loss": 0.8816, + "num_tokens": 11723881428.0, + "step": 2805 + }, + { + "epoch": 0.3334521687462864, + "grad_norm": 0.5401362740615945, + "learning_rate": 1.9132718975539063e-05, + "loss": 0.9058, + "num_tokens": 11728071351.0, + "step": 2806 + }, + { + "epoch": 0.33357100415923946, + "grad_norm": 0.479134153368209, + "learning_rate": 1.9131977019962303e-05, + "loss": 0.8994, + "num_tokens": 11732261025.0, + "step": 2807 + }, + { + "epoch": 0.3336898395721925, + "grad_norm": 0.5872188399433658, + "learning_rate": 1.9131234763230933e-05, + "loss": 0.8833, + "num_tokens": 11736439550.0, + "step": 2808 + }, + { + "epoch": 0.33380867498514555, + "grad_norm": 0.5417229362401654, + "learning_rate": 1.9130492205372453e-05, + "loss": 0.8863, + "num_tokens": 11740598422.0, + "step": 2809 + }, + { + "epoch": 0.33392751039809865, + "grad_norm": 0.49813225067716926, + "learning_rate": 1.912974934641436e-05, + "loss": 0.9158, + "num_tokens": 11744786832.0, + "step": 2810 + }, + { + "epoch": 0.3340463458110517, + "grad_norm": 0.6085308291490471, + "learning_rate": 1.9129006186384154e-05, + "loss": 0.9182, + "num_tokens": 11748974947.0, + "step": 2811 + }, + { + "epoch": 0.33416518122400474, + "grad_norm": 0.5614838387566067, + "learning_rate": 1.9128262725309367e-05, + "loss": 0.9315, + "num_tokens": 11753136651.0, + "step": 2812 + }, + { + "epoch": 0.33428401663695784, + "grad_norm": 0.5330550936119112, + "learning_rate": 1.9127518963217528e-05, + "loss": 0.9248, + "num_tokens": 11757275861.0, + "step": 2813 + }, + { + "epoch": 0.3344028520499109, + "grad_norm": 0.5360911481887843, + "learning_rate": 1.9126774900136184e-05, + "loss": 0.8841, + "num_tokens": 11761448207.0, + "step": 2814 + }, + { + "epoch": 0.3345216874628639, + "grad_norm": 0.5494558747654421, + "learning_rate": 1.912603053609289e-05, + "loss": 0.8701, + "num_tokens": 11765628738.0, + "step": 2815 + }, + { + "epoch": 0.33464052287581697, + "grad_norm": 0.6339741507925861, + "learning_rate": 1.912528587111521e-05, + "loss": 0.9287, + "num_tokens": 11769819024.0, + "step": 2816 + }, + { + "epoch": 0.33475935828877007, + "grad_norm": 0.4895023791810423, + "learning_rate": 1.9124540905230717e-05, + "loss": 0.8825, + "num_tokens": 11773986640.0, + "step": 2817 + }, + { + "epoch": 0.3348781937017231, + "grad_norm": 0.6414351132751301, + "learning_rate": 1.912379563846701e-05, + "loss": 0.8973, + "num_tokens": 11778147207.0, + "step": 2818 + }, + { + "epoch": 0.33499702911467616, + "grad_norm": 0.5269364581730587, + "learning_rate": 1.912305007085168e-05, + "loss": 0.8931, + "num_tokens": 11782333992.0, + "step": 2819 + }, + { + "epoch": 0.33511586452762926, + "grad_norm": 0.6044090743872824, + "learning_rate": 1.9122304202412342e-05, + "loss": 0.9275, + "num_tokens": 11786505532.0, + "step": 2820 + }, + { + "epoch": 0.3352346999405823, + "grad_norm": 0.5496971772021182, + "learning_rate": 1.9121558033176615e-05, + "loss": 0.9294, + "num_tokens": 11790694589.0, + "step": 2821 + }, + { + "epoch": 0.33535353535353535, + "grad_norm": 0.6399615662190914, + "learning_rate": 1.9120811563172137e-05, + "loss": 0.8835, + "num_tokens": 11794882279.0, + "step": 2822 + }, + { + "epoch": 0.3354723707664884, + "grad_norm": 0.5239781366012632, + "learning_rate": 1.912006479242655e-05, + "loss": 0.8968, + "num_tokens": 11799071106.0, + "step": 2823 + }, + { + "epoch": 0.3355912061794415, + "grad_norm": 0.5617895623690308, + "learning_rate": 1.911931772096751e-05, + "loss": 0.8958, + "num_tokens": 11803258770.0, + "step": 2824 + }, + { + "epoch": 0.33571004159239454, + "grad_norm": 0.5728058568476279, + "learning_rate": 1.911857034882268e-05, + "loss": 0.895, + "num_tokens": 11807446843.0, + "step": 2825 + }, + { + "epoch": 0.3358288770053476, + "grad_norm": 0.5066996546532299, + "learning_rate": 1.911782267601974e-05, + "loss": 0.9222, + "num_tokens": 11811610004.0, + "step": 2826 + }, + { + "epoch": 0.3359477124183007, + "grad_norm": 0.5988399907883928, + "learning_rate": 1.911707470258638e-05, + "loss": 0.8938, + "num_tokens": 11815755890.0, + "step": 2827 + }, + { + "epoch": 0.3360665478312537, + "grad_norm": 0.540910398573185, + "learning_rate": 1.9116326428550305e-05, + "loss": 0.9026, + "num_tokens": 11819944456.0, + "step": 2828 + }, + { + "epoch": 0.33618538324420677, + "grad_norm": 0.611189042613279, + "learning_rate": 1.9115577853939214e-05, + "loss": 0.8728, + "num_tokens": 11824131729.0, + "step": 2829 + }, + { + "epoch": 0.3363042186571598, + "grad_norm": 0.5032747217459793, + "learning_rate": 1.9114828978780837e-05, + "loss": 0.903, + "num_tokens": 11828319954.0, + "step": 2830 + }, + { + "epoch": 0.3364230540701129, + "grad_norm": 0.6334048704052132, + "learning_rate": 1.9114079803102908e-05, + "loss": 0.9079, + "num_tokens": 11832490532.0, + "step": 2831 + }, + { + "epoch": 0.33654188948306596, + "grad_norm": 0.46785215297159277, + "learning_rate": 1.9113330326933163e-05, + "loss": 0.8844, + "num_tokens": 11836680233.0, + "step": 2832 + }, + { + "epoch": 0.336660724896019, + "grad_norm": 0.5042841021227392, + "learning_rate": 1.911258055029937e-05, + "loss": 0.9038, + "num_tokens": 11840865697.0, + "step": 2833 + }, + { + "epoch": 0.33677956030897205, + "grad_norm": 0.5985062919431832, + "learning_rate": 1.911183047322929e-05, + "loss": 0.8766, + "num_tokens": 11845030590.0, + "step": 2834 + }, + { + "epoch": 0.33689839572192515, + "grad_norm": 0.5724087282488517, + "learning_rate": 1.91110800957507e-05, + "loss": 0.9133, + "num_tokens": 11849220112.0, + "step": 2835 + }, + { + "epoch": 0.3370172311348782, + "grad_norm": 0.575143980549273, + "learning_rate": 1.911032941789139e-05, + "loss": 0.8867, + "num_tokens": 11853409330.0, + "step": 2836 + }, + { + "epoch": 0.33713606654783124, + "grad_norm": 0.5633263083784722, + "learning_rate": 1.910957843967916e-05, + "loss": 0.8913, + "num_tokens": 11857599433.0, + "step": 2837 + }, + { + "epoch": 0.33725490196078434, + "grad_norm": 0.6093184036327441, + "learning_rate": 1.9108827161141824e-05, + "loss": 0.9228, + "num_tokens": 11861786318.0, + "step": 2838 + }, + { + "epoch": 0.3373737373737374, + "grad_norm": 0.492736968285072, + "learning_rate": 1.9108075582307195e-05, + "loss": 0.9006, + "num_tokens": 11865974618.0, + "step": 2839 + }, + { + "epoch": 0.3374925727866904, + "grad_norm": 0.5845666265125936, + "learning_rate": 1.910732370320312e-05, + "loss": 0.9335, + "num_tokens": 11870141405.0, + "step": 2840 + }, + { + "epoch": 0.33761140819964347, + "grad_norm": 0.6120314297349795, + "learning_rate": 1.9106571523857428e-05, + "loss": 0.8685, + "num_tokens": 11874330587.0, + "step": 2841 + }, + { + "epoch": 0.33773024361259657, + "grad_norm": 0.4681266812930482, + "learning_rate": 1.910581904429799e-05, + "loss": 0.9136, + "num_tokens": 11878519232.0, + "step": 2842 + }, + { + "epoch": 0.3378490790255496, + "grad_norm": 0.5787780910826822, + "learning_rate": 1.9105066264552667e-05, + "loss": 0.8787, + "num_tokens": 11882708564.0, + "step": 2843 + }, + { + "epoch": 0.33796791443850266, + "grad_norm": 0.5861065298378285, + "learning_rate": 1.9104313184649332e-05, + "loss": 0.8752, + "num_tokens": 11886897726.0, + "step": 2844 + }, + { + "epoch": 0.33808674985145576, + "grad_norm": 0.5761915149241674, + "learning_rate": 1.9103559804615884e-05, + "loss": 0.8726, + "num_tokens": 11891086491.0, + "step": 2845 + }, + { + "epoch": 0.3382055852644088, + "grad_norm": 0.544640024818561, + "learning_rate": 1.9102806124480212e-05, + "loss": 0.8982, + "num_tokens": 11895275413.0, + "step": 2846 + }, + { + "epoch": 0.33832442067736185, + "grad_norm": 0.5628041385974469, + "learning_rate": 1.910205214427024e-05, + "loss": 0.8696, + "num_tokens": 11899465335.0, + "step": 2847 + }, + { + "epoch": 0.3384432560903149, + "grad_norm": 0.5737708379967971, + "learning_rate": 1.9101297864013876e-05, + "loss": 0.9025, + "num_tokens": 11903647670.0, + "step": 2848 + }, + { + "epoch": 0.338562091503268, + "grad_norm": 0.5012229929273246, + "learning_rate": 1.9100543283739065e-05, + "loss": 0.891, + "num_tokens": 11907837256.0, + "step": 2849 + }, + { + "epoch": 0.33868092691622104, + "grad_norm": 0.5413946855486904, + "learning_rate": 1.909978840347375e-05, + "loss": 0.9133, + "num_tokens": 11912026439.0, + "step": 2850 + }, + { + "epoch": 0.3387997623291741, + "grad_norm": 0.4767218680970663, + "learning_rate": 1.909903322324588e-05, + "loss": 0.8717, + "num_tokens": 11916214552.0, + "step": 2851 + }, + { + "epoch": 0.3389185977421272, + "grad_norm": 0.5067071901752151, + "learning_rate": 1.9098277743083424e-05, + "loss": 0.8896, + "num_tokens": 11920384144.0, + "step": 2852 + }, + { + "epoch": 0.3390374331550802, + "grad_norm": 0.6212608597128088, + "learning_rate": 1.9097521963014366e-05, + "loss": 0.908, + "num_tokens": 11924573054.0, + "step": 2853 + }, + { + "epoch": 0.33915626856803327, + "grad_norm": 0.5447136117935439, + "learning_rate": 1.9096765883066692e-05, + "loss": 0.9107, + "num_tokens": 11928761422.0, + "step": 2854 + }, + { + "epoch": 0.3392751039809863, + "grad_norm": 0.5642324426238804, + "learning_rate": 1.90960095032684e-05, + "loss": 0.9121, + "num_tokens": 11932951094.0, + "step": 2855 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 0.5673050957854183, + "learning_rate": 1.9095252823647505e-05, + "loss": 0.883, + "num_tokens": 11937141336.0, + "step": 2856 + }, + { + "epoch": 0.33951277480689246, + "grad_norm": 0.5569514037926959, + "learning_rate": 1.9094495844232024e-05, + "loss": 0.9182, + "num_tokens": 11941299646.0, + "step": 2857 + }, + { + "epoch": 0.3396316102198455, + "grad_norm": 0.5563942213217985, + "learning_rate": 1.9093738565049994e-05, + "loss": 0.9077, + "num_tokens": 11945488477.0, + "step": 2858 + }, + { + "epoch": 0.33975044563279855, + "grad_norm": 0.5938403741363298, + "learning_rate": 1.9092980986129462e-05, + "loss": 0.877, + "num_tokens": 11949641211.0, + "step": 2859 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 0.5587170573087668, + "learning_rate": 1.909222310749848e-05, + "loss": 0.8824, + "num_tokens": 11953829882.0, + "step": 2860 + }, + { + "epoch": 0.3399881164587047, + "grad_norm": 0.5867144871176357, + "learning_rate": 1.9091464929185114e-05, + "loss": 0.859, + "num_tokens": 11958020471.0, + "step": 2861 + }, + { + "epoch": 0.34010695187165774, + "grad_norm": 0.5686551107172397, + "learning_rate": 1.9090706451217446e-05, + "loss": 0.9223, + "num_tokens": 11962210167.0, + "step": 2862 + }, + { + "epoch": 0.34022578728461084, + "grad_norm": 0.5460197435775356, + "learning_rate": 1.9089947673623565e-05, + "loss": 0.8935, + "num_tokens": 11966367417.0, + "step": 2863 + }, + { + "epoch": 0.3403446226975639, + "grad_norm": 0.59353274812637, + "learning_rate": 1.9089188596431566e-05, + "loss": 0.8927, + "num_tokens": 11970530479.0, + "step": 2864 + }, + { + "epoch": 0.3404634581105169, + "grad_norm": 0.5787687552128356, + "learning_rate": 1.908842921966956e-05, + "loss": 0.8803, + "num_tokens": 11974719978.0, + "step": 2865 + }, + { + "epoch": 0.34058229352346997, + "grad_norm": 0.5161398498405443, + "learning_rate": 1.9087669543365678e-05, + "loss": 0.9025, + "num_tokens": 11978904943.0, + "step": 2866 + }, + { + "epoch": 0.34070112893642307, + "grad_norm": 0.5302202797744929, + "learning_rate": 1.9086909567548043e-05, + "loss": 0.8993, + "num_tokens": 11983095357.0, + "step": 2867 + }, + { + "epoch": 0.3408199643493761, + "grad_norm": 0.623335016116919, + "learning_rate": 1.9086149292244806e-05, + "loss": 0.904, + "num_tokens": 11987282618.0, + "step": 2868 + }, + { + "epoch": 0.34093879976232916, + "grad_norm": 0.40528406781466486, + "learning_rate": 1.9085388717484122e-05, + "loss": 0.8626, + "num_tokens": 11991444743.0, + "step": 2869 + }, + { + "epoch": 0.34105763517528226, + "grad_norm": 0.5574456258370997, + "learning_rate": 1.9084627843294158e-05, + "loss": 0.9179, + "num_tokens": 11995631956.0, + "step": 2870 + }, + { + "epoch": 0.3411764705882353, + "grad_norm": 0.5388241357035597, + "learning_rate": 1.9083866669703085e-05, + "loss": 0.8681, + "num_tokens": 11999820849.0, + "step": 2871 + }, + { + "epoch": 0.34129530600118835, + "grad_norm": 0.6054111842649735, + "learning_rate": 1.90831051967391e-05, + "loss": 0.891, + "num_tokens": 12003994246.0, + "step": 2872 + }, + { + "epoch": 0.3414141414141414, + "grad_norm": 0.5443576765705535, + "learning_rate": 1.90823434244304e-05, + "loss": 0.8854, + "num_tokens": 12008165765.0, + "step": 2873 + }, + { + "epoch": 0.3415329768270945, + "grad_norm": 0.5769088895011106, + "learning_rate": 1.9081581352805196e-05, + "loss": 0.887, + "num_tokens": 12012354433.0, + "step": 2874 + }, + { + "epoch": 0.34165181224004754, + "grad_norm": 0.49432756839327724, + "learning_rate": 1.908081898189171e-05, + "loss": 0.8839, + "num_tokens": 12016501608.0, + "step": 2875 + }, + { + "epoch": 0.3417706476530006, + "grad_norm": 0.513273640631394, + "learning_rate": 1.9080056311718174e-05, + "loss": 0.8687, + "num_tokens": 12020663575.0, + "step": 2876 + }, + { + "epoch": 0.3418894830659537, + "grad_norm": 0.6427757061600197, + "learning_rate": 1.9079293342312834e-05, + "loss": 0.8997, + "num_tokens": 12024853886.0, + "step": 2877 + }, + { + "epoch": 0.3420083184789067, + "grad_norm": 0.4498206506540565, + "learning_rate": 1.9078530073703948e-05, + "loss": 0.8928, + "num_tokens": 12029010803.0, + "step": 2878 + }, + { + "epoch": 0.34212715389185977, + "grad_norm": 0.7107322815665686, + "learning_rate": 1.9077766505919775e-05, + "loss": 0.9403, + "num_tokens": 12033200857.0, + "step": 2879 + }, + { + "epoch": 0.3422459893048128, + "grad_norm": 0.5371578330132492, + "learning_rate": 1.9077002638988597e-05, + "loss": 0.9012, + "num_tokens": 12037383049.0, + "step": 2880 + }, + { + "epoch": 0.3423648247177659, + "grad_norm": 0.47984441542782125, + "learning_rate": 1.9076238472938704e-05, + "loss": 0.9132, + "num_tokens": 12041564368.0, + "step": 2881 + }, + { + "epoch": 0.34248366013071896, + "grad_norm": 0.5688382429736183, + "learning_rate": 1.9075474007798395e-05, + "loss": 0.961, + "num_tokens": 12045747811.0, + "step": 2882 + }, + { + "epoch": 0.342602495543672, + "grad_norm": 0.6270212426562833, + "learning_rate": 1.9074709243595976e-05, + "loss": 0.8941, + "num_tokens": 12049929541.0, + "step": 2883 + }, + { + "epoch": 0.34272133095662505, + "grad_norm": 0.5048556429971738, + "learning_rate": 1.9073944180359777e-05, + "loss": 0.9035, + "num_tokens": 12054119557.0, + "step": 2884 + }, + { + "epoch": 0.34284016636957815, + "grad_norm": 0.6126375434772711, + "learning_rate": 1.9073178818118124e-05, + "loss": 0.8964, + "num_tokens": 12058310386.0, + "step": 2885 + }, + { + "epoch": 0.3429590017825312, + "grad_norm": 0.5222774482144568, + "learning_rate": 1.907241315689936e-05, + "loss": 0.9284, + "num_tokens": 12062498891.0, + "step": 2886 + }, + { + "epoch": 0.34307783719548424, + "grad_norm": 0.5532069131134206, + "learning_rate": 1.907164719673185e-05, + "loss": 0.9162, + "num_tokens": 12066687422.0, + "step": 2887 + }, + { + "epoch": 0.34319667260843734, + "grad_norm": 0.5314521962826956, + "learning_rate": 1.9070880937643947e-05, + "loss": 0.9064, + "num_tokens": 12070877092.0, + "step": 2888 + }, + { + "epoch": 0.3433155080213904, + "grad_norm": 0.5210295698720743, + "learning_rate": 1.9070114379664037e-05, + "loss": 0.8784, + "num_tokens": 12075064750.0, + "step": 2889 + }, + { + "epoch": 0.3434343434343434, + "grad_norm": 0.5222474048727904, + "learning_rate": 1.9069347522820508e-05, + "loss": 0.8949, + "num_tokens": 12079238418.0, + "step": 2890 + }, + { + "epoch": 0.34355317884729647, + "grad_norm": 0.5530625779093873, + "learning_rate": 1.9068580367141756e-05, + "loss": 0.8748, + "num_tokens": 12083426829.0, + "step": 2891 + }, + { + "epoch": 0.34367201426024957, + "grad_norm": 0.5117328586355185, + "learning_rate": 1.9067812912656188e-05, + "loss": 0.8804, + "num_tokens": 12087602030.0, + "step": 2892 + }, + { + "epoch": 0.3437908496732026, + "grad_norm": 0.6330102342215408, + "learning_rate": 1.906704515939224e-05, + "loss": 0.8951, + "num_tokens": 12091790025.0, + "step": 2893 + }, + { + "epoch": 0.34390968508615566, + "grad_norm": 0.5088239003687239, + "learning_rate": 1.9066277107378327e-05, + "loss": 0.9207, + "num_tokens": 12095979456.0, + "step": 2894 + }, + { + "epoch": 0.34402852049910876, + "grad_norm": 0.5666629523316374, + "learning_rate": 1.9065508756642898e-05, + "loss": 0.9065, + "num_tokens": 12100168449.0, + "step": 2895 + }, + { + "epoch": 0.3441473559120618, + "grad_norm": 0.5668033688681142, + "learning_rate": 1.9064740107214415e-05, + "loss": 0.8906, + "num_tokens": 12104355298.0, + "step": 2896 + }, + { + "epoch": 0.34426619132501485, + "grad_norm": 0.4885766274996967, + "learning_rate": 1.9063971159121336e-05, + "loss": 0.9112, + "num_tokens": 12108542448.0, + "step": 2897 + }, + { + "epoch": 0.3443850267379679, + "grad_norm": 0.601828146768585, + "learning_rate": 1.906320191239214e-05, + "loss": 0.8705, + "num_tokens": 12112732842.0, + "step": 2898 + }, + { + "epoch": 0.344503862150921, + "grad_norm": 0.5511758845038527, + "learning_rate": 1.9062432367055314e-05, + "loss": 0.9182, + "num_tokens": 12116914829.0, + "step": 2899 + }, + { + "epoch": 0.34462269756387404, + "grad_norm": 0.5397276968876555, + "learning_rate": 1.906166252313936e-05, + "loss": 0.8771, + "num_tokens": 12121086867.0, + "step": 2900 + }, + { + "epoch": 0.3447415329768271, + "grad_norm": 0.6567118623028765, + "learning_rate": 1.9060892380672785e-05, + "loss": 0.8949, + "num_tokens": 12125267719.0, + "step": 2901 + }, + { + "epoch": 0.3448603683897802, + "grad_norm": 0.5048815549608442, + "learning_rate": 1.906012193968411e-05, + "loss": 0.8474, + "num_tokens": 12129409594.0, + "step": 2902 + }, + { + "epoch": 0.3449792038027332, + "grad_norm": 0.5157698245643271, + "learning_rate": 1.9059351200201867e-05, + "loss": 0.8399, + "num_tokens": 12133572434.0, + "step": 2903 + }, + { + "epoch": 0.34509803921568627, + "grad_norm": 0.583166873530862, + "learning_rate": 1.90585801622546e-05, + "loss": 0.9422, + "num_tokens": 12137764040.0, + "step": 2904 + }, + { + "epoch": 0.3452168746286393, + "grad_norm": 0.5620362125438192, + "learning_rate": 1.9057808825870863e-05, + "loss": 0.8891, + "num_tokens": 12141917538.0, + "step": 2905 + }, + { + "epoch": 0.3453357100415924, + "grad_norm": 0.5350935142420277, + "learning_rate": 1.9057037191079224e-05, + "loss": 0.8783, + "num_tokens": 12146106496.0, + "step": 2906 + }, + { + "epoch": 0.34545454545454546, + "grad_norm": 0.5597342967081965, + "learning_rate": 1.905626525790825e-05, + "loss": 0.933, + "num_tokens": 12150296128.0, + "step": 2907 + }, + { + "epoch": 0.3455733808674985, + "grad_norm": 0.5602240226576578, + "learning_rate": 1.905549302638654e-05, + "loss": 0.9004, + "num_tokens": 12154455643.0, + "step": 2908 + }, + { + "epoch": 0.34569221628045155, + "grad_norm": 0.6248659722993103, + "learning_rate": 1.9054720496542688e-05, + "loss": 0.8752, + "num_tokens": 12158618816.0, + "step": 2909 + }, + { + "epoch": 0.34581105169340465, + "grad_norm": 0.5055065354841157, + "learning_rate": 1.9053947668405295e-05, + "loss": 0.8743, + "num_tokens": 12162806760.0, + "step": 2910 + }, + { + "epoch": 0.3459298871063577, + "grad_norm": 0.6066442646457143, + "learning_rate": 1.9053174542002994e-05, + "loss": 0.8939, + "num_tokens": 12166997803.0, + "step": 2911 + }, + { + "epoch": 0.34604872251931074, + "grad_norm": 0.5457123079096118, + "learning_rate": 1.9052401117364414e-05, + "loss": 0.9164, + "num_tokens": 12171185318.0, + "step": 2912 + }, + { + "epoch": 0.34616755793226384, + "grad_norm": 0.5866325970707615, + "learning_rate": 1.905162739451819e-05, + "loss": 0.9276, + "num_tokens": 12175363121.0, + "step": 2913 + }, + { + "epoch": 0.3462863933452169, + "grad_norm": 0.5702223232680814, + "learning_rate": 1.905085337349298e-05, + "loss": 0.8589, + "num_tokens": 12179551134.0, + "step": 2914 + }, + { + "epoch": 0.3464052287581699, + "grad_norm": 0.5858236840275098, + "learning_rate": 1.9050079054317453e-05, + "loss": 0.9329, + "num_tokens": 12183739872.0, + "step": 2915 + }, + { + "epoch": 0.34652406417112297, + "grad_norm": 0.641890812114735, + "learning_rate": 1.904930443702028e-05, + "loss": 0.9232, + "num_tokens": 12187930278.0, + "step": 2916 + }, + { + "epoch": 0.34664289958407607, + "grad_norm": 0.4767965690837452, + "learning_rate": 1.9048529521630145e-05, + "loss": 0.9032, + "num_tokens": 12192118982.0, + "step": 2917 + }, + { + "epoch": 0.3467617349970291, + "grad_norm": 0.5245258523692667, + "learning_rate": 1.904775430817575e-05, + "loss": 0.857, + "num_tokens": 12196308405.0, + "step": 2918 + }, + { + "epoch": 0.34688057040998216, + "grad_norm": 0.6066622687522103, + "learning_rate": 1.9046978796685806e-05, + "loss": 0.8926, + "num_tokens": 12200497447.0, + "step": 2919 + }, + { + "epoch": 0.34699940582293526, + "grad_norm": 0.5182538779033079, + "learning_rate": 1.904620298718903e-05, + "loss": 0.9243, + "num_tokens": 12204686677.0, + "step": 2920 + }, + { + "epoch": 0.3471182412358883, + "grad_norm": 0.40811162485321256, + "learning_rate": 1.9045426879714145e-05, + "loss": 0.8719, + "num_tokens": 12208876207.0, + "step": 2921 + }, + { + "epoch": 0.34723707664884135, + "grad_norm": 0.6780683382475163, + "learning_rate": 1.9044650474289906e-05, + "loss": 0.8463, + "num_tokens": 12213045124.0, + "step": 2922 + }, + { + "epoch": 0.3473559120617944, + "grad_norm": 0.4899292207788672, + "learning_rate": 1.904387377094506e-05, + "loss": 0.8935, + "num_tokens": 12217233192.0, + "step": 2923 + }, + { + "epoch": 0.3474747474747475, + "grad_norm": 0.6006659643963821, + "learning_rate": 1.9043096769708373e-05, + "loss": 0.8673, + "num_tokens": 12221422164.0, + "step": 2924 + }, + { + "epoch": 0.34759358288770054, + "grad_norm": 0.5705367113644165, + "learning_rate": 1.9042319470608616e-05, + "loss": 0.865, + "num_tokens": 12225611562.0, + "step": 2925 + }, + { + "epoch": 0.3477124183006536, + "grad_norm": 0.5973860731982507, + "learning_rate": 1.9041541873674576e-05, + "loss": 0.8943, + "num_tokens": 12229801912.0, + "step": 2926 + }, + { + "epoch": 0.3478312537136067, + "grad_norm": 0.4684437472423491, + "learning_rate": 1.9040763978935054e-05, + "loss": 0.8964, + "num_tokens": 12233992263.0, + "step": 2927 + }, + { + "epoch": 0.3479500891265597, + "grad_norm": 0.48802743211287314, + "learning_rate": 1.9039985786418856e-05, + "loss": 0.9003, + "num_tokens": 12238160054.0, + "step": 2928 + }, + { + "epoch": 0.34806892453951277, + "grad_norm": 0.5795503848692832, + "learning_rate": 1.90392072961548e-05, + "loss": 0.8834, + "num_tokens": 12242341683.0, + "step": 2929 + }, + { + "epoch": 0.3481877599524658, + "grad_norm": 0.5560143541475273, + "learning_rate": 1.903842850817171e-05, + "loss": 0.9167, + "num_tokens": 12246492991.0, + "step": 2930 + }, + { + "epoch": 0.3483065953654189, + "grad_norm": 0.553146565025721, + "learning_rate": 1.903764942249844e-05, + "loss": 0.8774, + "num_tokens": 12250660405.0, + "step": 2931 + }, + { + "epoch": 0.34842543077837196, + "grad_norm": 0.5157273442323466, + "learning_rate": 1.903687003916384e-05, + "loss": 0.8878, + "num_tokens": 12254829855.0, + "step": 2932 + }, + { + "epoch": 0.348544266191325, + "grad_norm": 0.5487681862746435, + "learning_rate": 1.903609035819676e-05, + "loss": 0.8746, + "num_tokens": 12259003760.0, + "step": 2933 + }, + { + "epoch": 0.34866310160427805, + "grad_norm": 0.5247208202455296, + "learning_rate": 1.903531037962609e-05, + "loss": 0.8828, + "num_tokens": 12263192917.0, + "step": 2934 + }, + { + "epoch": 0.34878193701723115, + "grad_norm": 0.5725880203108886, + "learning_rate": 1.9034530103480708e-05, + "loss": 0.903, + "num_tokens": 12267377615.0, + "step": 2935 + }, + { + "epoch": 0.3489007724301842, + "grad_norm": 0.5866444520294762, + "learning_rate": 1.9033749529789508e-05, + "loss": 0.8924, + "num_tokens": 12271521591.0, + "step": 2936 + }, + { + "epoch": 0.34901960784313724, + "grad_norm": 0.5367753093545878, + "learning_rate": 1.9032968658581403e-05, + "loss": 0.8796, + "num_tokens": 12275709247.0, + "step": 2937 + }, + { + "epoch": 0.34913844325609034, + "grad_norm": 0.5388130941155607, + "learning_rate": 1.9032187489885305e-05, + "loss": 0.8704, + "num_tokens": 12279895458.0, + "step": 2938 + }, + { + "epoch": 0.3492572786690434, + "grad_norm": 0.5081314235196361, + "learning_rate": 1.903140602373015e-05, + "loss": 0.9224, + "num_tokens": 12284084910.0, + "step": 2939 + }, + { + "epoch": 0.3493761140819964, + "grad_norm": 0.5118185531813353, + "learning_rate": 1.903062426014487e-05, + "loss": 0.8643, + "num_tokens": 12288269073.0, + "step": 2940 + }, + { + "epoch": 0.34949494949494947, + "grad_norm": 0.6129647498759808, + "learning_rate": 1.902984219915843e-05, + "loss": 0.8878, + "num_tokens": 12292455843.0, + "step": 2941 + }, + { + "epoch": 0.34961378490790257, + "grad_norm": 0.562840413828353, + "learning_rate": 1.9029059840799776e-05, + "loss": 0.9114, + "num_tokens": 12296643738.0, + "step": 2942 + }, + { + "epoch": 0.3497326203208556, + "grad_norm": 0.5113802061183135, + "learning_rate": 1.902827718509789e-05, + "loss": 0.8857, + "num_tokens": 12300799652.0, + "step": 2943 + }, + { + "epoch": 0.34985145573380866, + "grad_norm": 0.5291347810865862, + "learning_rate": 1.902749423208176e-05, + "loss": 0.8699, + "num_tokens": 12304980142.0, + "step": 2944 + }, + { + "epoch": 0.34997029114676176, + "grad_norm": 0.48266253561082806, + "learning_rate": 1.9026710981780374e-05, + "loss": 0.912, + "num_tokens": 12309142909.0, + "step": 2945 + }, + { + "epoch": 0.3500891265597148, + "grad_norm": 0.692851172453281, + "learning_rate": 1.9025927434222744e-05, + "loss": 0.9064, + "num_tokens": 12313305781.0, + "step": 2946 + }, + { + "epoch": 0.35020796197266785, + "grad_norm": 0.4746147040561137, + "learning_rate": 1.902514358943788e-05, + "loss": 0.8956, + "num_tokens": 12317494173.0, + "step": 2947 + }, + { + "epoch": 0.3503267973856209, + "grad_norm": 0.5491759373777837, + "learning_rate": 1.902435944745482e-05, + "loss": 0.8857, + "num_tokens": 12321683419.0, + "step": 2948 + }, + { + "epoch": 0.350445632798574, + "grad_norm": 0.5788463454646706, + "learning_rate": 1.9023575008302597e-05, + "loss": 0.9047, + "num_tokens": 12325873254.0, + "step": 2949 + }, + { + "epoch": 0.35056446821152704, + "grad_norm": 0.559681373616004, + "learning_rate": 1.9022790272010265e-05, + "loss": 0.8783, + "num_tokens": 12330044987.0, + "step": 2950 + }, + { + "epoch": 0.3506833036244801, + "grad_norm": 0.5247085586134117, + "learning_rate": 1.902200523860688e-05, + "loss": 0.9118, + "num_tokens": 12334231234.0, + "step": 2951 + }, + { + "epoch": 0.3508021390374332, + "grad_norm": 0.6830976325025189, + "learning_rate": 1.902121990812152e-05, + "loss": 0.9305, + "num_tokens": 12338399909.0, + "step": 2952 + }, + { + "epoch": 0.3509209744503862, + "grad_norm": 0.5372925399208409, + "learning_rate": 1.9020434280583267e-05, + "loss": 0.9426, + "num_tokens": 12342557415.0, + "step": 2953 + }, + { + "epoch": 0.35103980986333927, + "grad_norm": 0.6341238087057463, + "learning_rate": 1.9019648356021214e-05, + "loss": 0.8451, + "num_tokens": 12346746458.0, + "step": 2954 + }, + { + "epoch": 0.3511586452762923, + "grad_norm": 0.42769952185995774, + "learning_rate": 1.9018862134464466e-05, + "loss": 0.87, + "num_tokens": 12350935643.0, + "step": 2955 + }, + { + "epoch": 0.3512774806892454, + "grad_norm": 0.6510734304199075, + "learning_rate": 1.901807561594214e-05, + "loss": 0.9224, + "num_tokens": 12355115318.0, + "step": 2956 + }, + { + "epoch": 0.35139631610219846, + "grad_norm": 0.4637011751109267, + "learning_rate": 1.9017288800483366e-05, + "loss": 0.9065, + "num_tokens": 12359301430.0, + "step": 2957 + }, + { + "epoch": 0.3515151515151515, + "grad_norm": 0.5452963674592692, + "learning_rate": 1.901650168811728e-05, + "loss": 0.8929, + "num_tokens": 12363482092.0, + "step": 2958 + }, + { + "epoch": 0.3516339869281046, + "grad_norm": 0.5844188095702104, + "learning_rate": 1.9015714278873035e-05, + "loss": 0.8949, + "num_tokens": 12367644798.0, + "step": 2959 + }, + { + "epoch": 0.35175282234105765, + "grad_norm": 0.5795346166910021, + "learning_rate": 1.9014926572779787e-05, + "loss": 0.9099, + "num_tokens": 12371833190.0, + "step": 2960 + }, + { + "epoch": 0.3518716577540107, + "grad_norm": 0.5943719333437892, + "learning_rate": 1.9014138569866706e-05, + "loss": 0.8615, + "num_tokens": 12376023398.0, + "step": 2961 + }, + { + "epoch": 0.35199049316696374, + "grad_norm": 0.46447070640152777, + "learning_rate": 1.901335027016298e-05, + "loss": 0.9035, + "num_tokens": 12380212911.0, + "step": 2962 + }, + { + "epoch": 0.35210932857991684, + "grad_norm": 0.5499299669748173, + "learning_rate": 1.90125616736978e-05, + "loss": 0.8913, + "num_tokens": 12384380670.0, + "step": 2963 + }, + { + "epoch": 0.3522281639928699, + "grad_norm": 0.5660325988805945, + "learning_rate": 1.901177278050037e-05, + "loss": 0.9043, + "num_tokens": 12388571183.0, + "step": 2964 + }, + { + "epoch": 0.3523469994058229, + "grad_norm": 0.5426123506650992, + "learning_rate": 1.9010983590599904e-05, + "loss": 0.8877, + "num_tokens": 12392761720.0, + "step": 2965 + }, + { + "epoch": 0.35246583481877597, + "grad_norm": 0.5413239477450603, + "learning_rate": 1.901019410402563e-05, + "loss": 0.8534, + "num_tokens": 12396933376.0, + "step": 2966 + }, + { + "epoch": 0.35258467023172907, + "grad_norm": 0.5573230514875948, + "learning_rate": 1.9009404320806785e-05, + "loss": 0.9307, + "num_tokens": 12401122522.0, + "step": 2967 + }, + { + "epoch": 0.3527035056446821, + "grad_norm": 0.5672759050870124, + "learning_rate": 1.9008614240972618e-05, + "loss": 0.8987, + "num_tokens": 12405269543.0, + "step": 2968 + }, + { + "epoch": 0.35282234105763516, + "grad_norm": 0.4720227219889892, + "learning_rate": 1.9007823864552388e-05, + "loss": 0.9041, + "num_tokens": 12409447483.0, + "step": 2969 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.6360609511315546, + "learning_rate": 1.900703319157537e-05, + "loss": 0.9149, + "num_tokens": 12413634358.0, + "step": 2970 + }, + { + "epoch": 0.3530600118835413, + "grad_norm": 0.4889308318378883, + "learning_rate": 1.9006242222070835e-05, + "loss": 0.8664, + "num_tokens": 12417824168.0, + "step": 2971 + }, + { + "epoch": 0.35317884729649435, + "grad_norm": 0.5782668835698779, + "learning_rate": 1.9005450956068076e-05, + "loss": 0.9057, + "num_tokens": 12422012454.0, + "step": 2972 + }, + { + "epoch": 0.3532976827094474, + "grad_norm": 0.5479900268761785, + "learning_rate": 1.9004659393596406e-05, + "loss": 0.8949, + "num_tokens": 12426172659.0, + "step": 2973 + }, + { + "epoch": 0.3534165181224005, + "grad_norm": 0.5627151203262775, + "learning_rate": 1.900386753468514e-05, + "loss": 0.8796, + "num_tokens": 12430357476.0, + "step": 2974 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 0.6236948133362471, + "learning_rate": 1.900307537936359e-05, + "loss": 0.9365, + "num_tokens": 12434520709.0, + "step": 2975 + }, + { + "epoch": 0.3536541889483066, + "grad_norm": 0.5077914583425455, + "learning_rate": 1.9002282927661102e-05, + "loss": 0.8874, + "num_tokens": 12438709263.0, + "step": 2976 + }, + { + "epoch": 0.3537730243612597, + "grad_norm": 0.5955206352683909, + "learning_rate": 1.900149017960702e-05, + "loss": 0.9139, + "num_tokens": 12442868723.0, + "step": 2977 + }, + { + "epoch": 0.3538918597742127, + "grad_norm": 0.5065777982301927, + "learning_rate": 1.9000697135230705e-05, + "loss": 0.8954, + "num_tokens": 12447058206.0, + "step": 2978 + }, + { + "epoch": 0.35401069518716577, + "grad_norm": 0.5773549719781007, + "learning_rate": 1.8999903794561522e-05, + "loss": 0.9171, + "num_tokens": 12451247324.0, + "step": 2979 + }, + { + "epoch": 0.3541295306001188, + "grad_norm": 0.5342106590609481, + "learning_rate": 1.8999110157628857e-05, + "loss": 0.935, + "num_tokens": 12455436427.0, + "step": 2980 + }, + { + "epoch": 0.3542483660130719, + "grad_norm": 0.5194297204391473, + "learning_rate": 1.8998316224462092e-05, + "loss": 0.8636, + "num_tokens": 12459600171.0, + "step": 2981 + }, + { + "epoch": 0.35436720142602496, + "grad_norm": 0.5347505801628161, + "learning_rate": 1.8997521995090637e-05, + "loss": 0.9081, + "num_tokens": 12463788969.0, + "step": 2982 + }, + { + "epoch": 0.354486036838978, + "grad_norm": 0.5083953810349836, + "learning_rate": 1.8996727469543903e-05, + "loss": 0.9422, + "num_tokens": 12467975667.0, + "step": 2983 + }, + { + "epoch": 0.3546048722519311, + "grad_norm": 0.6122330755889241, + "learning_rate": 1.8995932647851313e-05, + "loss": 0.8827, + "num_tokens": 12472146378.0, + "step": 2984 + }, + { + "epoch": 0.35472370766488415, + "grad_norm": 0.5756779171861524, + "learning_rate": 1.89951375300423e-05, + "loss": 0.9388, + "num_tokens": 12476337374.0, + "step": 2985 + }, + { + "epoch": 0.3548425430778372, + "grad_norm": 0.4887634215988773, + "learning_rate": 1.8994342116146313e-05, + "loss": 0.8785, + "num_tokens": 12480526686.0, + "step": 2986 + }, + { + "epoch": 0.35496137849079024, + "grad_norm": 0.5556821242151144, + "learning_rate": 1.899354640619281e-05, + "loss": 0.8837, + "num_tokens": 12484693035.0, + "step": 2987 + }, + { + "epoch": 0.35508021390374334, + "grad_norm": 0.5623713818727313, + "learning_rate": 1.8992750400211252e-05, + "loss": 0.8451, + "num_tokens": 12488881739.0, + "step": 2988 + }, + { + "epoch": 0.3551990493166964, + "grad_norm": 0.4877710212253274, + "learning_rate": 1.8991954098231127e-05, + "loss": 0.9016, + "num_tokens": 12493071641.0, + "step": 2989 + }, + { + "epoch": 0.3553178847296494, + "grad_norm": 0.6127883941801465, + "learning_rate": 1.899115750028192e-05, + "loss": 0.9182, + "num_tokens": 12497258970.0, + "step": 2990 + }, + { + "epoch": 0.35543672014260247, + "grad_norm": 0.5177125539390363, + "learning_rate": 1.899036060639313e-05, + "loss": 0.9089, + "num_tokens": 12501448680.0, + "step": 2991 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.4415688416780879, + "learning_rate": 1.8989563416594274e-05, + "loss": 0.8971, + "num_tokens": 12505594798.0, + "step": 2992 + }, + { + "epoch": 0.3556743909685086, + "grad_norm": 0.5614974805764642, + "learning_rate": 1.898876593091487e-05, + "loss": 0.9148, + "num_tokens": 12509783999.0, + "step": 2993 + }, + { + "epoch": 0.35579322638146166, + "grad_norm": 0.6379199327054782, + "learning_rate": 1.898796814938445e-05, + "loss": 0.8874, + "num_tokens": 12513887558.0, + "step": 2994 + }, + { + "epoch": 0.35591206179441476, + "grad_norm": 0.5947817147306615, + "learning_rate": 1.898717007203256e-05, + "loss": 0.8831, + "num_tokens": 12518078406.0, + "step": 2995 + }, + { + "epoch": 0.3560308972073678, + "grad_norm": 0.5544334625631391, + "learning_rate": 1.898637169888876e-05, + "loss": 0.89, + "num_tokens": 12522267106.0, + "step": 2996 + }, + { + "epoch": 0.35614973262032085, + "grad_norm": 0.5450515784436715, + "learning_rate": 1.8985573029982614e-05, + "loss": 0.8977, + "num_tokens": 12526455352.0, + "step": 2997 + }, + { + "epoch": 0.3562685680332739, + "grad_norm": 0.5450690355376384, + "learning_rate": 1.89847740653437e-05, + "loss": 0.9032, + "num_tokens": 12530624357.0, + "step": 2998 + }, + { + "epoch": 0.356387403446227, + "grad_norm": 0.513459374084538, + "learning_rate": 1.8983974805001602e-05, + "loss": 0.8819, + "num_tokens": 12534794483.0, + "step": 2999 + }, + { + "epoch": 0.35650623885918004, + "grad_norm": 0.526111913480999, + "learning_rate": 1.8983175248985924e-05, + "loss": 0.8776, + "num_tokens": 12538953079.0, + "step": 3000 + }, + { + "epoch": 0.3566250742721331, + "grad_norm": 0.4911979876626541, + "learning_rate": 1.8982375397326277e-05, + "loss": 0.9107, + "num_tokens": 12543132279.0, + "step": 3001 + }, + { + "epoch": 0.3567439096850862, + "grad_norm": 0.5661131448444081, + "learning_rate": 1.898157525005228e-05, + "loss": 0.8645, + "num_tokens": 12547320614.0, + "step": 3002 + }, + { + "epoch": 0.3568627450980392, + "grad_norm": 0.5644987461489783, + "learning_rate": 1.8980774807193564e-05, + "loss": 0.8594, + "num_tokens": 12551510222.0, + "step": 3003 + }, + { + "epoch": 0.35698158051099227, + "grad_norm": 0.5597792359381806, + "learning_rate": 1.8979974068779773e-05, + "loss": 0.9148, + "num_tokens": 12555697793.0, + "step": 3004 + }, + { + "epoch": 0.3571004159239453, + "grad_norm": 0.5778015853673779, + "learning_rate": 1.8979173034840558e-05, + "loss": 0.9182, + "num_tokens": 12559887646.0, + "step": 3005 + }, + { + "epoch": 0.3572192513368984, + "grad_norm": 0.49329084438946086, + "learning_rate": 1.8978371705405588e-05, + "loss": 0.8762, + "num_tokens": 12564049044.0, + "step": 3006 + }, + { + "epoch": 0.35733808674985146, + "grad_norm": 0.5935501998706402, + "learning_rate": 1.8977570080504543e-05, + "loss": 0.8883, + "num_tokens": 12568238006.0, + "step": 3007 + }, + { + "epoch": 0.3574569221628045, + "grad_norm": 0.5270985332861909, + "learning_rate": 1.8976768160167103e-05, + "loss": 0.9621, + "num_tokens": 12572427929.0, + "step": 3008 + }, + { + "epoch": 0.3575757575757576, + "grad_norm": 0.5451816149843842, + "learning_rate": 1.8975965944422966e-05, + "loss": 0.8978, + "num_tokens": 12576617192.0, + "step": 3009 + }, + { + "epoch": 0.35769459298871065, + "grad_norm": 0.5576897146238818, + "learning_rate": 1.8975163433301845e-05, + "loss": 0.8672, + "num_tokens": 12580786277.0, + "step": 3010 + }, + { + "epoch": 0.3578134284016637, + "grad_norm": 0.5221648552640451, + "learning_rate": 1.8974360626833454e-05, + "loss": 0.8895, + "num_tokens": 12584974836.0, + "step": 3011 + }, + { + "epoch": 0.35793226381461674, + "grad_norm": 0.50424815038175, + "learning_rate": 1.897355752504753e-05, + "loss": 0.8744, + "num_tokens": 12589163323.0, + "step": 3012 + }, + { + "epoch": 0.35805109922756984, + "grad_norm": 0.5385545751234777, + "learning_rate": 1.8972754127973813e-05, + "loss": 0.8962, + "num_tokens": 12593328657.0, + "step": 3013 + }, + { + "epoch": 0.3581699346405229, + "grad_norm": 0.47996339006168826, + "learning_rate": 1.8971950435642052e-05, + "loss": 0.9016, + "num_tokens": 12597474351.0, + "step": 3014 + }, + { + "epoch": 0.3582887700534759, + "grad_norm": 0.5102694737631938, + "learning_rate": 1.8971146448082013e-05, + "loss": 0.8923, + "num_tokens": 12601664871.0, + "step": 3015 + }, + { + "epoch": 0.35840760546642897, + "grad_norm": 0.5154957350978289, + "learning_rate": 1.897034216532347e-05, + "loss": 0.8818, + "num_tokens": 12605845856.0, + "step": 3016 + }, + { + "epoch": 0.35852644087938207, + "grad_norm": 0.5443265894742372, + "learning_rate": 1.8969537587396207e-05, + "loss": 0.9278, + "num_tokens": 12610037085.0, + "step": 3017 + }, + { + "epoch": 0.3586452762923351, + "grad_norm": 0.6063262625681736, + "learning_rate": 1.896873271433002e-05, + "loss": 0.9227, + "num_tokens": 12614205048.0, + "step": 3018 + }, + { + "epoch": 0.35876411170528816, + "grad_norm": 0.4897864818194958, + "learning_rate": 1.896792754615472e-05, + "loss": 0.8983, + "num_tokens": 12618372755.0, + "step": 3019 + }, + { + "epoch": 0.35888294711824126, + "grad_norm": 0.49703744491589785, + "learning_rate": 1.8967122082900128e-05, + "loss": 0.9507, + "num_tokens": 12622561358.0, + "step": 3020 + }, + { + "epoch": 0.3590017825311943, + "grad_norm": 0.5877877969401161, + "learning_rate": 1.8966316324596058e-05, + "loss": 0.9134, + "num_tokens": 12626725204.0, + "step": 3021 + }, + { + "epoch": 0.35912061794414735, + "grad_norm": 0.5596430526370632, + "learning_rate": 1.8965510271272368e-05, + "loss": 0.8961, + "num_tokens": 12630915132.0, + "step": 3022 + }, + { + "epoch": 0.3592394533571004, + "grad_norm": 0.5165974553978213, + "learning_rate": 1.8964703922958897e-05, + "loss": 0.9318, + "num_tokens": 12635064304.0, + "step": 3023 + }, + { + "epoch": 0.3593582887700535, + "grad_norm": 0.5654341361309596, + "learning_rate": 1.8963897279685507e-05, + "loss": 0.9326, + "num_tokens": 12639253767.0, + "step": 3024 + }, + { + "epoch": 0.35947712418300654, + "grad_norm": 0.5002960769473231, + "learning_rate": 1.896309034148208e-05, + "loss": 0.9024, + "num_tokens": 12643432768.0, + "step": 3025 + }, + { + "epoch": 0.3595959595959596, + "grad_norm": 0.5393256726482258, + "learning_rate": 1.896228310837849e-05, + "loss": 0.9033, + "num_tokens": 12647574383.0, + "step": 3026 + }, + { + "epoch": 0.3597147950089127, + "grad_norm": 0.6601194208088333, + "learning_rate": 1.8961475580404637e-05, + "loss": 0.8467, + "num_tokens": 12651748369.0, + "step": 3027 + }, + { + "epoch": 0.3598336304218657, + "grad_norm": 0.40546034957860316, + "learning_rate": 1.8960667757590425e-05, + "loss": 0.8749, + "num_tokens": 12655916886.0, + "step": 3028 + }, + { + "epoch": 0.35995246583481877, + "grad_norm": 0.6450307922921462, + "learning_rate": 1.895985963996577e-05, + "loss": 0.9034, + "num_tokens": 12660098951.0, + "step": 3029 + }, + { + "epoch": 0.3600713012477718, + "grad_norm": 0.6004544691048215, + "learning_rate": 1.8959051227560597e-05, + "loss": 0.9319, + "num_tokens": 12664287877.0, + "step": 3030 + }, + { + "epoch": 0.3601901366607249, + "grad_norm": 0.46954234388167454, + "learning_rate": 1.895824252040485e-05, + "loss": 0.9037, + "num_tokens": 12668385615.0, + "step": 3031 + }, + { + "epoch": 0.36030897207367796, + "grad_norm": 0.6426971900762102, + "learning_rate": 1.8957433518528477e-05, + "loss": 0.9161, + "num_tokens": 12672548275.0, + "step": 3032 + }, + { + "epoch": 0.360427807486631, + "grad_norm": 0.4246051881826057, + "learning_rate": 1.8956624221961433e-05, + "loss": 0.8998, + "num_tokens": 12676734044.0, + "step": 3033 + }, + { + "epoch": 0.3605466428995841, + "grad_norm": 0.5044794522963478, + "learning_rate": 1.895581463073369e-05, + "loss": 0.8948, + "num_tokens": 12680898815.0, + "step": 3034 + }, + { + "epoch": 0.36066547831253715, + "grad_norm": 0.5832948855563481, + "learning_rate": 1.8955004744875236e-05, + "loss": 0.9121, + "num_tokens": 12685048160.0, + "step": 3035 + }, + { + "epoch": 0.3607843137254902, + "grad_norm": 0.5120237541945075, + "learning_rate": 1.895419456441606e-05, + "loss": 0.885, + "num_tokens": 12689235844.0, + "step": 3036 + }, + { + "epoch": 0.36090314913844324, + "grad_norm": 0.48109732008814227, + "learning_rate": 1.895338408938616e-05, + "loss": 0.9157, + "num_tokens": 12693424709.0, + "step": 3037 + }, + { + "epoch": 0.36102198455139634, + "grad_norm": 0.6596373661716898, + "learning_rate": 1.8952573319815563e-05, + "loss": 0.885, + "num_tokens": 12697582990.0, + "step": 3038 + }, + { + "epoch": 0.3611408199643494, + "grad_norm": 0.47973509264241954, + "learning_rate": 1.8951762255734286e-05, + "loss": 0.9172, + "num_tokens": 12701728041.0, + "step": 3039 + }, + { + "epoch": 0.3612596553773024, + "grad_norm": 0.617771147148588, + "learning_rate": 1.8950950897172363e-05, + "loss": 0.891, + "num_tokens": 12705918395.0, + "step": 3040 + }, + { + "epoch": 0.36137849079025547, + "grad_norm": 0.4794318673125197, + "learning_rate": 1.895013924415985e-05, + "loss": 0.9073, + "num_tokens": 12710106659.0, + "step": 3041 + }, + { + "epoch": 0.36149732620320857, + "grad_norm": 0.5530381425454343, + "learning_rate": 1.89493272967268e-05, + "loss": 0.903, + "num_tokens": 12714263236.0, + "step": 3042 + }, + { + "epoch": 0.3616161616161616, + "grad_norm": 0.6037942124226291, + "learning_rate": 1.894851505490328e-05, + "loss": 0.8731, + "num_tokens": 12718452467.0, + "step": 3043 + }, + { + "epoch": 0.36173499702911466, + "grad_norm": 0.4766436773493125, + "learning_rate": 1.8947702518719376e-05, + "loss": 0.8484, + "num_tokens": 12722641973.0, + "step": 3044 + }, + { + "epoch": 0.36185383244206776, + "grad_norm": 0.5371332854742796, + "learning_rate": 1.8946889688205176e-05, + "loss": 0.9087, + "num_tokens": 12726831190.0, + "step": 3045 + }, + { + "epoch": 0.3619726678550208, + "grad_norm": 0.6261880447754263, + "learning_rate": 1.8946076563390777e-05, + "loss": 0.8706, + "num_tokens": 12731020486.0, + "step": 3046 + }, + { + "epoch": 0.36209150326797385, + "grad_norm": 0.5223659363761585, + "learning_rate": 1.89452631443063e-05, + "loss": 0.8931, + "num_tokens": 12735211121.0, + "step": 3047 + }, + { + "epoch": 0.3622103386809269, + "grad_norm": 0.6173891140939031, + "learning_rate": 1.8944449430981868e-05, + "loss": 0.9095, + "num_tokens": 12739399870.0, + "step": 3048 + }, + { + "epoch": 0.36232917409388, + "grad_norm": 0.512531187847264, + "learning_rate": 1.894363542344761e-05, + "loss": 0.8809, + "num_tokens": 12743588194.0, + "step": 3049 + }, + { + "epoch": 0.36244800950683304, + "grad_norm": 0.5533817286105798, + "learning_rate": 1.8942821121733673e-05, + "loss": 0.8792, + "num_tokens": 12747776940.0, + "step": 3050 + }, + { + "epoch": 0.3625668449197861, + "grad_norm": 0.45376565247314465, + "learning_rate": 1.894200652587022e-05, + "loss": 0.9226, + "num_tokens": 12751964614.0, + "step": 3051 + }, + { + "epoch": 0.3626856803327392, + "grad_norm": 0.5431149072617504, + "learning_rate": 1.8941191635887406e-05, + "loss": 0.86, + "num_tokens": 12756154362.0, + "step": 3052 + }, + { + "epoch": 0.3628045157456922, + "grad_norm": 0.4710390364754499, + "learning_rate": 1.8940376451815418e-05, + "loss": 0.8722, + "num_tokens": 12760321057.0, + "step": 3053 + }, + { + "epoch": 0.36292335115864527, + "grad_norm": 0.5653136814078369, + "learning_rate": 1.8939560973684442e-05, + "loss": 0.8944, + "num_tokens": 12764504492.0, + "step": 3054 + }, + { + "epoch": 0.3630421865715983, + "grad_norm": 0.5982595788791651, + "learning_rate": 1.893874520152468e-05, + "loss": 0.9055, + "num_tokens": 12768693575.0, + "step": 3055 + }, + { + "epoch": 0.3631610219845514, + "grad_norm": 0.5006641356469516, + "learning_rate": 1.8937929135366344e-05, + "loss": 0.8862, + "num_tokens": 12772868966.0, + "step": 3056 + }, + { + "epoch": 0.36327985739750446, + "grad_norm": 0.6050112010739315, + "learning_rate": 1.893711277523965e-05, + "loss": 0.8754, + "num_tokens": 12777008723.0, + "step": 3057 + }, + { + "epoch": 0.3633986928104575, + "grad_norm": 0.5305192340320873, + "learning_rate": 1.8936296121174835e-05, + "loss": 0.8996, + "num_tokens": 12781198866.0, + "step": 3058 + }, + { + "epoch": 0.3635175282234106, + "grad_norm": 0.7108223599090362, + "learning_rate": 1.8935479173202142e-05, + "loss": 0.8936, + "num_tokens": 12785382333.0, + "step": 3059 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.4088850642665228, + "learning_rate": 1.8934661931351825e-05, + "loss": 0.9032, + "num_tokens": 12789571818.0, + "step": 3060 + }, + { + "epoch": 0.3637551990493167, + "grad_norm": 0.5880086253396551, + "learning_rate": 1.893384439565415e-05, + "loss": 0.9332, + "num_tokens": 12793739124.0, + "step": 3061 + }, + { + "epoch": 0.36387403446226974, + "grad_norm": 0.5655533284774685, + "learning_rate": 1.8933026566139388e-05, + "loss": 0.8955, + "num_tokens": 12797928291.0, + "step": 3062 + }, + { + "epoch": 0.36399286987522284, + "grad_norm": 0.5939992369348821, + "learning_rate": 1.8932208442837834e-05, + "loss": 0.9207, + "num_tokens": 12802117811.0, + "step": 3063 + }, + { + "epoch": 0.3641117052881759, + "grad_norm": 0.530701083830428, + "learning_rate": 1.8931390025779775e-05, + "loss": 0.8604, + "num_tokens": 12806305692.0, + "step": 3064 + }, + { + "epoch": 0.3642305407011289, + "grad_norm": 0.4527286219239381, + "learning_rate": 1.8930571314995533e-05, + "loss": 0.9242, + "num_tokens": 12810463142.0, + "step": 3065 + }, + { + "epoch": 0.364349376114082, + "grad_norm": 0.5029438909099516, + "learning_rate": 1.8929752310515423e-05, + "loss": 0.9102, + "num_tokens": 12814630611.0, + "step": 3066 + }, + { + "epoch": 0.36446821152703507, + "grad_norm": 0.5860811065456772, + "learning_rate": 1.892893301236977e-05, + "loss": 0.8944, + "num_tokens": 12818816478.0, + "step": 3067 + }, + { + "epoch": 0.3645870469399881, + "grad_norm": 0.6097150953959065, + "learning_rate": 1.8928113420588923e-05, + "loss": 0.8855, + "num_tokens": 12823004842.0, + "step": 3068 + }, + { + "epoch": 0.36470588235294116, + "grad_norm": 0.4963479309611819, + "learning_rate": 1.8927293535203228e-05, + "loss": 0.9357, + "num_tokens": 12827186257.0, + "step": 3069 + }, + { + "epoch": 0.36482471776589426, + "grad_norm": 0.5090947427940616, + "learning_rate": 1.892647335624305e-05, + "loss": 0.8952, + "num_tokens": 12831351925.0, + "step": 3070 + }, + { + "epoch": 0.3649435531788473, + "grad_norm": 0.5555787186413733, + "learning_rate": 1.892565288373876e-05, + "loss": 0.8847, + "num_tokens": 12835539241.0, + "step": 3071 + }, + { + "epoch": 0.36506238859180035, + "grad_norm": 0.4492668616389869, + "learning_rate": 1.892483211772075e-05, + "loss": 0.8987, + "num_tokens": 12839729315.0, + "step": 3072 + }, + { + "epoch": 0.3651812240047534, + "grad_norm": 0.4880504912105957, + "learning_rate": 1.8924011058219414e-05, + "loss": 0.8844, + "num_tokens": 12843918454.0, + "step": 3073 + }, + { + "epoch": 0.3653000594177065, + "grad_norm": 0.5660416286752336, + "learning_rate": 1.8923189705265157e-05, + "loss": 0.9489, + "num_tokens": 12848087514.0, + "step": 3074 + }, + { + "epoch": 0.36541889483065954, + "grad_norm": 0.5718304156021707, + "learning_rate": 1.8922368058888393e-05, + "loss": 0.9225, + "num_tokens": 12852275157.0, + "step": 3075 + }, + { + "epoch": 0.3655377302436126, + "grad_norm": 0.49011208224478153, + "learning_rate": 1.892154611911956e-05, + "loss": 0.8831, + "num_tokens": 12856464419.0, + "step": 3076 + }, + { + "epoch": 0.3656565656565657, + "grad_norm": 0.5354155685386501, + "learning_rate": 1.892072388598908e-05, + "loss": 0.8734, + "num_tokens": 12860626868.0, + "step": 3077 + }, + { + "epoch": 0.3657754010695187, + "grad_norm": 0.6437753430616272, + "learning_rate": 1.891990135952742e-05, + "loss": 0.8552, + "num_tokens": 12864814978.0, + "step": 3078 + }, + { + "epoch": 0.36589423648247177, + "grad_norm": 0.5659303764163972, + "learning_rate": 1.8919078539765037e-05, + "loss": 0.8946, + "num_tokens": 12869003208.0, + "step": 3079 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 0.5083221866144477, + "learning_rate": 1.89182554267324e-05, + "loss": 0.9038, + "num_tokens": 12873180510.0, + "step": 3080 + }, + { + "epoch": 0.3661319073083779, + "grad_norm": 0.4520607057100174, + "learning_rate": 1.891743202045999e-05, + "loss": 0.8621, + "num_tokens": 12877370735.0, + "step": 3081 + }, + { + "epoch": 0.36625074272133096, + "grad_norm": 0.6770457199103168, + "learning_rate": 1.8916608320978304e-05, + "loss": 0.9044, + "num_tokens": 12881558505.0, + "step": 3082 + }, + { + "epoch": 0.366369578134284, + "grad_norm": 0.4886237053083313, + "learning_rate": 1.8915784328317843e-05, + "loss": 0.8906, + "num_tokens": 12885746187.0, + "step": 3083 + }, + { + "epoch": 0.3664884135472371, + "grad_norm": 0.5200563768550027, + "learning_rate": 1.8914960042509126e-05, + "loss": 0.9007, + "num_tokens": 12889934102.0, + "step": 3084 + }, + { + "epoch": 0.36660724896019015, + "grad_norm": 0.6043260145869621, + "learning_rate": 1.891413546358268e-05, + "loss": 0.9048, + "num_tokens": 12894123724.0, + "step": 3085 + }, + { + "epoch": 0.3667260843731432, + "grad_norm": 0.5888592044869552, + "learning_rate": 1.8913310591569035e-05, + "loss": 0.8578, + "num_tokens": 12898309415.0, + "step": 3086 + }, + { + "epoch": 0.36684491978609624, + "grad_norm": 0.7081907701559046, + "learning_rate": 1.8912485426498746e-05, + "loss": 0.8886, + "num_tokens": 12902500231.0, + "step": 3087 + }, + { + "epoch": 0.36696375519904934, + "grad_norm": 0.48052776563561955, + "learning_rate": 1.891165996840237e-05, + "loss": 0.8701, + "num_tokens": 12906685238.0, + "step": 3088 + }, + { + "epoch": 0.3670825906120024, + "grad_norm": 0.6736770988093121, + "learning_rate": 1.891083421731047e-05, + "loss": 0.8293, + "num_tokens": 12910868066.0, + "step": 3089 + }, + { + "epoch": 0.3672014260249554, + "grad_norm": 0.469615167240613, + "learning_rate": 1.891000817325364e-05, + "loss": 0.9298, + "num_tokens": 12915052511.0, + "step": 3090 + }, + { + "epoch": 0.3673202614379085, + "grad_norm": 0.7438661116504434, + "learning_rate": 1.8909181836262456e-05, + "loss": 0.8944, + "num_tokens": 12919238311.0, + "step": 3091 + }, + { + "epoch": 0.36743909685086157, + "grad_norm": 0.5084643266156358, + "learning_rate": 1.890835520636753e-05, + "loss": 0.867, + "num_tokens": 12923428556.0, + "step": 3092 + }, + { + "epoch": 0.3675579322638146, + "grad_norm": 0.5560787777722435, + "learning_rate": 1.890752828359947e-05, + "loss": 0.8954, + "num_tokens": 12927618053.0, + "step": 3093 + }, + { + "epoch": 0.36767676767676766, + "grad_norm": 0.527669691293533, + "learning_rate": 1.89067010679889e-05, + "loss": 0.9223, + "num_tokens": 12931769894.0, + "step": 3094 + }, + { + "epoch": 0.36779560308972076, + "grad_norm": 0.559087832880111, + "learning_rate": 1.8905873559566463e-05, + "loss": 0.9275, + "num_tokens": 12935959620.0, + "step": 3095 + }, + { + "epoch": 0.3679144385026738, + "grad_norm": 0.46459055726201526, + "learning_rate": 1.8905045758362793e-05, + "loss": 0.8965, + "num_tokens": 12940123753.0, + "step": 3096 + }, + { + "epoch": 0.36803327391562685, + "grad_norm": 0.6711700103412891, + "learning_rate": 1.8904217664408552e-05, + "loss": 0.893, + "num_tokens": 12944290925.0, + "step": 3097 + }, + { + "epoch": 0.3681521093285799, + "grad_norm": 0.4971505789061895, + "learning_rate": 1.8903389277734405e-05, + "loss": 0.9168, + "num_tokens": 12948470808.0, + "step": 3098 + }, + { + "epoch": 0.368270944741533, + "grad_norm": 0.6377528094260109, + "learning_rate": 1.8902560598371035e-05, + "loss": 0.9246, + "num_tokens": 12952661896.0, + "step": 3099 + }, + { + "epoch": 0.36838978015448604, + "grad_norm": 0.49571562164923577, + "learning_rate": 1.890173162634912e-05, + "loss": 0.9007, + "num_tokens": 12956840777.0, + "step": 3100 + }, + { + "epoch": 0.3685086155674391, + "grad_norm": 0.5914998136957311, + "learning_rate": 1.890090236169937e-05, + "loss": 0.9085, + "num_tokens": 12961029040.0, + "step": 3101 + }, + { + "epoch": 0.3686274509803922, + "grad_norm": 0.5076709736273484, + "learning_rate": 1.8900072804452495e-05, + "loss": 0.9201, + "num_tokens": 12965194030.0, + "step": 3102 + }, + { + "epoch": 0.3687462863933452, + "grad_norm": 0.6112648815562625, + "learning_rate": 1.889924295463921e-05, + "loss": 0.8856, + "num_tokens": 12969385085.0, + "step": 3103 + }, + { + "epoch": 0.36886512180629827, + "grad_norm": 0.5235273278766017, + "learning_rate": 1.8898412812290252e-05, + "loss": 0.8766, + "num_tokens": 12973556651.0, + "step": 3104 + }, + { + "epoch": 0.3689839572192513, + "grad_norm": 0.6021682277323747, + "learning_rate": 1.889758237743636e-05, + "loss": 0.8766, + "num_tokens": 12977746701.0, + "step": 3105 + }, + { + "epoch": 0.3691027926322044, + "grad_norm": 0.47446606839225486, + "learning_rate": 1.8896751650108295e-05, + "loss": 0.869, + "num_tokens": 12981926923.0, + "step": 3106 + }, + { + "epoch": 0.36922162804515746, + "grad_norm": 0.5719530211163124, + "learning_rate": 1.889592063033681e-05, + "loss": 0.8962, + "num_tokens": 12986086323.0, + "step": 3107 + }, + { + "epoch": 0.3693404634581105, + "grad_norm": 0.5248481400578874, + "learning_rate": 1.889508931815269e-05, + "loss": 0.9488, + "num_tokens": 12990275835.0, + "step": 3108 + }, + { + "epoch": 0.3694592988710636, + "grad_norm": 0.522266424180986, + "learning_rate": 1.889425771358672e-05, + "loss": 0.8837, + "num_tokens": 12994464854.0, + "step": 3109 + }, + { + "epoch": 0.36957813428401665, + "grad_norm": 0.5076524054071675, + "learning_rate": 1.8893425816669695e-05, + "loss": 0.9104, + "num_tokens": 12998640437.0, + "step": 3110 + }, + { + "epoch": 0.3696969696969697, + "grad_norm": 0.5913199149502567, + "learning_rate": 1.889259362743243e-05, + "loss": 0.8578, + "num_tokens": 13002829560.0, + "step": 3111 + }, + { + "epoch": 0.36981580510992274, + "grad_norm": 0.6132313034281797, + "learning_rate": 1.8891761145905726e-05, + "loss": 0.9016, + "num_tokens": 13007018616.0, + "step": 3112 + }, + { + "epoch": 0.36993464052287583, + "grad_norm": 0.5061852445843621, + "learning_rate": 1.889092837212043e-05, + "loss": 0.8768, + "num_tokens": 13011205908.0, + "step": 3113 + }, + { + "epoch": 0.3700534759358289, + "grad_norm": 0.5848558636768773, + "learning_rate": 1.889009530610738e-05, + "loss": 0.9051, + "num_tokens": 13015366798.0, + "step": 3114 + }, + { + "epoch": 0.3701723113487819, + "grad_norm": 0.5024407764843151, + "learning_rate": 1.888926194789742e-05, + "loss": 0.8876, + "num_tokens": 13019554848.0, + "step": 3115 + }, + { + "epoch": 0.370291146761735, + "grad_norm": 0.56649533832625, + "learning_rate": 1.888842829752141e-05, + "loss": 0.8785, + "num_tokens": 13023693029.0, + "step": 3116 + }, + { + "epoch": 0.37040998217468807, + "grad_norm": 0.5520281120602323, + "learning_rate": 1.8887594355010237e-05, + "loss": 0.9011, + "num_tokens": 13027883374.0, + "step": 3117 + }, + { + "epoch": 0.3705288175876411, + "grad_norm": 0.514208627923457, + "learning_rate": 1.8886760120394774e-05, + "loss": 0.9144, + "num_tokens": 13032072304.0, + "step": 3118 + }, + { + "epoch": 0.37064765300059416, + "grad_norm": 0.5886856309815179, + "learning_rate": 1.8885925593705918e-05, + "loss": 0.8888, + "num_tokens": 13036240719.0, + "step": 3119 + }, + { + "epoch": 0.37076648841354726, + "grad_norm": 0.5469569921468642, + "learning_rate": 1.8885090774974576e-05, + "loss": 0.8831, + "num_tokens": 13040416313.0, + "step": 3120 + }, + { + "epoch": 0.3708853238265003, + "grad_norm": 0.5785951965189542, + "learning_rate": 1.8884255664231658e-05, + "loss": 0.8961, + "num_tokens": 13044605491.0, + "step": 3121 + }, + { + "epoch": 0.37100415923945335, + "grad_norm": 0.4708153782560378, + "learning_rate": 1.88834202615081e-05, + "loss": 0.8736, + "num_tokens": 13048794100.0, + "step": 3122 + }, + { + "epoch": 0.3711229946524064, + "grad_norm": 0.509012862252213, + "learning_rate": 1.8882584566834832e-05, + "loss": 0.8811, + "num_tokens": 13052963826.0, + "step": 3123 + }, + { + "epoch": 0.3712418300653595, + "grad_norm": 0.5581613052995168, + "learning_rate": 1.8881748580242808e-05, + "loss": 0.8882, + "num_tokens": 13057127274.0, + "step": 3124 + }, + { + "epoch": 0.37136066547831253, + "grad_norm": 0.570959024119822, + "learning_rate": 1.8880912301762982e-05, + "loss": 0.856, + "num_tokens": 13061317705.0, + "step": 3125 + }, + { + "epoch": 0.3714795008912656, + "grad_norm": 0.4739547707343926, + "learning_rate": 1.8880075731426334e-05, + "loss": 0.886, + "num_tokens": 13065505498.0, + "step": 3126 + }, + { + "epoch": 0.3715983363042187, + "grad_norm": 0.5015720250505988, + "learning_rate": 1.887923886926383e-05, + "loss": 0.8772, + "num_tokens": 13069693857.0, + "step": 3127 + }, + { + "epoch": 0.3717171717171717, + "grad_norm": 0.5302677275648499, + "learning_rate": 1.8878401715306474e-05, + "loss": 0.9162, + "num_tokens": 13073882334.0, + "step": 3128 + }, + { + "epoch": 0.37183600713012477, + "grad_norm": 0.5330605179752719, + "learning_rate": 1.8877564269585266e-05, + "loss": 0.919, + "num_tokens": 13078071324.0, + "step": 3129 + }, + { + "epoch": 0.3719548425430778, + "grad_norm": 0.5466774821498933, + "learning_rate": 1.887672653213122e-05, + "loss": 0.8964, + "num_tokens": 13082233546.0, + "step": 3130 + }, + { + "epoch": 0.3720736779560309, + "grad_norm": 0.5441207229290498, + "learning_rate": 1.8875888502975354e-05, + "loss": 0.9081, + "num_tokens": 13086407246.0, + "step": 3131 + }, + { + "epoch": 0.37219251336898396, + "grad_norm": 0.561375114664931, + "learning_rate": 1.8875050182148714e-05, + "loss": 0.89, + "num_tokens": 13090586094.0, + "step": 3132 + }, + { + "epoch": 0.372311348781937, + "grad_norm": 0.536579990310941, + "learning_rate": 1.8874211569682335e-05, + "loss": 0.8772, + "num_tokens": 13094759862.0, + "step": 3133 + }, + { + "epoch": 0.3724301841948901, + "grad_norm": 0.6172726717401265, + "learning_rate": 1.887337266560728e-05, + "loss": 0.8792, + "num_tokens": 13098948854.0, + "step": 3134 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 0.4833677529041084, + "learning_rate": 1.8872533469954614e-05, + "loss": 0.8822, + "num_tokens": 13103139028.0, + "step": 3135 + }, + { + "epoch": 0.3726678550207962, + "grad_norm": 0.41126229262395475, + "learning_rate": 1.8871693982755413e-05, + "loss": 0.8927, + "num_tokens": 13107329410.0, + "step": 3136 + }, + { + "epoch": 0.37278669043374923, + "grad_norm": 0.5627761173724041, + "learning_rate": 1.8870854204040774e-05, + "loss": 0.857, + "num_tokens": 13111504949.0, + "step": 3137 + }, + { + "epoch": 0.37290552584670233, + "grad_norm": 0.5255702678644582, + "learning_rate": 1.887001413384179e-05, + "loss": 0.8886, + "num_tokens": 13115694707.0, + "step": 3138 + }, + { + "epoch": 0.3730243612596554, + "grad_norm": 0.4876013175023678, + "learning_rate": 1.8869173772189572e-05, + "loss": 0.875, + "num_tokens": 13119884355.0, + "step": 3139 + }, + { + "epoch": 0.3731431966726084, + "grad_norm": 0.5315613057252868, + "learning_rate": 1.8868333119115242e-05, + "loss": 0.8519, + "num_tokens": 13124070639.0, + "step": 3140 + }, + { + "epoch": 0.3732620320855615, + "grad_norm": 0.5446342477106604, + "learning_rate": 1.8867492174649937e-05, + "loss": 0.8728, + "num_tokens": 13128230532.0, + "step": 3141 + }, + { + "epoch": 0.37338086749851457, + "grad_norm": 0.4555917163154756, + "learning_rate": 1.8866650938824796e-05, + "loss": 0.9204, + "num_tokens": 13132419075.0, + "step": 3142 + }, + { + "epoch": 0.3734997029114676, + "grad_norm": 0.5776800097225024, + "learning_rate": 1.886580941167097e-05, + "loss": 0.9288, + "num_tokens": 13136607872.0, + "step": 3143 + }, + { + "epoch": 0.37361853832442066, + "grad_norm": 0.5575346339152295, + "learning_rate": 1.886496759321963e-05, + "loss": 0.8733, + "num_tokens": 13140797514.0, + "step": 3144 + }, + { + "epoch": 0.37373737373737376, + "grad_norm": 0.46935461558692515, + "learning_rate": 1.8864125483501944e-05, + "loss": 0.8674, + "num_tokens": 13144985835.0, + "step": 3145 + }, + { + "epoch": 0.3738562091503268, + "grad_norm": 0.4902272188779974, + "learning_rate": 1.8863283082549103e-05, + "loss": 0.8469, + "num_tokens": 13149117911.0, + "step": 3146 + }, + { + "epoch": 0.37397504456327985, + "grad_norm": 0.6025074212756646, + "learning_rate": 1.8862440390392305e-05, + "loss": 0.9252, + "num_tokens": 13153268262.0, + "step": 3147 + }, + { + "epoch": 0.3740938799762329, + "grad_norm": 0.4581085178991657, + "learning_rate": 1.8861597407062758e-05, + "loss": 0.8854, + "num_tokens": 13157449591.0, + "step": 3148 + }, + { + "epoch": 0.374212715389186, + "grad_norm": 0.6076136767525556, + "learning_rate": 1.8860754132591673e-05, + "loss": 0.9024, + "num_tokens": 13161638951.0, + "step": 3149 + }, + { + "epoch": 0.37433155080213903, + "grad_norm": 0.4986626340557526, + "learning_rate": 1.8859910567010285e-05, + "loss": 0.8491, + "num_tokens": 13165828963.0, + "step": 3150 + }, + { + "epoch": 0.3744503862150921, + "grad_norm": 0.5537418659051115, + "learning_rate": 1.8859066710349837e-05, + "loss": 0.8867, + "num_tokens": 13170019085.0, + "step": 3151 + }, + { + "epoch": 0.3745692216280452, + "grad_norm": 0.4564772051571931, + "learning_rate": 1.8858222562641576e-05, + "loss": 0.8692, + "num_tokens": 13174177592.0, + "step": 3152 + }, + { + "epoch": 0.3746880570409982, + "grad_norm": 0.576179040838371, + "learning_rate": 1.885737812391676e-05, + "loss": 0.8954, + "num_tokens": 13178365445.0, + "step": 3153 + }, + { + "epoch": 0.37480689245395127, + "grad_norm": 0.5478751160700414, + "learning_rate": 1.8856533394206672e-05, + "loss": 0.8735, + "num_tokens": 13182554931.0, + "step": 3154 + }, + { + "epoch": 0.3749257278669043, + "grad_norm": 0.5713795020500781, + "learning_rate": 1.8855688373542587e-05, + "loss": 0.888, + "num_tokens": 13186744403.0, + "step": 3155 + }, + { + "epoch": 0.3750445632798574, + "grad_norm": 0.5291399105608723, + "learning_rate": 1.88548430619558e-05, + "loss": 0.8586, + "num_tokens": 13190932711.0, + "step": 3156 + }, + { + "epoch": 0.37516339869281046, + "grad_norm": 0.5171258778599587, + "learning_rate": 1.8853997459477614e-05, + "loss": 0.9179, + "num_tokens": 13195093731.0, + "step": 3157 + }, + { + "epoch": 0.3752822341057635, + "grad_norm": 0.5447498423293211, + "learning_rate": 1.885315156613935e-05, + "loss": 0.891, + "num_tokens": 13199279844.0, + "step": 3158 + }, + { + "epoch": 0.3754010695187166, + "grad_norm": 0.6163705223769713, + "learning_rate": 1.885230538197233e-05, + "loss": 0.8984, + "num_tokens": 13203437993.0, + "step": 3159 + }, + { + "epoch": 0.37551990493166965, + "grad_norm": 0.5265315041488895, + "learning_rate": 1.8851458907007893e-05, + "loss": 0.872, + "num_tokens": 13207617520.0, + "step": 3160 + }, + { + "epoch": 0.3756387403446227, + "grad_norm": 0.5367932497679554, + "learning_rate": 1.8850612141277386e-05, + "loss": 0.9007, + "num_tokens": 13211808015.0, + "step": 3161 + }, + { + "epoch": 0.37575757575757573, + "grad_norm": 0.6011013210591523, + "learning_rate": 1.8849765084812167e-05, + "loss": 0.8946, + "num_tokens": 13215965424.0, + "step": 3162 + }, + { + "epoch": 0.37587641117052883, + "grad_norm": 0.6288474887670515, + "learning_rate": 1.8848917737643607e-05, + "loss": 0.875, + "num_tokens": 13220153180.0, + "step": 3163 + }, + { + "epoch": 0.3759952465834819, + "grad_norm": 0.5925166204847404, + "learning_rate": 1.8848070099803085e-05, + "loss": 0.8724, + "num_tokens": 13224343219.0, + "step": 3164 + }, + { + "epoch": 0.3761140819964349, + "grad_norm": 0.5026434449009392, + "learning_rate": 1.8847222171321993e-05, + "loss": 0.8825, + "num_tokens": 13228531768.0, + "step": 3165 + }, + { + "epoch": 0.376232917409388, + "grad_norm": 0.5996281306944998, + "learning_rate": 1.8846373952231735e-05, + "loss": 0.8707, + "num_tokens": 13232710864.0, + "step": 3166 + }, + { + "epoch": 0.37635175282234107, + "grad_norm": 0.4595176965478939, + "learning_rate": 1.8845525442563717e-05, + "loss": 0.9215, + "num_tokens": 13236878627.0, + "step": 3167 + }, + { + "epoch": 0.3764705882352941, + "grad_norm": 0.5921591962846529, + "learning_rate": 1.8844676642349364e-05, + "loss": 0.8716, + "num_tokens": 13241068736.0, + "step": 3168 + }, + { + "epoch": 0.37658942364824716, + "grad_norm": 0.6122257889132409, + "learning_rate": 1.8843827551620115e-05, + "loss": 0.8439, + "num_tokens": 13245256208.0, + "step": 3169 + }, + { + "epoch": 0.37670825906120026, + "grad_norm": 0.5878671380614976, + "learning_rate": 1.8842978170407412e-05, + "loss": 0.8921, + "num_tokens": 13249445718.0, + "step": 3170 + }, + { + "epoch": 0.3768270944741533, + "grad_norm": 0.547206028880086, + "learning_rate": 1.8842128498742703e-05, + "loss": 0.9201, + "num_tokens": 13253635705.0, + "step": 3171 + }, + { + "epoch": 0.37694592988710635, + "grad_norm": 0.5804779326939052, + "learning_rate": 1.8841278536657465e-05, + "loss": 0.8889, + "num_tokens": 13257825287.0, + "step": 3172 + }, + { + "epoch": 0.3770647653000594, + "grad_norm": 0.4680723419411814, + "learning_rate": 1.884042828418317e-05, + "loss": 0.9353, + "num_tokens": 13261987359.0, + "step": 3173 + }, + { + "epoch": 0.3771836007130125, + "grad_norm": 0.516612252627313, + "learning_rate": 1.8839577741351308e-05, + "loss": 0.8818, + "num_tokens": 13266142147.0, + "step": 3174 + }, + { + "epoch": 0.37730243612596553, + "grad_norm": 0.6035869178796929, + "learning_rate": 1.8838726908193375e-05, + "loss": 0.8806, + "num_tokens": 13270312805.0, + "step": 3175 + }, + { + "epoch": 0.3774212715389186, + "grad_norm": 0.6030108123592847, + "learning_rate": 1.8837875784740884e-05, + "loss": 0.8892, + "num_tokens": 13274477956.0, + "step": 3176 + }, + { + "epoch": 0.3775401069518717, + "grad_norm": 0.49186338016910486, + "learning_rate": 1.8837024371025346e-05, + "loss": 0.9049, + "num_tokens": 13278650897.0, + "step": 3177 + }, + { + "epoch": 0.3776589423648247, + "grad_norm": 0.5587724019927642, + "learning_rate": 1.88361726670783e-05, + "loss": 0.873, + "num_tokens": 13282838314.0, + "step": 3178 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.44647450470133143, + "learning_rate": 1.883532067293129e-05, + "loss": 0.9214, + "num_tokens": 13286983910.0, + "step": 3179 + }, + { + "epoch": 0.3778966131907308, + "grad_norm": 0.4946995696438246, + "learning_rate": 1.883446838861586e-05, + "loss": 0.8927, + "num_tokens": 13291143672.0, + "step": 3180 + }, + { + "epoch": 0.3780154486036839, + "grad_norm": 0.5064436557072256, + "learning_rate": 1.8833615814163576e-05, + "loss": 0.8915, + "num_tokens": 13295331530.0, + "step": 3181 + }, + { + "epoch": 0.37813428401663696, + "grad_norm": 0.519937316014578, + "learning_rate": 1.8832762949606014e-05, + "loss": 0.9155, + "num_tokens": 13299519036.0, + "step": 3182 + }, + { + "epoch": 0.37825311942959, + "grad_norm": 0.507570742960901, + "learning_rate": 1.883190979497476e-05, + "loss": 0.9087, + "num_tokens": 13303708722.0, + "step": 3183 + }, + { + "epoch": 0.3783719548425431, + "grad_norm": 0.4405833904807965, + "learning_rate": 1.88310563503014e-05, + "loss": 0.8907, + "num_tokens": 13307896061.0, + "step": 3184 + }, + { + "epoch": 0.37849079025549615, + "grad_norm": 0.5656704237610773, + "learning_rate": 1.8830202615617546e-05, + "loss": 0.8583, + "num_tokens": 13312085872.0, + "step": 3185 + }, + { + "epoch": 0.3786096256684492, + "grad_norm": 0.45860414439884384, + "learning_rate": 1.8829348590954817e-05, + "loss": 0.8682, + "num_tokens": 13316276068.0, + "step": 3186 + }, + { + "epoch": 0.37872846108140223, + "grad_norm": 0.6534037766027088, + "learning_rate": 1.882849427634484e-05, + "loss": 0.8991, + "num_tokens": 13320466167.0, + "step": 3187 + }, + { + "epoch": 0.37884729649435533, + "grad_norm": 0.4655011552053371, + "learning_rate": 1.8827639671819248e-05, + "loss": 0.8872, + "num_tokens": 13324656309.0, + "step": 3188 + }, + { + "epoch": 0.3789661319073084, + "grad_norm": 0.5381301166356748, + "learning_rate": 1.8826784777409692e-05, + "loss": 0.8744, + "num_tokens": 13328820938.0, + "step": 3189 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 0.5481135407830573, + "learning_rate": 1.8825929593147837e-05, + "loss": 0.8853, + "num_tokens": 13333009704.0, + "step": 3190 + }, + { + "epoch": 0.3792038027332145, + "grad_norm": 0.4940190621974537, + "learning_rate": 1.8825074119065347e-05, + "loss": 0.9119, + "num_tokens": 13337198663.0, + "step": 3191 + }, + { + "epoch": 0.37932263814616757, + "grad_norm": 0.5264886426786072, + "learning_rate": 1.8824218355193906e-05, + "loss": 0.8737, + "num_tokens": 13341361292.0, + "step": 3192 + }, + { + "epoch": 0.3794414735591206, + "grad_norm": 0.5478652416403763, + "learning_rate": 1.8823362301565203e-05, + "loss": 0.9119, + "num_tokens": 13345550787.0, + "step": 3193 + }, + { + "epoch": 0.37956030897207366, + "grad_norm": 0.514043459951284, + "learning_rate": 1.8822505958210943e-05, + "loss": 0.8796, + "num_tokens": 13349693801.0, + "step": 3194 + }, + { + "epoch": 0.37967914438502676, + "grad_norm": 0.5495503747508822, + "learning_rate": 1.882164932516284e-05, + "loss": 0.9018, + "num_tokens": 13353875672.0, + "step": 3195 + }, + { + "epoch": 0.3797979797979798, + "grad_norm": 0.5934063827718621, + "learning_rate": 1.8820792402452613e-05, + "loss": 0.8939, + "num_tokens": 13358066113.0, + "step": 3196 + }, + { + "epoch": 0.37991681521093285, + "grad_norm": 0.44534122203408744, + "learning_rate": 1.8819935190112003e-05, + "loss": 0.9171, + "num_tokens": 13362255843.0, + "step": 3197 + }, + { + "epoch": 0.38003565062388595, + "grad_norm": 0.5086527607186848, + "learning_rate": 1.8819077688172756e-05, + "loss": 0.8725, + "num_tokens": 13366446496.0, + "step": 3198 + }, + { + "epoch": 0.380154486036839, + "grad_norm": 0.5708296525403859, + "learning_rate": 1.8818219896666622e-05, + "loss": 0.8873, + "num_tokens": 13370634915.0, + "step": 3199 + }, + { + "epoch": 0.38027332144979203, + "grad_norm": 0.5508943421085859, + "learning_rate": 1.8817361815625376e-05, + "loss": 0.8862, + "num_tokens": 13374825032.0, + "step": 3200 + }, + { + "epoch": 0.3803921568627451, + "grad_norm": 0.568156553624648, + "learning_rate": 1.8816503445080787e-05, + "loss": 0.8701, + "num_tokens": 13379015866.0, + "step": 3201 + }, + { + "epoch": 0.3805109922756982, + "grad_norm": 0.5247195198468212, + "learning_rate": 1.8815644785064644e-05, + "loss": 0.909, + "num_tokens": 13383183691.0, + "step": 3202 + }, + { + "epoch": 0.3806298276886512, + "grad_norm": 0.6263857133977825, + "learning_rate": 1.8814785835608757e-05, + "loss": 0.9215, + "num_tokens": 13387347995.0, + "step": 3203 + }, + { + "epoch": 0.38074866310160427, + "grad_norm": 0.4794767180937313, + "learning_rate": 1.881392659674492e-05, + "loss": 0.9086, + "num_tokens": 13391522559.0, + "step": 3204 + }, + { + "epoch": 0.3808674985145573, + "grad_norm": 0.5606231367640605, + "learning_rate": 1.8813067068504966e-05, + "loss": 0.8876, + "num_tokens": 13395704157.0, + "step": 3205 + }, + { + "epoch": 0.3809863339275104, + "grad_norm": 0.48503721374656666, + "learning_rate": 1.8812207250920722e-05, + "loss": 0.8559, + "num_tokens": 13399892642.0, + "step": 3206 + }, + { + "epoch": 0.38110516934046346, + "grad_norm": 0.554574175663583, + "learning_rate": 1.8811347144024027e-05, + "loss": 0.896, + "num_tokens": 13404065575.0, + "step": 3207 + }, + { + "epoch": 0.3812240047534165, + "grad_norm": 0.48688002602973696, + "learning_rate": 1.8810486747846738e-05, + "loss": 0.8717, + "num_tokens": 13408253347.0, + "step": 3208 + }, + { + "epoch": 0.3813428401663696, + "grad_norm": 0.6840955568875844, + "learning_rate": 1.880962606242072e-05, + "loss": 0.882, + "num_tokens": 13412400352.0, + "step": 3209 + }, + { + "epoch": 0.38146167557932265, + "grad_norm": 0.4710437668402203, + "learning_rate": 1.8808765087777844e-05, + "loss": 0.869, + "num_tokens": 13416589367.0, + "step": 3210 + }, + { + "epoch": 0.3815805109922757, + "grad_norm": 0.6689816374891928, + "learning_rate": 1.880790382394999e-05, + "loss": 0.84, + "num_tokens": 13420777879.0, + "step": 3211 + }, + { + "epoch": 0.38169934640522873, + "grad_norm": 0.6050318708616982, + "learning_rate": 1.880704227096906e-05, + "loss": 0.8675, + "num_tokens": 13424967075.0, + "step": 3212 + }, + { + "epoch": 0.38181818181818183, + "grad_norm": 0.4475785543158359, + "learning_rate": 1.880618042886696e-05, + "loss": 0.8948, + "num_tokens": 13429137728.0, + "step": 3213 + }, + { + "epoch": 0.3819370172311349, + "grad_norm": 0.7135810599143679, + "learning_rate": 1.8805318297675604e-05, + "loss": 0.9194, + "num_tokens": 13433326349.0, + "step": 3214 + }, + { + "epoch": 0.3820558526440879, + "grad_norm": 0.47043981805483187, + "learning_rate": 1.8804455877426918e-05, + "loss": 0.9023, + "num_tokens": 13437514913.0, + "step": 3215 + }, + { + "epoch": 0.382174688057041, + "grad_norm": 0.7169369487429491, + "learning_rate": 1.880359316815285e-05, + "loss": 0.8901, + "num_tokens": 13441673989.0, + "step": 3216 + }, + { + "epoch": 0.38229352346999407, + "grad_norm": 0.5430014087192933, + "learning_rate": 1.8802730169885337e-05, + "loss": 0.8984, + "num_tokens": 13445844774.0, + "step": 3217 + }, + { + "epoch": 0.3824123588829471, + "grad_norm": 0.4778970088619371, + "learning_rate": 1.880186688265635e-05, + "loss": 0.9064, + "num_tokens": 13450034635.0, + "step": 3218 + }, + { + "epoch": 0.38253119429590016, + "grad_norm": 0.5359236261185097, + "learning_rate": 1.8801003306497845e-05, + "loss": 0.9015, + "num_tokens": 13454210172.0, + "step": 3219 + }, + { + "epoch": 0.38265002970885326, + "grad_norm": 0.5016876052538344, + "learning_rate": 1.880013944144182e-05, + "loss": 0.8711, + "num_tokens": 13458399955.0, + "step": 3220 + }, + { + "epoch": 0.3827688651218063, + "grad_norm": 0.5591865729649095, + "learning_rate": 1.8799275287520255e-05, + "loss": 0.9041, + "num_tokens": 13462587381.0, + "step": 3221 + }, + { + "epoch": 0.38288770053475935, + "grad_norm": 0.601699388719539, + "learning_rate": 1.8798410844765152e-05, + "loss": 0.9284, + "num_tokens": 13466763795.0, + "step": 3222 + }, + { + "epoch": 0.38300653594771245, + "grad_norm": 0.5401596947038368, + "learning_rate": 1.8797546113208535e-05, + "loss": 0.8951, + "num_tokens": 13470893779.0, + "step": 3223 + }, + { + "epoch": 0.3831253713606655, + "grad_norm": 0.6273637427644456, + "learning_rate": 1.8796681092882418e-05, + "loss": 0.8751, + "num_tokens": 13475055117.0, + "step": 3224 + }, + { + "epoch": 0.38324420677361853, + "grad_norm": 0.43518974345055306, + "learning_rate": 1.879581578381884e-05, + "loss": 0.8516, + "num_tokens": 13479222139.0, + "step": 3225 + }, + { + "epoch": 0.3833630421865716, + "grad_norm": 0.6055746731759963, + "learning_rate": 1.8794950186049844e-05, + "loss": 0.8633, + "num_tokens": 13483411840.0, + "step": 3226 + }, + { + "epoch": 0.3834818775995247, + "grad_norm": 0.5162018072196376, + "learning_rate": 1.879408429960749e-05, + "loss": 0.8853, + "num_tokens": 13487590765.0, + "step": 3227 + }, + { + "epoch": 0.3836007130124777, + "grad_norm": 0.5938088213164212, + "learning_rate": 1.879321812452384e-05, + "loss": 0.8876, + "num_tokens": 13491779881.0, + "step": 3228 + }, + { + "epoch": 0.38371954842543077, + "grad_norm": 0.5366955781865828, + "learning_rate": 1.879235166083097e-05, + "loss": 0.8861, + "num_tokens": 13495968379.0, + "step": 3229 + }, + { + "epoch": 0.3838383838383838, + "grad_norm": 0.5956988422366088, + "learning_rate": 1.879148490856098e-05, + "loss": 0.9121, + "num_tokens": 13500157768.0, + "step": 3230 + }, + { + "epoch": 0.3839572192513369, + "grad_norm": 0.42396022180310666, + "learning_rate": 1.8790617867745955e-05, + "loss": 0.8883, + "num_tokens": 13504347438.0, + "step": 3231 + }, + { + "epoch": 0.38407605466428996, + "grad_norm": 0.7518930184684871, + "learning_rate": 1.8789750538418008e-05, + "loss": 0.8697, + "num_tokens": 13508536814.0, + "step": 3232 + }, + { + "epoch": 0.384194890077243, + "grad_norm": 0.49515894561090246, + "learning_rate": 1.8788882920609265e-05, + "loss": 0.8849, + "num_tokens": 13512718354.0, + "step": 3233 + }, + { + "epoch": 0.3843137254901961, + "grad_norm": 0.6403204443394803, + "learning_rate": 1.8788015014351846e-05, + "loss": 0.8587, + "num_tokens": 13516907029.0, + "step": 3234 + }, + { + "epoch": 0.38443256090314915, + "grad_norm": 0.526460806384513, + "learning_rate": 1.8787146819677907e-05, + "loss": 0.8618, + "num_tokens": 13521070545.0, + "step": 3235 + }, + { + "epoch": 0.3845513963161022, + "grad_norm": 0.688187727617752, + "learning_rate": 1.8786278336619587e-05, + "loss": 0.8812, + "num_tokens": 13525250692.0, + "step": 3236 + }, + { + "epoch": 0.38467023172905523, + "grad_norm": 0.5159319992262039, + "learning_rate": 1.8785409565209056e-05, + "loss": 0.909, + "num_tokens": 13529414260.0, + "step": 3237 + }, + { + "epoch": 0.38478906714200833, + "grad_norm": 0.5930378960219796, + "learning_rate": 1.8784540505478487e-05, + "loss": 0.8809, + "num_tokens": 13533601133.0, + "step": 3238 + }, + { + "epoch": 0.3849079025549614, + "grad_norm": 0.5527922640382117, + "learning_rate": 1.878367115746006e-05, + "loss": 0.9241, + "num_tokens": 13537789855.0, + "step": 3239 + }, + { + "epoch": 0.3850267379679144, + "grad_norm": 0.49781159167863237, + "learning_rate": 1.8782801521185974e-05, + "loss": 0.8701, + "num_tokens": 13541954740.0, + "step": 3240 + }, + { + "epoch": 0.3851455733808675, + "grad_norm": 0.6204489883252682, + "learning_rate": 1.8781931596688435e-05, + "loss": 0.8794, + "num_tokens": 13546142536.0, + "step": 3241 + }, + { + "epoch": 0.38526440879382057, + "grad_norm": 0.476747017349077, + "learning_rate": 1.8781061383999658e-05, + "loss": 0.8749, + "num_tokens": 13550321706.0, + "step": 3242 + }, + { + "epoch": 0.3853832442067736, + "grad_norm": 0.6579499729381588, + "learning_rate": 1.8780190883151865e-05, + "loss": 0.8946, + "num_tokens": 13554508898.0, + "step": 3243 + }, + { + "epoch": 0.38550207961972666, + "grad_norm": 0.5596078381492624, + "learning_rate": 1.8779320094177304e-05, + "loss": 0.8656, + "num_tokens": 13558697322.0, + "step": 3244 + }, + { + "epoch": 0.38562091503267976, + "grad_norm": 0.5438731684056218, + "learning_rate": 1.8778449017108213e-05, + "loss": 0.8799, + "num_tokens": 13562885929.0, + "step": 3245 + }, + { + "epoch": 0.3857397504456328, + "grad_norm": 0.5394204493904585, + "learning_rate": 1.8777577651976853e-05, + "loss": 0.9084, + "num_tokens": 13567076257.0, + "step": 3246 + }, + { + "epoch": 0.38585858585858585, + "grad_norm": 0.5912687477944977, + "learning_rate": 1.8776705998815505e-05, + "loss": 0.9079, + "num_tokens": 13571244935.0, + "step": 3247 + }, + { + "epoch": 0.38597742127153895, + "grad_norm": 0.6276957831816219, + "learning_rate": 1.8775834057656435e-05, + "loss": 0.8677, + "num_tokens": 13575433768.0, + "step": 3248 + }, + { + "epoch": 0.386096256684492, + "grad_norm": 1.0870931302620412, + "learning_rate": 1.8774961828531937e-05, + "loss": 0.8728, + "num_tokens": 13579623621.0, + "step": 3249 + }, + { + "epoch": 0.38621509209744503, + "grad_norm": 0.6561704800039743, + "learning_rate": 1.8774089311474317e-05, + "loss": 0.905, + "num_tokens": 13583811114.0, + "step": 3250 + }, + { + "epoch": 0.3863339275103981, + "grad_norm": 0.7542169367521119, + "learning_rate": 1.8773216506515883e-05, + "loss": 0.8857, + "num_tokens": 13587980700.0, + "step": 3251 + }, + { + "epoch": 0.3864527629233512, + "grad_norm": 0.4762892330122202, + "learning_rate": 1.8772343413688964e-05, + "loss": 0.8853, + "num_tokens": 13592168456.0, + "step": 3252 + }, + { + "epoch": 0.3865715983363042, + "grad_norm": 0.8187727342415742, + "learning_rate": 1.877147003302589e-05, + "loss": 0.871, + "num_tokens": 13596339616.0, + "step": 3253 + }, + { + "epoch": 0.38669043374925727, + "grad_norm": 0.5491889093582054, + "learning_rate": 1.8770596364558998e-05, + "loss": 0.8419, + "num_tokens": 13600495694.0, + "step": 3254 + }, + { + "epoch": 0.3868092691622103, + "grad_norm": 0.6546851921499096, + "learning_rate": 1.8769722408320653e-05, + "loss": 0.9111, + "num_tokens": 13604664389.0, + "step": 3255 + }, + { + "epoch": 0.3869281045751634, + "grad_norm": 0.6001623874802168, + "learning_rate": 1.876884816434322e-05, + "loss": 0.9538, + "num_tokens": 13608852840.0, + "step": 3256 + }, + { + "epoch": 0.38704693998811646, + "grad_norm": 0.5290167229888885, + "learning_rate": 1.876797363265907e-05, + "loss": 0.8998, + "num_tokens": 13613026345.0, + "step": 3257 + }, + { + "epoch": 0.3871657754010695, + "grad_norm": 0.6398272139515884, + "learning_rate": 1.8767098813300592e-05, + "loss": 0.9217, + "num_tokens": 13617208442.0, + "step": 3258 + }, + { + "epoch": 0.3872846108140226, + "grad_norm": 0.4867460447926506, + "learning_rate": 1.8766223706300184e-05, + "loss": 0.8942, + "num_tokens": 13621360127.0, + "step": 3259 + }, + { + "epoch": 0.38740344622697565, + "grad_norm": 0.6484890575127877, + "learning_rate": 1.876534831169025e-05, + "loss": 0.8649, + "num_tokens": 13625549881.0, + "step": 3260 + }, + { + "epoch": 0.3875222816399287, + "grad_norm": 0.48091521905146256, + "learning_rate": 1.876447262950322e-05, + "loss": 0.9305, + "num_tokens": 13629715499.0, + "step": 3261 + }, + { + "epoch": 0.38764111705288173, + "grad_norm": 0.6049021036747926, + "learning_rate": 1.8763596659771513e-05, + "loss": 0.8648, + "num_tokens": 13633903837.0, + "step": 3262 + }, + { + "epoch": 0.38775995246583483, + "grad_norm": 0.5803328821810916, + "learning_rate": 1.8762720402527574e-05, + "loss": 0.9187, + "num_tokens": 13638062510.0, + "step": 3263 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 0.5981128671263011, + "learning_rate": 1.8761843857803847e-05, + "loss": 0.9073, + "num_tokens": 13642203753.0, + "step": 3264 + }, + { + "epoch": 0.3879976232917409, + "grad_norm": 0.5215580903026623, + "learning_rate": 1.87609670256328e-05, + "loss": 0.8803, + "num_tokens": 13646377217.0, + "step": 3265 + }, + { + "epoch": 0.388116458704694, + "grad_norm": 0.5080748934051097, + "learning_rate": 1.876008990604691e-05, + "loss": 0.8979, + "num_tokens": 13650539363.0, + "step": 3266 + }, + { + "epoch": 0.38823529411764707, + "grad_norm": 0.5447711751754071, + "learning_rate": 1.8759212499078647e-05, + "loss": 0.9061, + "num_tokens": 13654695330.0, + "step": 3267 + }, + { + "epoch": 0.3883541295306001, + "grad_norm": 0.5041934851721955, + "learning_rate": 1.875833480476051e-05, + "loss": 0.8373, + "num_tokens": 13658884573.0, + "step": 3268 + }, + { + "epoch": 0.38847296494355316, + "grad_norm": 0.4932498326505028, + "learning_rate": 1.875745682312501e-05, + "loss": 0.9009, + "num_tokens": 13663073674.0, + "step": 3269 + }, + { + "epoch": 0.38859180035650626, + "grad_norm": 0.5272271492019995, + "learning_rate": 1.875657855420465e-05, + "loss": 0.866, + "num_tokens": 13667261893.0, + "step": 3270 + }, + { + "epoch": 0.3887106357694593, + "grad_norm": 0.5044115310890476, + "learning_rate": 1.8755699998031964e-05, + "loss": 0.8944, + "num_tokens": 13671449537.0, + "step": 3271 + }, + { + "epoch": 0.38882947118241235, + "grad_norm": 0.5256314209005352, + "learning_rate": 1.875482115463948e-05, + "loss": 0.8855, + "num_tokens": 13675613982.0, + "step": 3272 + }, + { + "epoch": 0.38894830659536545, + "grad_norm": 0.552198127403891, + "learning_rate": 1.8753942024059752e-05, + "loss": 0.9116, + "num_tokens": 13679783408.0, + "step": 3273 + }, + { + "epoch": 0.3890671420083185, + "grad_norm": 0.4814367509344491, + "learning_rate": 1.8753062606325335e-05, + "loss": 0.8778, + "num_tokens": 13683969106.0, + "step": 3274 + }, + { + "epoch": 0.38918597742127153, + "grad_norm": 0.45461699724136656, + "learning_rate": 1.875218290146879e-05, + "loss": 0.8825, + "num_tokens": 13688115906.0, + "step": 3275 + }, + { + "epoch": 0.3893048128342246, + "grad_norm": 0.5993254704220291, + "learning_rate": 1.8751302909522705e-05, + "loss": 0.8955, + "num_tokens": 13692306516.0, + "step": 3276 + }, + { + "epoch": 0.3894236482471777, + "grad_norm": 0.5066783412899042, + "learning_rate": 1.875042263051967e-05, + "loss": 0.8764, + "num_tokens": 13696495522.0, + "step": 3277 + }, + { + "epoch": 0.3895424836601307, + "grad_norm": 0.5230441932440911, + "learning_rate": 1.8749542064492278e-05, + "loss": 0.8891, + "num_tokens": 13700686257.0, + "step": 3278 + }, + { + "epoch": 0.38966131907308377, + "grad_norm": 0.5614590766172531, + "learning_rate": 1.8748661211473135e-05, + "loss": 0.931, + "num_tokens": 13704850342.0, + "step": 3279 + }, + { + "epoch": 0.3897801544860368, + "grad_norm": 0.49707789187556484, + "learning_rate": 1.8747780071494875e-05, + "loss": 0.8771, + "num_tokens": 13709039791.0, + "step": 3280 + }, + { + "epoch": 0.3898989898989899, + "grad_norm": 0.5613990602734376, + "learning_rate": 1.874689864459012e-05, + "loss": 0.8912, + "num_tokens": 13713199856.0, + "step": 3281 + }, + { + "epoch": 0.39001782531194296, + "grad_norm": 0.4609046580758581, + "learning_rate": 1.8746016930791516e-05, + "loss": 0.894, + "num_tokens": 13717389546.0, + "step": 3282 + }, + { + "epoch": 0.390136660724896, + "grad_norm": 0.537337757254477, + "learning_rate": 1.874513493013172e-05, + "loss": 0.8768, + "num_tokens": 13721579276.0, + "step": 3283 + }, + { + "epoch": 0.3902554961378491, + "grad_norm": 0.46900966230952945, + "learning_rate": 1.8744252642643384e-05, + "loss": 0.8686, + "num_tokens": 13725736968.0, + "step": 3284 + }, + { + "epoch": 0.39037433155080214, + "grad_norm": 0.7237975043383761, + "learning_rate": 1.8743370068359193e-05, + "loss": 0.8813, + "num_tokens": 13729925091.0, + "step": 3285 + }, + { + "epoch": 0.3904931669637552, + "grad_norm": 0.4407456263766297, + "learning_rate": 1.8742487207311822e-05, + "loss": 0.8963, + "num_tokens": 13734113042.0, + "step": 3286 + }, + { + "epoch": 0.39061200237670823, + "grad_norm": 0.6291750094517752, + "learning_rate": 1.874160405953398e-05, + "loss": 0.875, + "num_tokens": 13738303563.0, + "step": 3287 + }, + { + "epoch": 0.39073083778966133, + "grad_norm": 0.5441153572182231, + "learning_rate": 1.8740720625058358e-05, + "loss": 0.8961, + "num_tokens": 13742490909.0, + "step": 3288 + }, + { + "epoch": 0.3908496732026144, + "grad_norm": 0.5255039382537128, + "learning_rate": 1.8739836903917683e-05, + "loss": 0.9218, + "num_tokens": 13746678569.0, + "step": 3289 + }, + { + "epoch": 0.3909685086155674, + "grad_norm": 0.524811823822048, + "learning_rate": 1.8738952896144676e-05, + "loss": 0.8962, + "num_tokens": 13750840596.0, + "step": 3290 + }, + { + "epoch": 0.3910873440285205, + "grad_norm": 0.6341438321201166, + "learning_rate": 1.873806860177208e-05, + "loss": 0.8759, + "num_tokens": 13755029887.0, + "step": 3291 + }, + { + "epoch": 0.39120617944147357, + "grad_norm": 0.5391173441631589, + "learning_rate": 1.8737184020832637e-05, + "loss": 0.8778, + "num_tokens": 13759220149.0, + "step": 3292 + }, + { + "epoch": 0.3913250148544266, + "grad_norm": 0.5211371578172797, + "learning_rate": 1.873629915335911e-05, + "loss": 0.8696, + "num_tokens": 13763409996.0, + "step": 3293 + }, + { + "epoch": 0.39144385026737966, + "grad_norm": 0.6705091578025821, + "learning_rate": 1.873541399938427e-05, + "loss": 0.8856, + "num_tokens": 13767563415.0, + "step": 3294 + }, + { + "epoch": 0.39156268568033276, + "grad_norm": 0.4812985558124397, + "learning_rate": 1.8734528558940896e-05, + "loss": 0.8853, + "num_tokens": 13771751117.0, + "step": 3295 + }, + { + "epoch": 0.3916815210932858, + "grad_norm": 0.5618234371668374, + "learning_rate": 1.873364283206178e-05, + "loss": 0.8739, + "num_tokens": 13775939713.0, + "step": 3296 + }, + { + "epoch": 0.39180035650623884, + "grad_norm": 0.5027184998804477, + "learning_rate": 1.8732756818779717e-05, + "loss": 0.866, + "num_tokens": 13780130480.0, + "step": 3297 + }, + { + "epoch": 0.39191919191919194, + "grad_norm": 0.6655737990885573, + "learning_rate": 1.8731870519127527e-05, + "loss": 0.8829, + "num_tokens": 13784291961.0, + "step": 3298 + }, + { + "epoch": 0.392038027332145, + "grad_norm": 0.46413627463095813, + "learning_rate": 1.8730983933138028e-05, + "loss": 0.8757, + "num_tokens": 13788481993.0, + "step": 3299 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.5176923495495948, + "learning_rate": 1.8730097060844058e-05, + "loss": 0.888, + "num_tokens": 13792670072.0, + "step": 3300 + }, + { + "epoch": 0.3922756981580511, + "grad_norm": 0.6560904620392669, + "learning_rate": 1.8729209902278457e-05, + "loss": 0.9045, + "num_tokens": 13796858781.0, + "step": 3301 + }, + { + "epoch": 0.3923945335710042, + "grad_norm": 0.5748316617133359, + "learning_rate": 1.8728322457474074e-05, + "loss": 0.8622, + "num_tokens": 13801037001.0, + "step": 3302 + }, + { + "epoch": 0.3925133689839572, + "grad_norm": 0.5304410929549584, + "learning_rate": 1.872743472646379e-05, + "loss": 0.893, + "num_tokens": 13805222101.0, + "step": 3303 + }, + { + "epoch": 0.39263220439691027, + "grad_norm": 0.5444464902599853, + "learning_rate": 1.8726546709280464e-05, + "loss": 0.8911, + "num_tokens": 13809411301.0, + "step": 3304 + }, + { + "epoch": 0.3927510398098633, + "grad_norm": 0.5402051696149051, + "learning_rate": 1.8725658405956994e-05, + "loss": 0.9205, + "num_tokens": 13813582392.0, + "step": 3305 + }, + { + "epoch": 0.3928698752228164, + "grad_norm": 0.4809326901779984, + "learning_rate": 1.872476981652627e-05, + "loss": 0.8909, + "num_tokens": 13817771389.0, + "step": 3306 + }, + { + "epoch": 0.39298871063576946, + "grad_norm": 0.5260683098034145, + "learning_rate": 1.87238809410212e-05, + "loss": 0.8761, + "num_tokens": 13821908300.0, + "step": 3307 + }, + { + "epoch": 0.3931075460487225, + "grad_norm": 0.5671717692611231, + "learning_rate": 1.8722991779474703e-05, + "loss": 0.9114, + "num_tokens": 13826078579.0, + "step": 3308 + }, + { + "epoch": 0.3932263814616756, + "grad_norm": 0.47457172015237675, + "learning_rate": 1.872210233191971e-05, + "loss": 0.8641, + "num_tokens": 13830242186.0, + "step": 3309 + }, + { + "epoch": 0.39334521687462864, + "grad_norm": 0.5395094912939674, + "learning_rate": 1.872121259838916e-05, + "loss": 0.9069, + "num_tokens": 13834394045.0, + "step": 3310 + }, + { + "epoch": 0.3934640522875817, + "grad_norm": 0.5012421453511949, + "learning_rate": 1.8720322578916e-05, + "loss": 0.8878, + "num_tokens": 13838582714.0, + "step": 3311 + }, + { + "epoch": 0.39358288770053473, + "grad_norm": 0.5379398393572001, + "learning_rate": 1.8719432273533193e-05, + "loss": 0.9024, + "num_tokens": 13842764407.0, + "step": 3312 + }, + { + "epoch": 0.39370172311348783, + "grad_norm": 0.5445787883894705, + "learning_rate": 1.871854168227371e-05, + "loss": 0.8812, + "num_tokens": 13846952968.0, + "step": 3313 + }, + { + "epoch": 0.3938205585264409, + "grad_norm": 0.5410238198371456, + "learning_rate": 1.8717650805170528e-05, + "loss": 0.9178, + "num_tokens": 13851142456.0, + "step": 3314 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 0.5098317618347241, + "learning_rate": 1.8716759642256645e-05, + "loss": 0.8834, + "num_tokens": 13855310381.0, + "step": 3315 + }, + { + "epoch": 0.394058229352347, + "grad_norm": 0.5408515040878923, + "learning_rate": 1.871586819356506e-05, + "loss": 0.8421, + "num_tokens": 13859477401.0, + "step": 3316 + }, + { + "epoch": 0.39417706476530007, + "grad_norm": 0.4511973794929227, + "learning_rate": 1.871497645912879e-05, + "loss": 0.8885, + "num_tokens": 13863636844.0, + "step": 3317 + }, + { + "epoch": 0.3942959001782531, + "grad_norm": 0.5066447729905539, + "learning_rate": 1.8714084438980853e-05, + "loss": 0.8862, + "num_tokens": 13867826550.0, + "step": 3318 + }, + { + "epoch": 0.39441473559120616, + "grad_norm": 0.5340451137795768, + "learning_rate": 1.871319213315429e-05, + "loss": 0.9262, + "num_tokens": 13872015672.0, + "step": 3319 + }, + { + "epoch": 0.39453357100415926, + "grad_norm": 0.6342688894225105, + "learning_rate": 1.8712299541682145e-05, + "loss": 0.8489, + "num_tokens": 13876182526.0, + "step": 3320 + }, + { + "epoch": 0.3946524064171123, + "grad_norm": 0.4926320752194287, + "learning_rate": 1.8711406664597468e-05, + "loss": 0.9097, + "num_tokens": 13880352207.0, + "step": 3321 + }, + { + "epoch": 0.39477124183006534, + "grad_norm": 0.5722527613542084, + "learning_rate": 1.8710513501933332e-05, + "loss": 0.8758, + "num_tokens": 13884540975.0, + "step": 3322 + }, + { + "epoch": 0.39489007724301844, + "grad_norm": 0.4589743960856154, + "learning_rate": 1.8709620053722813e-05, + "loss": 0.8857, + "num_tokens": 13888731654.0, + "step": 3323 + }, + { + "epoch": 0.3950089126559715, + "grad_norm": 0.47763560521870174, + "learning_rate": 1.8708726319998992e-05, + "loss": 0.9011, + "num_tokens": 13892920669.0, + "step": 3324 + }, + { + "epoch": 0.39512774806892453, + "grad_norm": 0.5796785700517476, + "learning_rate": 1.8707832300794974e-05, + "loss": 0.9015, + "num_tokens": 13897109579.0, + "step": 3325 + }, + { + "epoch": 0.3952465834818776, + "grad_norm": 0.5033167787966008, + "learning_rate": 1.8706937996143865e-05, + "loss": 0.9203, + "num_tokens": 13901296616.0, + "step": 3326 + }, + { + "epoch": 0.3953654188948307, + "grad_norm": 0.49905144920482164, + "learning_rate": 1.8706043406078784e-05, + "loss": 0.8711, + "num_tokens": 13905485964.0, + "step": 3327 + }, + { + "epoch": 0.3954842543077837, + "grad_norm": 0.5873510555919212, + "learning_rate": 1.870514853063286e-05, + "loss": 0.911, + "num_tokens": 13909666763.0, + "step": 3328 + }, + { + "epoch": 0.39560308972073677, + "grad_norm": 0.4233937884791239, + "learning_rate": 1.8704253369839235e-05, + "loss": 0.849, + "num_tokens": 13913829999.0, + "step": 3329 + }, + { + "epoch": 0.39572192513368987, + "grad_norm": 0.5245208816117324, + "learning_rate": 1.870335792373106e-05, + "loss": 0.911, + "num_tokens": 13918018939.0, + "step": 3330 + }, + { + "epoch": 0.3958407605466429, + "grad_norm": 0.6434556140787011, + "learning_rate": 1.8702462192341496e-05, + "loss": 0.865, + "num_tokens": 13922207584.0, + "step": 3331 + }, + { + "epoch": 0.39595959595959596, + "grad_norm": 0.5427280520152541, + "learning_rate": 1.8701566175703715e-05, + "loss": 0.852, + "num_tokens": 13926398459.0, + "step": 3332 + }, + { + "epoch": 0.396078431372549, + "grad_norm": 0.4749792564135421, + "learning_rate": 1.8700669873850895e-05, + "loss": 0.8751, + "num_tokens": 13930587847.0, + "step": 3333 + }, + { + "epoch": 0.3961972667855021, + "grad_norm": 0.6892817896151258, + "learning_rate": 1.8699773286816235e-05, + "loss": 0.8922, + "num_tokens": 13934776593.0, + "step": 3334 + }, + { + "epoch": 0.39631610219845514, + "grad_norm": 0.5606120736879479, + "learning_rate": 1.8698876414632937e-05, + "loss": 0.8673, + "num_tokens": 13938945272.0, + "step": 3335 + }, + { + "epoch": 0.3964349376114082, + "grad_norm": 0.6010750420153628, + "learning_rate": 1.8697979257334212e-05, + "loss": 0.8697, + "num_tokens": 13943132726.0, + "step": 3336 + }, + { + "epoch": 0.39655377302436123, + "grad_norm": 0.606121666166385, + "learning_rate": 1.869708181495329e-05, + "loss": 0.9309, + "num_tokens": 13947293132.0, + "step": 3337 + }, + { + "epoch": 0.39667260843731433, + "grad_norm": 0.5837921227673067, + "learning_rate": 1.8696184087523405e-05, + "loss": 0.8761, + "num_tokens": 13951469685.0, + "step": 3338 + }, + { + "epoch": 0.3967914438502674, + "grad_norm": 0.5575440315627697, + "learning_rate": 1.86952860750778e-05, + "loss": 0.8885, + "num_tokens": 13955659759.0, + "step": 3339 + }, + { + "epoch": 0.3969102792632204, + "grad_norm": 0.5472062192948182, + "learning_rate": 1.8694387777649734e-05, + "loss": 0.8955, + "num_tokens": 13959848210.0, + "step": 3340 + }, + { + "epoch": 0.3970291146761735, + "grad_norm": 0.4894918869111815, + "learning_rate": 1.8693489195272475e-05, + "loss": 0.8568, + "num_tokens": 13964009967.0, + "step": 3341 + }, + { + "epoch": 0.39714795008912657, + "grad_norm": 0.540185060050111, + "learning_rate": 1.8692590327979296e-05, + "loss": 0.8796, + "num_tokens": 13968199500.0, + "step": 3342 + }, + { + "epoch": 0.3972667855020796, + "grad_norm": 0.5966243339078654, + "learning_rate": 1.869169117580349e-05, + "loss": 0.8838, + "num_tokens": 13972379021.0, + "step": 3343 + }, + { + "epoch": 0.39738562091503266, + "grad_norm": 0.4283977912467412, + "learning_rate": 1.869079173877835e-05, + "loss": 0.8922, + "num_tokens": 13976569821.0, + "step": 3344 + }, + { + "epoch": 0.39750445632798576, + "grad_norm": 0.5916136512749879, + "learning_rate": 1.8689892016937193e-05, + "loss": 0.8811, + "num_tokens": 13980759104.0, + "step": 3345 + }, + { + "epoch": 0.3976232917409388, + "grad_norm": 0.5202056057712834, + "learning_rate": 1.8688992010313332e-05, + "loss": 0.9193, + "num_tokens": 13984946797.0, + "step": 3346 + }, + { + "epoch": 0.39774212715389184, + "grad_norm": 0.4633205680905348, + "learning_rate": 1.8688091718940102e-05, + "loss": 0.815, + "num_tokens": 13989121292.0, + "step": 3347 + }, + { + "epoch": 0.39786096256684494, + "grad_norm": 0.57141197712014, + "learning_rate": 1.868719114285084e-05, + "loss": 0.9295, + "num_tokens": 13993282954.0, + "step": 3348 + }, + { + "epoch": 0.397979797979798, + "grad_norm": 0.5360363028429125, + "learning_rate": 1.8686290282078895e-05, + "loss": 0.8449, + "num_tokens": 13997472749.0, + "step": 3349 + }, + { + "epoch": 0.39809863339275103, + "grad_norm": 0.6357012898546982, + "learning_rate": 1.8685389136657642e-05, + "loss": 0.8928, + "num_tokens": 14001660533.0, + "step": 3350 + }, + { + "epoch": 0.3982174688057041, + "grad_norm": 0.4990612683033764, + "learning_rate": 1.868448770662044e-05, + "loss": 0.9161, + "num_tokens": 14005834381.0, + "step": 3351 + }, + { + "epoch": 0.3983363042186572, + "grad_norm": 0.6249928523289425, + "learning_rate": 1.8683585992000675e-05, + "loss": 0.8732, + "num_tokens": 14010024239.0, + "step": 3352 + }, + { + "epoch": 0.3984551396316102, + "grad_norm": 0.543510617097883, + "learning_rate": 1.868268399283174e-05, + "loss": 0.9209, + "num_tokens": 14014194303.0, + "step": 3353 + }, + { + "epoch": 0.39857397504456327, + "grad_norm": 0.5144613031873349, + "learning_rate": 1.868178170914704e-05, + "loss": 0.8506, + "num_tokens": 14018384372.0, + "step": 3354 + }, + { + "epoch": 0.39869281045751637, + "grad_norm": 0.4893936699140216, + "learning_rate": 1.8680879140979996e-05, + "loss": 0.8611, + "num_tokens": 14022572757.0, + "step": 3355 + }, + { + "epoch": 0.3988116458704694, + "grad_norm": 0.5322548938343826, + "learning_rate": 1.8679976288364027e-05, + "loss": 0.9114, + "num_tokens": 14026734288.0, + "step": 3356 + }, + { + "epoch": 0.39893048128342246, + "grad_norm": 0.5755197204792943, + "learning_rate": 1.8679073151332568e-05, + "loss": 0.9054, + "num_tokens": 14030923020.0, + "step": 3357 + }, + { + "epoch": 0.3990493166963755, + "grad_norm": 0.646404184994706, + "learning_rate": 1.8678169729919067e-05, + "loss": 0.8843, + "num_tokens": 14035113080.0, + "step": 3358 + }, + { + "epoch": 0.3991681521093286, + "grad_norm": 0.440099321607551, + "learning_rate": 1.8677266024156982e-05, + "loss": 0.8689, + "num_tokens": 14039303365.0, + "step": 3359 + }, + { + "epoch": 0.39928698752228164, + "grad_norm": 0.5478337622976478, + "learning_rate": 1.8676362034079777e-05, + "loss": 0.8784, + "num_tokens": 14043493248.0, + "step": 3360 + }, + { + "epoch": 0.3994058229352347, + "grad_norm": 0.693534900499522, + "learning_rate": 1.8675457759720933e-05, + "loss": 0.9325, + "num_tokens": 14047682546.0, + "step": 3361 + }, + { + "epoch": 0.39952465834818773, + "grad_norm": 0.4864507387800391, + "learning_rate": 1.8674553201113933e-05, + "loss": 0.8994, + "num_tokens": 14051845442.0, + "step": 3362 + }, + { + "epoch": 0.39964349376114083, + "grad_norm": 0.6954247795307891, + "learning_rate": 1.8673648358292282e-05, + "loss": 0.8947, + "num_tokens": 14056033721.0, + "step": 3363 + }, + { + "epoch": 0.3997623291740939, + "grad_norm": 0.4931035322811643, + "learning_rate": 1.8672743231289488e-05, + "loss": 0.8907, + "num_tokens": 14060221895.0, + "step": 3364 + }, + { + "epoch": 0.3998811645870469, + "grad_norm": 0.6641216103204775, + "learning_rate": 1.8671837820139066e-05, + "loss": 0.8738, + "num_tokens": 14064410266.0, + "step": 3365 + }, + { + "epoch": 0.4, + "grad_norm": 0.5273358705201188, + "learning_rate": 1.8670932124874556e-05, + "loss": 0.8798, + "num_tokens": 14068597338.0, + "step": 3366 + }, + { + "epoch": 0.40011883541295307, + "grad_norm": 0.669399270854106, + "learning_rate": 1.8670026145529493e-05, + "loss": 0.8824, + "num_tokens": 14072787053.0, + "step": 3367 + }, + { + "epoch": 0.4002376708259061, + "grad_norm": 0.5369657633904412, + "learning_rate": 1.8669119882137424e-05, + "loss": 0.8368, + "num_tokens": 14076949542.0, + "step": 3368 + }, + { + "epoch": 0.40035650623885916, + "grad_norm": 0.6724590020344646, + "learning_rate": 1.8668213334731917e-05, + "loss": 0.9029, + "num_tokens": 14081138954.0, + "step": 3369 + }, + { + "epoch": 0.40047534165181226, + "grad_norm": 0.4787679826347071, + "learning_rate": 1.8667306503346544e-05, + "loss": 0.8718, + "num_tokens": 14085327876.0, + "step": 3370 + }, + { + "epoch": 0.4005941770647653, + "grad_norm": 0.7629192364624736, + "learning_rate": 1.8666399388014892e-05, + "loss": 0.8883, + "num_tokens": 14089515595.0, + "step": 3371 + }, + { + "epoch": 0.40071301247771834, + "grad_norm": 0.5479075606893513, + "learning_rate": 1.8665491988770543e-05, + "loss": 0.8798, + "num_tokens": 14093681657.0, + "step": 3372 + }, + { + "epoch": 0.40083184789067144, + "grad_norm": 0.7577632155398097, + "learning_rate": 1.8664584305647113e-05, + "loss": 0.8943, + "num_tokens": 14097849039.0, + "step": 3373 + }, + { + "epoch": 0.4009506833036245, + "grad_norm": 0.605641930463314, + "learning_rate": 1.866367633867821e-05, + "loss": 0.8752, + "num_tokens": 14102016621.0, + "step": 3374 + }, + { + "epoch": 0.40106951871657753, + "grad_norm": 0.6704038881456332, + "learning_rate": 1.866276808789746e-05, + "loss": 0.8661, + "num_tokens": 14106165361.0, + "step": 3375 + }, + { + "epoch": 0.4011883541295306, + "grad_norm": 0.6252129001111683, + "learning_rate": 1.86618595533385e-05, + "loss": 0.8828, + "num_tokens": 14110323277.0, + "step": 3376 + }, + { + "epoch": 0.4013071895424837, + "grad_norm": 0.5669050912682951, + "learning_rate": 1.8660950735034974e-05, + "loss": 0.8851, + "num_tokens": 14114507363.0, + "step": 3377 + }, + { + "epoch": 0.4014260249554367, + "grad_norm": 0.6251267683916762, + "learning_rate": 1.8660041633020542e-05, + "loss": 0.8441, + "num_tokens": 14118667407.0, + "step": 3378 + }, + { + "epoch": 0.40154486036838977, + "grad_norm": 0.5979725449024242, + "learning_rate": 1.8659132247328867e-05, + "loss": 0.8745, + "num_tokens": 14122825511.0, + "step": 3379 + }, + { + "epoch": 0.40166369578134287, + "grad_norm": 0.5112412810326668, + "learning_rate": 1.865822257799363e-05, + "loss": 0.8857, + "num_tokens": 14126983113.0, + "step": 3380 + }, + { + "epoch": 0.4017825311942959, + "grad_norm": 0.5792974955328386, + "learning_rate": 1.865731262504852e-05, + "loss": 0.8565, + "num_tokens": 14131171767.0, + "step": 3381 + }, + { + "epoch": 0.40190136660724896, + "grad_norm": 0.511001621177079, + "learning_rate": 1.8656402388527232e-05, + "loss": 0.8769, + "num_tokens": 14135356821.0, + "step": 3382 + }, + { + "epoch": 0.402020202020202, + "grad_norm": 0.4950839034497305, + "learning_rate": 1.8655491868463474e-05, + "loss": 0.8897, + "num_tokens": 14139546075.0, + "step": 3383 + }, + { + "epoch": 0.4021390374331551, + "grad_norm": 0.5215874666350069, + "learning_rate": 1.865458106489097e-05, + "loss": 0.9027, + "num_tokens": 14143735325.0, + "step": 3384 + }, + { + "epoch": 0.40225787284610814, + "grad_norm": 0.5441146292020836, + "learning_rate": 1.865366997784345e-05, + "loss": 0.8612, + "num_tokens": 14147923650.0, + "step": 3385 + }, + { + "epoch": 0.4023767082590612, + "grad_norm": 0.4847340987539439, + "learning_rate": 1.865275860735465e-05, + "loss": 0.871, + "num_tokens": 14152083084.0, + "step": 3386 + }, + { + "epoch": 0.40249554367201423, + "grad_norm": 0.577576781515766, + "learning_rate": 1.8651846953458326e-05, + "loss": 0.8716, + "num_tokens": 14156271814.0, + "step": 3387 + }, + { + "epoch": 0.40261437908496733, + "grad_norm": 0.506896778494895, + "learning_rate": 1.865093501618824e-05, + "loss": 0.8891, + "num_tokens": 14160436624.0, + "step": 3388 + }, + { + "epoch": 0.4027332144979204, + "grad_norm": 0.5813860085372627, + "learning_rate": 1.865002279557816e-05, + "loss": 0.8745, + "num_tokens": 14164598596.0, + "step": 3389 + }, + { + "epoch": 0.4028520499108734, + "grad_norm": 0.45095753342304506, + "learning_rate": 1.8649110291661868e-05, + "loss": 0.8933, + "num_tokens": 14168741163.0, + "step": 3390 + }, + { + "epoch": 0.4029708853238265, + "grad_norm": 0.5435420456263269, + "learning_rate": 1.8648197504473163e-05, + "loss": 0.9029, + "num_tokens": 14172919024.0, + "step": 3391 + }, + { + "epoch": 0.40308972073677957, + "grad_norm": 0.43887902269520523, + "learning_rate": 1.864728443404584e-05, + "loss": 0.8708, + "num_tokens": 14177103840.0, + "step": 3392 + }, + { + "epoch": 0.4032085561497326, + "grad_norm": 0.5904833280865168, + "learning_rate": 1.864637108041373e-05, + "loss": 0.854, + "num_tokens": 14181281577.0, + "step": 3393 + }, + { + "epoch": 0.40332739156268566, + "grad_norm": 0.45084519626798214, + "learning_rate": 1.8645457443610638e-05, + "loss": 0.9115, + "num_tokens": 14185441524.0, + "step": 3394 + }, + { + "epoch": 0.40344622697563876, + "grad_norm": 0.5913006490227632, + "learning_rate": 1.8644543523670403e-05, + "loss": 0.852, + "num_tokens": 14189629062.0, + "step": 3395 + }, + { + "epoch": 0.4035650623885918, + "grad_norm": 0.5254042850329689, + "learning_rate": 1.8643629320626878e-05, + "loss": 0.8562, + "num_tokens": 14193819822.0, + "step": 3396 + }, + { + "epoch": 0.40368389780154484, + "grad_norm": 0.5249409352524381, + "learning_rate": 1.864271483451392e-05, + "loss": 0.8871, + "num_tokens": 14198006676.0, + "step": 3397 + }, + { + "epoch": 0.40380273321449794, + "grad_norm": 0.5443241172437832, + "learning_rate": 1.8641800065365385e-05, + "loss": 0.8975, + "num_tokens": 14202194455.0, + "step": 3398 + }, + { + "epoch": 0.403921568627451, + "grad_norm": 0.5276229515016365, + "learning_rate": 1.864088501321516e-05, + "loss": 0.8948, + "num_tokens": 14206383019.0, + "step": 3399 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.5262363415203791, + "learning_rate": 1.8639969678097125e-05, + "loss": 0.9269, + "num_tokens": 14210571964.0, + "step": 3400 + }, + { + "epoch": 0.4041592394533571, + "grad_norm": 0.5426403614484568, + "learning_rate": 1.8639054060045186e-05, + "loss": 0.8582, + "num_tokens": 14214761994.0, + "step": 3401 + }, + { + "epoch": 0.4042780748663102, + "grad_norm": 0.5092541994770466, + "learning_rate": 1.8638138159093243e-05, + "loss": 0.9142, + "num_tokens": 14218951255.0, + "step": 3402 + }, + { + "epoch": 0.4043969102792632, + "grad_norm": 0.5892711530472986, + "learning_rate": 1.8637221975275222e-05, + "loss": 0.8555, + "num_tokens": 14223116885.0, + "step": 3403 + }, + { + "epoch": 0.40451574569221627, + "grad_norm": 0.5215346630843769, + "learning_rate": 1.8636305508625047e-05, + "loss": 0.8821, + "num_tokens": 14227305642.0, + "step": 3404 + }, + { + "epoch": 0.40463458110516937, + "grad_norm": 0.5156507786523467, + "learning_rate": 1.863538875917666e-05, + "loss": 0.9124, + "num_tokens": 14231492501.0, + "step": 3405 + }, + { + "epoch": 0.4047534165181224, + "grad_norm": 0.48542463786793283, + "learning_rate": 1.863447172696401e-05, + "loss": 0.8912, + "num_tokens": 14235672192.0, + "step": 3406 + }, + { + "epoch": 0.40487225193107546, + "grad_norm": 0.5084159726664869, + "learning_rate": 1.8633554412021062e-05, + "loss": 0.8559, + "num_tokens": 14239849986.0, + "step": 3407 + }, + { + "epoch": 0.4049910873440285, + "grad_norm": 0.5122221921555222, + "learning_rate": 1.8632636814381782e-05, + "loss": 0.8329, + "num_tokens": 14244038431.0, + "step": 3408 + }, + { + "epoch": 0.4051099227569816, + "grad_norm": 0.5050183655315841, + "learning_rate": 1.8631718934080157e-05, + "loss": 0.9374, + "num_tokens": 14248229032.0, + "step": 3409 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 0.5866751454139253, + "learning_rate": 1.8630800771150174e-05, + "loss": 0.915, + "num_tokens": 14252385458.0, + "step": 3410 + }, + { + "epoch": 0.4053475935828877, + "grad_norm": 0.5026846667748992, + "learning_rate": 1.862988232562584e-05, + "loss": 0.9496, + "num_tokens": 14256574052.0, + "step": 3411 + }, + { + "epoch": 0.40546642899584073, + "grad_norm": 0.5266189867658005, + "learning_rate": 1.8628963597541164e-05, + "loss": 0.887, + "num_tokens": 14260763625.0, + "step": 3412 + }, + { + "epoch": 0.40558526440879383, + "grad_norm": 0.4915296046474643, + "learning_rate": 1.8628044586930175e-05, + "loss": 0.9048, + "num_tokens": 14264952024.0, + "step": 3413 + }, + { + "epoch": 0.4057040998217469, + "grad_norm": 0.5076257866783036, + "learning_rate": 1.86271252938269e-05, + "loss": 0.8878, + "num_tokens": 14269114329.0, + "step": 3414 + }, + { + "epoch": 0.4058229352346999, + "grad_norm": 0.5782229022376196, + "learning_rate": 1.862620571826539e-05, + "loss": 0.9122, + "num_tokens": 14273270554.0, + "step": 3415 + }, + { + "epoch": 0.405941770647653, + "grad_norm": 0.52719540563388, + "learning_rate": 1.8625285860279697e-05, + "loss": 0.8688, + "num_tokens": 14277458446.0, + "step": 3416 + }, + { + "epoch": 0.40606060606060607, + "grad_norm": 0.48722914783778065, + "learning_rate": 1.8624365719903888e-05, + "loss": 0.8591, + "num_tokens": 14281647325.0, + "step": 3417 + }, + { + "epoch": 0.4061794414735591, + "grad_norm": 0.5027604996958508, + "learning_rate": 1.8623445297172038e-05, + "loss": 0.8422, + "num_tokens": 14285831007.0, + "step": 3418 + }, + { + "epoch": 0.40629827688651216, + "grad_norm": 0.6437329442662987, + "learning_rate": 1.862252459211823e-05, + "loss": 0.874, + "num_tokens": 14289983655.0, + "step": 3419 + }, + { + "epoch": 0.40641711229946526, + "grad_norm": 0.5766484776973275, + "learning_rate": 1.8621603604776567e-05, + "loss": 0.8831, + "num_tokens": 14294130997.0, + "step": 3420 + }, + { + "epoch": 0.4065359477124183, + "grad_norm": 0.5388659286737267, + "learning_rate": 1.862068233518115e-05, + "loss": 0.879, + "num_tokens": 14298292137.0, + "step": 3421 + }, + { + "epoch": 0.40665478312537134, + "grad_norm": 0.553065411029574, + "learning_rate": 1.8619760783366103e-05, + "loss": 0.8704, + "num_tokens": 14302481602.0, + "step": 3422 + }, + { + "epoch": 0.40677361853832444, + "grad_norm": 0.5022695203561983, + "learning_rate": 1.8618838949365548e-05, + "loss": 0.8516, + "num_tokens": 14306670637.0, + "step": 3423 + }, + { + "epoch": 0.4068924539512775, + "grad_norm": 0.5655642601452585, + "learning_rate": 1.8617916833213632e-05, + "loss": 0.8957, + "num_tokens": 14310848904.0, + "step": 3424 + }, + { + "epoch": 0.40701128936423053, + "grad_norm": 0.4441284261102043, + "learning_rate": 1.8616994434944498e-05, + "loss": 0.8871, + "num_tokens": 14314997002.0, + "step": 3425 + }, + { + "epoch": 0.4071301247771836, + "grad_norm": 0.6121530331467936, + "learning_rate": 1.8616071754592305e-05, + "loss": 0.8778, + "num_tokens": 14319154296.0, + "step": 3426 + }, + { + "epoch": 0.4072489601901367, + "grad_norm": 0.4468902195515488, + "learning_rate": 1.861514879219122e-05, + "loss": 0.865, + "num_tokens": 14323342905.0, + "step": 3427 + }, + { + "epoch": 0.4073677956030897, + "grad_norm": 0.5658774980539768, + "learning_rate": 1.861422554777543e-05, + "loss": 0.8689, + "num_tokens": 14327478932.0, + "step": 3428 + }, + { + "epoch": 0.40748663101604277, + "grad_norm": 0.5976445087987492, + "learning_rate": 1.8613302021379127e-05, + "loss": 0.8738, + "num_tokens": 14331669061.0, + "step": 3429 + }, + { + "epoch": 0.40760546642899587, + "grad_norm": 0.47575389409608393, + "learning_rate": 1.861237821303651e-05, + "loss": 0.8716, + "num_tokens": 14335853364.0, + "step": 3430 + }, + { + "epoch": 0.4077243018419489, + "grad_norm": 0.57660543237999, + "learning_rate": 1.8611454122781783e-05, + "loss": 0.8736, + "num_tokens": 14340043193.0, + "step": 3431 + }, + { + "epoch": 0.40784313725490196, + "grad_norm": 0.48030966518981916, + "learning_rate": 1.8610529750649183e-05, + "loss": 0.8803, + "num_tokens": 14344231088.0, + "step": 3432 + }, + { + "epoch": 0.407961972667855, + "grad_norm": 0.43372838517960827, + "learning_rate": 1.860960509667293e-05, + "loss": 0.9043, + "num_tokens": 14348421162.0, + "step": 3433 + }, + { + "epoch": 0.4080808080808081, + "grad_norm": 0.5263834618321294, + "learning_rate": 1.860868016088727e-05, + "loss": 0.9063, + "num_tokens": 14352610875.0, + "step": 3434 + }, + { + "epoch": 0.40819964349376114, + "grad_norm": 0.5472117872155051, + "learning_rate": 1.860775494332646e-05, + "loss": 0.8711, + "num_tokens": 14356769883.0, + "step": 3435 + }, + { + "epoch": 0.4083184789067142, + "grad_norm": 0.5806272313036914, + "learning_rate": 1.860682944402476e-05, + "loss": 0.8793, + "num_tokens": 14360960199.0, + "step": 3436 + }, + { + "epoch": 0.4084373143196673, + "grad_norm": 0.4703940937852164, + "learning_rate": 1.860590366301645e-05, + "loss": 0.8958, + "num_tokens": 14365150447.0, + "step": 3437 + }, + { + "epoch": 0.40855614973262033, + "grad_norm": 0.5844505032211059, + "learning_rate": 1.860497760033581e-05, + "loss": 0.8942, + "num_tokens": 14369316259.0, + "step": 3438 + }, + { + "epoch": 0.4086749851455734, + "grad_norm": 0.43085041875447594, + "learning_rate": 1.8604051256017134e-05, + "loss": 0.8764, + "num_tokens": 14373475818.0, + "step": 3439 + }, + { + "epoch": 0.4087938205585264, + "grad_norm": 0.6237015686922052, + "learning_rate": 1.8603124630094735e-05, + "loss": 0.8672, + "num_tokens": 14377664056.0, + "step": 3440 + }, + { + "epoch": 0.4089126559714795, + "grad_norm": 0.4329567212225277, + "learning_rate": 1.860219772260292e-05, + "loss": 0.8802, + "num_tokens": 14381844958.0, + "step": 3441 + }, + { + "epoch": 0.40903149138443257, + "grad_norm": 0.5821279636349702, + "learning_rate": 1.860127053357602e-05, + "loss": 0.8768, + "num_tokens": 14386033886.0, + "step": 3442 + }, + { + "epoch": 0.4091503267973856, + "grad_norm": 0.4833296394265366, + "learning_rate": 1.8600343063048375e-05, + "loss": 0.8874, + "num_tokens": 14390207092.0, + "step": 3443 + }, + { + "epoch": 0.40926916221033866, + "grad_norm": 0.5326041652284765, + "learning_rate": 1.8599415311054325e-05, + "loss": 0.9124, + "num_tokens": 14394396361.0, + "step": 3444 + }, + { + "epoch": 0.40938799762329176, + "grad_norm": 0.4724782641451045, + "learning_rate": 1.859848727762823e-05, + "loss": 0.8819, + "num_tokens": 14398569959.0, + "step": 3445 + }, + { + "epoch": 0.4095068330362448, + "grad_norm": 0.5628274096829821, + "learning_rate": 1.859755896280447e-05, + "loss": 0.8782, + "num_tokens": 14402755444.0, + "step": 3446 + }, + { + "epoch": 0.40962566844919784, + "grad_norm": 0.502791394921955, + "learning_rate": 1.8596630366617404e-05, + "loss": 0.8997, + "num_tokens": 14406943456.0, + "step": 3447 + }, + { + "epoch": 0.40974450386215094, + "grad_norm": 0.5256893132422783, + "learning_rate": 1.8595701489101433e-05, + "loss": 0.9039, + "num_tokens": 14411132417.0, + "step": 3448 + }, + { + "epoch": 0.409863339275104, + "grad_norm": 0.6234227162341484, + "learning_rate": 1.8594772330290957e-05, + "loss": 0.9178, + "num_tokens": 14415319961.0, + "step": 3449 + }, + { + "epoch": 0.40998217468805703, + "grad_norm": 0.45269382479361975, + "learning_rate": 1.8593842890220382e-05, + "loss": 0.8792, + "num_tokens": 14419509063.0, + "step": 3450 + }, + { + "epoch": 0.4101010101010101, + "grad_norm": 0.5164502217338901, + "learning_rate": 1.8592913168924128e-05, + "loss": 0.8918, + "num_tokens": 14423697456.0, + "step": 3451 + }, + { + "epoch": 0.4102198455139632, + "grad_norm": 0.5054043070294585, + "learning_rate": 1.8591983166436625e-05, + "loss": 0.8929, + "num_tokens": 14427886417.0, + "step": 3452 + }, + { + "epoch": 0.4103386809269162, + "grad_norm": 0.5384025639367412, + "learning_rate": 1.859105288279232e-05, + "loss": 0.9189, + "num_tokens": 14432076103.0, + "step": 3453 + }, + { + "epoch": 0.41045751633986927, + "grad_norm": 0.45332989633783777, + "learning_rate": 1.859012231802566e-05, + "loss": 0.8985, + "num_tokens": 14436251114.0, + "step": 3454 + }, + { + "epoch": 0.41057635175282237, + "grad_norm": 0.5861803759234329, + "learning_rate": 1.8589191472171107e-05, + "loss": 0.9195, + "num_tokens": 14440421546.0, + "step": 3455 + }, + { + "epoch": 0.4106951871657754, + "grad_norm": 0.40307151330292706, + "learning_rate": 1.8588260345263135e-05, + "loss": 0.8982, + "num_tokens": 14444590594.0, + "step": 3456 + }, + { + "epoch": 0.41081402257872845, + "grad_norm": 0.4936588051214782, + "learning_rate": 1.8587328937336225e-05, + "loss": 0.8426, + "num_tokens": 14448779431.0, + "step": 3457 + }, + { + "epoch": 0.4109328579916815, + "grad_norm": 0.4862161339315269, + "learning_rate": 1.858639724842487e-05, + "loss": 0.9017, + "num_tokens": 14452966826.0, + "step": 3458 + }, + { + "epoch": 0.4110516934046346, + "grad_norm": 0.4703593984427462, + "learning_rate": 1.8585465278563577e-05, + "loss": 0.8734, + "num_tokens": 14457124841.0, + "step": 3459 + }, + { + "epoch": 0.41117052881758764, + "grad_norm": 0.524084413038606, + "learning_rate": 1.8584533027786857e-05, + "loss": 0.8794, + "num_tokens": 14461257963.0, + "step": 3460 + }, + { + "epoch": 0.4112893642305407, + "grad_norm": 0.488739781976491, + "learning_rate": 1.8583600496129233e-05, + "loss": 0.8381, + "num_tokens": 14465446757.0, + "step": 3461 + }, + { + "epoch": 0.4114081996434938, + "grad_norm": 0.5271843347581494, + "learning_rate": 1.8582667683625246e-05, + "loss": 0.8888, + "num_tokens": 14469636881.0, + "step": 3462 + }, + { + "epoch": 0.41152703505644683, + "grad_norm": 0.4908911010343783, + "learning_rate": 1.8581734590309436e-05, + "loss": 0.8855, + "num_tokens": 14473826409.0, + "step": 3463 + }, + { + "epoch": 0.4116458704693999, + "grad_norm": 0.5169620798167011, + "learning_rate": 1.8580801216216357e-05, + "loss": 0.8558, + "num_tokens": 14478014548.0, + "step": 3464 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.4907316523873391, + "learning_rate": 1.8579867561380577e-05, + "loss": 0.8402, + "num_tokens": 14482191272.0, + "step": 3465 + }, + { + "epoch": 0.411883541295306, + "grad_norm": 0.4720271576048648, + "learning_rate": 1.8578933625836676e-05, + "loss": 0.8986, + "num_tokens": 14486380145.0, + "step": 3466 + }, + { + "epoch": 0.41200237670825907, + "grad_norm": 0.5397311740809839, + "learning_rate": 1.857799940961924e-05, + "loss": 0.8992, + "num_tokens": 14490554329.0, + "step": 3467 + }, + { + "epoch": 0.4121212121212121, + "grad_norm": 0.5424311534245224, + "learning_rate": 1.8577064912762857e-05, + "loss": 0.8533, + "num_tokens": 14494744872.0, + "step": 3468 + }, + { + "epoch": 0.41224004753416515, + "grad_norm": 0.4819246514536221, + "learning_rate": 1.8576130135302148e-05, + "loss": 0.8824, + "num_tokens": 14498919201.0, + "step": 3469 + }, + { + "epoch": 0.41235888294711825, + "grad_norm": 0.6782966195746973, + "learning_rate": 1.857519507727172e-05, + "loss": 0.8762, + "num_tokens": 14503103836.0, + "step": 3470 + }, + { + "epoch": 0.4124777183600713, + "grad_norm": 0.4396547766922426, + "learning_rate": 1.857425973870621e-05, + "loss": 0.9077, + "num_tokens": 14507273947.0, + "step": 3471 + }, + { + "epoch": 0.41259655377302434, + "grad_norm": 0.5337548025618103, + "learning_rate": 1.857332411964025e-05, + "loss": 0.8742, + "num_tokens": 14511462417.0, + "step": 3472 + }, + { + "epoch": 0.41271538918597744, + "grad_norm": 0.5519053789137822, + "learning_rate": 1.8572388220108493e-05, + "loss": 0.8817, + "num_tokens": 14515623632.0, + "step": 3473 + }, + { + "epoch": 0.4128342245989305, + "grad_norm": 0.4848829305295932, + "learning_rate": 1.8571452040145596e-05, + "loss": 0.8825, + "num_tokens": 14519812022.0, + "step": 3474 + }, + { + "epoch": 0.41295306001188353, + "grad_norm": 0.5764406263716092, + "learning_rate": 1.857051557978623e-05, + "loss": 0.8342, + "num_tokens": 14524002326.0, + "step": 3475 + }, + { + "epoch": 0.4130718954248366, + "grad_norm": 0.5758071858409503, + "learning_rate": 1.856957883906508e-05, + "loss": 0.8938, + "num_tokens": 14528189348.0, + "step": 3476 + }, + { + "epoch": 0.4131907308377897, + "grad_norm": 0.46438535757983285, + "learning_rate": 1.8568641818016827e-05, + "loss": 0.8765, + "num_tokens": 14532379562.0, + "step": 3477 + }, + { + "epoch": 0.4133095662507427, + "grad_norm": 0.6075168662575682, + "learning_rate": 1.856770451667618e-05, + "loss": 0.8855, + "num_tokens": 14536555103.0, + "step": 3478 + }, + { + "epoch": 0.41342840166369577, + "grad_norm": 0.4392152667417174, + "learning_rate": 1.8566766935077848e-05, + "loss": 0.8855, + "num_tokens": 14540745241.0, + "step": 3479 + }, + { + "epoch": 0.41354723707664887, + "grad_norm": 0.5862171044093459, + "learning_rate": 1.8565829073256553e-05, + "loss": 0.8783, + "num_tokens": 14544934684.0, + "step": 3480 + }, + { + "epoch": 0.4136660724896019, + "grad_norm": 0.4242273784248124, + "learning_rate": 1.8564890931247025e-05, + "loss": 0.8699, + "num_tokens": 14549076063.0, + "step": 3481 + }, + { + "epoch": 0.41378490790255495, + "grad_norm": 0.6561414277140801, + "learning_rate": 1.8563952509084005e-05, + "loss": 0.8493, + "num_tokens": 14553231858.0, + "step": 3482 + }, + { + "epoch": 0.413903743315508, + "grad_norm": 0.47928825027026734, + "learning_rate": 1.8563013806802257e-05, + "loss": 0.9105, + "num_tokens": 14557420202.0, + "step": 3483 + }, + { + "epoch": 0.4140225787284611, + "grad_norm": 0.5125083736718348, + "learning_rate": 1.8562074824436533e-05, + "loss": 0.9017, + "num_tokens": 14561550623.0, + "step": 3484 + }, + { + "epoch": 0.41414141414141414, + "grad_norm": 0.463815679111862, + "learning_rate": 1.8561135562021607e-05, + "loss": 0.8554, + "num_tokens": 14565727480.0, + "step": 3485 + }, + { + "epoch": 0.4142602495543672, + "grad_norm": 0.5690875541976038, + "learning_rate": 1.856019601959227e-05, + "loss": 0.8961, + "num_tokens": 14569917027.0, + "step": 3486 + }, + { + "epoch": 0.4143790849673203, + "grad_norm": 0.522237491971227, + "learning_rate": 1.8559256197183314e-05, + "loss": 0.885, + "num_tokens": 14574107104.0, + "step": 3487 + }, + { + "epoch": 0.41449792038027333, + "grad_norm": 0.4518235310665042, + "learning_rate": 1.855831609482954e-05, + "loss": 0.8609, + "num_tokens": 14578292925.0, + "step": 3488 + }, + { + "epoch": 0.4146167557932264, + "grad_norm": 0.4968633318492688, + "learning_rate": 1.8557375712565767e-05, + "loss": 0.8832, + "num_tokens": 14582451128.0, + "step": 3489 + }, + { + "epoch": 0.4147355912061794, + "grad_norm": 0.4795668874289462, + "learning_rate": 1.855643505042682e-05, + "loss": 0.8704, + "num_tokens": 14586641567.0, + "step": 3490 + }, + { + "epoch": 0.4148544266191325, + "grad_norm": 0.5755601847534572, + "learning_rate": 1.8555494108447528e-05, + "loss": 0.8647, + "num_tokens": 14590767373.0, + "step": 3491 + }, + { + "epoch": 0.41497326203208557, + "grad_norm": 0.5054393383762906, + "learning_rate": 1.8554552886662748e-05, + "loss": 0.8961, + "num_tokens": 14594954773.0, + "step": 3492 + }, + { + "epoch": 0.4150920974450386, + "grad_norm": 0.5539021010778535, + "learning_rate": 1.8553611385107336e-05, + "loss": 0.9087, + "num_tokens": 14599130875.0, + "step": 3493 + }, + { + "epoch": 0.41521093285799165, + "grad_norm": 0.47511035786405886, + "learning_rate": 1.8552669603816152e-05, + "loss": 0.8412, + "num_tokens": 14603320612.0, + "step": 3494 + }, + { + "epoch": 0.41532976827094475, + "grad_norm": 0.5573690683094159, + "learning_rate": 1.855172754282407e-05, + "loss": 0.8731, + "num_tokens": 14607498295.0, + "step": 3495 + }, + { + "epoch": 0.4154486036838978, + "grad_norm": 0.5562885506178165, + "learning_rate": 1.8550785202165993e-05, + "loss": 0.8794, + "num_tokens": 14611686297.0, + "step": 3496 + }, + { + "epoch": 0.41556743909685084, + "grad_norm": 0.5974525491541531, + "learning_rate": 1.8549842581876806e-05, + "loss": 0.8693, + "num_tokens": 14615875213.0, + "step": 3497 + }, + { + "epoch": 0.41568627450980394, + "grad_norm": 0.3869810655629719, + "learning_rate": 1.8548899681991422e-05, + "loss": 0.8574, + "num_tokens": 14620064764.0, + "step": 3498 + }, + { + "epoch": 0.415805109922757, + "grad_norm": 0.6564212803090842, + "learning_rate": 1.854795650254476e-05, + "loss": 0.8901, + "num_tokens": 14624252225.0, + "step": 3499 + }, + { + "epoch": 0.41592394533571003, + "grad_norm": 0.5583803231932811, + "learning_rate": 1.854701304357175e-05, + "loss": 0.8679, + "num_tokens": 14628442015.0, + "step": 3500 + }, + { + "epoch": 0.4160427807486631, + "grad_norm": 0.5082074290887667, + "learning_rate": 1.854606930510733e-05, + "loss": 0.8661, + "num_tokens": 14632629664.0, + "step": 3501 + }, + { + "epoch": 0.4161616161616162, + "grad_norm": 0.6313948138876658, + "learning_rate": 1.8545125287186444e-05, + "loss": 0.8893, + "num_tokens": 14636816949.0, + "step": 3502 + }, + { + "epoch": 0.4162804515745692, + "grad_norm": 0.5355660444609792, + "learning_rate": 1.8544180989844064e-05, + "loss": 0.8939, + "num_tokens": 14640994131.0, + "step": 3503 + }, + { + "epoch": 0.41639928698752227, + "grad_norm": 0.5339977077841142, + "learning_rate": 1.854323641311515e-05, + "loss": 0.8958, + "num_tokens": 14645183577.0, + "step": 3504 + }, + { + "epoch": 0.41651812240047537, + "grad_norm": 0.4699136142307648, + "learning_rate": 1.854229155703469e-05, + "loss": 0.8617, + "num_tokens": 14649373129.0, + "step": 3505 + }, + { + "epoch": 0.4166369578134284, + "grad_norm": 0.5036084925452962, + "learning_rate": 1.8541346421637674e-05, + "loss": 0.8835, + "num_tokens": 14653530470.0, + "step": 3506 + }, + { + "epoch": 0.41675579322638145, + "grad_norm": 0.5435319039883019, + "learning_rate": 1.85404010069591e-05, + "loss": 0.8868, + "num_tokens": 14657715792.0, + "step": 3507 + }, + { + "epoch": 0.4168746286393345, + "grad_norm": 0.5040783853362342, + "learning_rate": 1.8539455313033983e-05, + "loss": 0.8884, + "num_tokens": 14661889887.0, + "step": 3508 + }, + { + "epoch": 0.4169934640522876, + "grad_norm": 0.6082154336724902, + "learning_rate": 1.8538509339897344e-05, + "loss": 0.8741, + "num_tokens": 14666081165.0, + "step": 3509 + }, + { + "epoch": 0.41711229946524064, + "grad_norm": 0.4672861634879184, + "learning_rate": 1.8537563087584215e-05, + "loss": 0.9047, + "num_tokens": 14670268350.0, + "step": 3510 + }, + { + "epoch": 0.4172311348781937, + "grad_norm": 0.5846707539466628, + "learning_rate": 1.8536616556129638e-05, + "loss": 0.8936, + "num_tokens": 14674451984.0, + "step": 3511 + }, + { + "epoch": 0.4173499702911468, + "grad_norm": 0.4840253769934924, + "learning_rate": 1.853566974556867e-05, + "loss": 0.9002, + "num_tokens": 14678642326.0, + "step": 3512 + }, + { + "epoch": 0.41746880570409983, + "grad_norm": 0.4936677220589788, + "learning_rate": 1.8534722655936372e-05, + "loss": 0.8869, + "num_tokens": 14682831585.0, + "step": 3513 + }, + { + "epoch": 0.4175876411170529, + "grad_norm": 0.41707750556596285, + "learning_rate": 1.8533775287267823e-05, + "loss": 0.8615, + "num_tokens": 14686967602.0, + "step": 3514 + }, + { + "epoch": 0.4177064765300059, + "grad_norm": 0.4746436240710616, + "learning_rate": 1.85328276395981e-05, + "loss": 0.8994, + "num_tokens": 14691157034.0, + "step": 3515 + }, + { + "epoch": 0.417825311942959, + "grad_norm": 0.5064998855779913, + "learning_rate": 1.8531879712962296e-05, + "loss": 0.9215, + "num_tokens": 14695316496.0, + "step": 3516 + }, + { + "epoch": 0.41794414735591207, + "grad_norm": 0.5526643779084871, + "learning_rate": 1.8530931507395528e-05, + "loss": 0.8692, + "num_tokens": 14699505728.0, + "step": 3517 + }, + { + "epoch": 0.4180629827688651, + "grad_norm": 0.5787243288251492, + "learning_rate": 1.85299830229329e-05, + "loss": 0.8568, + "num_tokens": 14703675283.0, + "step": 3518 + }, + { + "epoch": 0.41818181818181815, + "grad_norm": 0.5022130896047146, + "learning_rate": 1.8529034259609538e-05, + "loss": 0.869, + "num_tokens": 14707823515.0, + "step": 3519 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.42691898377437854, + "learning_rate": 1.8528085217460583e-05, + "loss": 0.8669, + "num_tokens": 14712004370.0, + "step": 3520 + }, + { + "epoch": 0.4184194890077243, + "grad_norm": 0.60737739128034, + "learning_rate": 1.852713589652118e-05, + "loss": 0.917, + "num_tokens": 14716193944.0, + "step": 3521 + }, + { + "epoch": 0.41853832442067734, + "grad_norm": 0.439729597397037, + "learning_rate": 1.8526186296826488e-05, + "loss": 0.8642, + "num_tokens": 14720364732.0, + "step": 3522 + }, + { + "epoch": 0.41865715983363044, + "grad_norm": 0.4799381633496604, + "learning_rate": 1.852523641841167e-05, + "loss": 0.8854, + "num_tokens": 14724555291.0, + "step": 3523 + }, + { + "epoch": 0.4187759952465835, + "grad_norm": 0.5511763747296011, + "learning_rate": 1.85242862613119e-05, + "loss": 0.904, + "num_tokens": 14728733958.0, + "step": 3524 + }, + { + "epoch": 0.41889483065953653, + "grad_norm": 0.43950324779220984, + "learning_rate": 1.8523335825562368e-05, + "loss": 0.918, + "num_tokens": 14732902840.0, + "step": 3525 + }, + { + "epoch": 0.4190136660724896, + "grad_norm": 0.5780512151443313, + "learning_rate": 1.852238511119828e-05, + "loss": 0.8699, + "num_tokens": 14737055921.0, + "step": 3526 + }, + { + "epoch": 0.4191325014854427, + "grad_norm": 0.44831866260218944, + "learning_rate": 1.8521434118254834e-05, + "loss": 0.9297, + "num_tokens": 14741245623.0, + "step": 3527 + }, + { + "epoch": 0.4192513368983957, + "grad_norm": 0.43536522103647307, + "learning_rate": 1.852048284676725e-05, + "loss": 0.8842, + "num_tokens": 14745433600.0, + "step": 3528 + }, + { + "epoch": 0.41937017231134877, + "grad_norm": 0.4547882362447985, + "learning_rate": 1.8519531296770762e-05, + "loss": 0.91, + "num_tokens": 14749623981.0, + "step": 3529 + }, + { + "epoch": 0.41948900772430187, + "grad_norm": 0.5368347074831936, + "learning_rate": 1.8518579468300605e-05, + "loss": 0.8631, + "num_tokens": 14753807931.0, + "step": 3530 + }, + { + "epoch": 0.4196078431372549, + "grad_norm": 0.4653818242130039, + "learning_rate": 1.851762736139203e-05, + "loss": 0.861, + "num_tokens": 14757997582.0, + "step": 3531 + }, + { + "epoch": 0.41972667855020795, + "grad_norm": 0.47993165199496335, + "learning_rate": 1.8516674976080294e-05, + "loss": 0.8725, + "num_tokens": 14762187045.0, + "step": 3532 + }, + { + "epoch": 0.419845513963161, + "grad_norm": 0.5812980916095509, + "learning_rate": 1.8515722312400672e-05, + "loss": 0.8798, + "num_tokens": 14766375321.0, + "step": 3533 + }, + { + "epoch": 0.4199643493761141, + "grad_norm": 0.5442652251900941, + "learning_rate": 1.8514769370388442e-05, + "loss": 0.9466, + "num_tokens": 14770550807.0, + "step": 3534 + }, + { + "epoch": 0.42008318478906714, + "grad_norm": 0.4931088429700507, + "learning_rate": 1.8513816150078893e-05, + "loss": 0.8647, + "num_tokens": 14774740549.0, + "step": 3535 + }, + { + "epoch": 0.4202020202020202, + "grad_norm": 0.561982855218819, + "learning_rate": 1.8512862651507324e-05, + "loss": 0.8681, + "num_tokens": 14778917391.0, + "step": 3536 + }, + { + "epoch": 0.4203208556149733, + "grad_norm": 0.55781587938245, + "learning_rate": 1.8511908874709054e-05, + "loss": 0.8814, + "num_tokens": 14783106962.0, + "step": 3537 + }, + { + "epoch": 0.42043969102792633, + "grad_norm": 0.4754559545394622, + "learning_rate": 1.8510954819719398e-05, + "loss": 0.8714, + "num_tokens": 14787296702.0, + "step": 3538 + }, + { + "epoch": 0.4205585264408794, + "grad_norm": 0.45653773669205605, + "learning_rate": 1.851000048657369e-05, + "loss": 0.8714, + "num_tokens": 14791486989.0, + "step": 3539 + }, + { + "epoch": 0.4206773618538324, + "grad_norm": 0.597055848460088, + "learning_rate": 1.850904587530727e-05, + "loss": 0.8699, + "num_tokens": 14795675090.0, + "step": 3540 + }, + { + "epoch": 0.4207961972667855, + "grad_norm": 0.5663897178489442, + "learning_rate": 1.8508090985955494e-05, + "loss": 0.8624, + "num_tokens": 14799863801.0, + "step": 3541 + }, + { + "epoch": 0.42091503267973857, + "grad_norm": 0.41755587944068373, + "learning_rate": 1.8507135818553724e-05, + "loss": 0.8759, + "num_tokens": 14804048052.0, + "step": 3542 + }, + { + "epoch": 0.4210338680926916, + "grad_norm": 0.5561121300278741, + "learning_rate": 1.8506180373137333e-05, + "loss": 0.8687, + "num_tokens": 14808232975.0, + "step": 3543 + }, + { + "epoch": 0.42115270350564465, + "grad_norm": 0.5172348859337283, + "learning_rate": 1.8505224649741704e-05, + "loss": 0.8471, + "num_tokens": 14812410992.0, + "step": 3544 + }, + { + "epoch": 0.42127153891859775, + "grad_norm": 0.49043411719679425, + "learning_rate": 1.8504268648402228e-05, + "loss": 0.8672, + "num_tokens": 14816581789.0, + "step": 3545 + }, + { + "epoch": 0.4213903743315508, + "grad_norm": 0.5903587304328007, + "learning_rate": 1.8503312369154315e-05, + "loss": 0.9158, + "num_tokens": 14820770620.0, + "step": 3546 + }, + { + "epoch": 0.42150920974450384, + "grad_norm": 0.4345538346600398, + "learning_rate": 1.850235581203337e-05, + "loss": 0.9086, + "num_tokens": 14824959720.0, + "step": 3547 + }, + { + "epoch": 0.42162804515745694, + "grad_norm": 0.4283604654391698, + "learning_rate": 1.850139897707483e-05, + "loss": 0.8939, + "num_tokens": 14829149498.0, + "step": 3548 + }, + { + "epoch": 0.42174688057041, + "grad_norm": 0.43769327235322725, + "learning_rate": 1.850044186431412e-05, + "loss": 0.8618, + "num_tokens": 14833339888.0, + "step": 3549 + }, + { + "epoch": 0.42186571598336303, + "grad_norm": 0.4638333396967262, + "learning_rate": 1.8499484473786687e-05, + "loss": 0.9394, + "num_tokens": 14837474484.0, + "step": 3550 + }, + { + "epoch": 0.4219845513963161, + "grad_norm": 0.5264319876876283, + "learning_rate": 1.849852680552799e-05, + "loss": 0.878, + "num_tokens": 14841664742.0, + "step": 3551 + }, + { + "epoch": 0.4221033868092692, + "grad_norm": 0.583939790070577, + "learning_rate": 1.849756885957349e-05, + "loss": 0.8802, + "num_tokens": 14845833198.0, + "step": 3552 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.45325226849925904, + "learning_rate": 1.8496610635958668e-05, + "loss": 0.8837, + "num_tokens": 14850002539.0, + "step": 3553 + }, + { + "epoch": 0.42234105763517527, + "grad_norm": 0.5130415971455315, + "learning_rate": 1.8495652134719003e-05, + "loss": 0.8432, + "num_tokens": 14854192703.0, + "step": 3554 + }, + { + "epoch": 0.42245989304812837, + "grad_norm": 0.5082773168378977, + "learning_rate": 1.849469335589e-05, + "loss": 0.8698, + "num_tokens": 14858354870.0, + "step": 3555 + }, + { + "epoch": 0.4225787284610814, + "grad_norm": 0.508652210137134, + "learning_rate": 1.849373429950716e-05, + "loss": 0.9163, + "num_tokens": 14862512666.0, + "step": 3556 + }, + { + "epoch": 0.42269756387403445, + "grad_norm": 0.5831468740778714, + "learning_rate": 1.8492774965606002e-05, + "loss": 0.8788, + "num_tokens": 14866702370.0, + "step": 3557 + }, + { + "epoch": 0.4228163992869875, + "grad_norm": 0.4857977829620372, + "learning_rate": 1.8491815354222056e-05, + "loss": 0.8597, + "num_tokens": 14870871094.0, + "step": 3558 + }, + { + "epoch": 0.4229352346999406, + "grad_norm": 0.48376033882977354, + "learning_rate": 1.8490855465390855e-05, + "loss": 0.8655, + "num_tokens": 14875048542.0, + "step": 3559 + }, + { + "epoch": 0.42305407011289364, + "grad_norm": 0.48983890767667826, + "learning_rate": 1.8489895299147948e-05, + "loss": 0.8753, + "num_tokens": 14879223874.0, + "step": 3560 + }, + { + "epoch": 0.4231729055258467, + "grad_norm": 0.5340862925582867, + "learning_rate": 1.8488934855528892e-05, + "loss": 0.8786, + "num_tokens": 14883413036.0, + "step": 3561 + }, + { + "epoch": 0.4232917409387998, + "grad_norm": 0.47107484915152276, + "learning_rate": 1.848797413456926e-05, + "loss": 0.876, + "num_tokens": 14887602106.0, + "step": 3562 + }, + { + "epoch": 0.42341057635175283, + "grad_norm": 0.565972182034578, + "learning_rate": 1.8487013136304633e-05, + "loss": 0.9137, + "num_tokens": 14891791124.0, + "step": 3563 + }, + { + "epoch": 0.4235294117647059, + "grad_norm": 0.5434389870851256, + "learning_rate": 1.848605186077059e-05, + "loss": 0.8717, + "num_tokens": 14895943099.0, + "step": 3564 + }, + { + "epoch": 0.4236482471776589, + "grad_norm": 0.5791845475594714, + "learning_rate": 1.8485090308002734e-05, + "loss": 0.9159, + "num_tokens": 14900114697.0, + "step": 3565 + }, + { + "epoch": 0.423767082590612, + "grad_norm": 0.5056724542065439, + "learning_rate": 1.8484128478036682e-05, + "loss": 0.8995, + "num_tokens": 14904304647.0, + "step": 3566 + }, + { + "epoch": 0.42388591800356507, + "grad_norm": 0.48791027258216935, + "learning_rate": 1.8483166370908047e-05, + "loss": 0.8645, + "num_tokens": 14908493992.0, + "step": 3567 + }, + { + "epoch": 0.4240047534165181, + "grad_norm": 0.5044294938213405, + "learning_rate": 1.8482203986652456e-05, + "loss": 0.9425, + "num_tokens": 14912670616.0, + "step": 3568 + }, + { + "epoch": 0.4241235888294712, + "grad_norm": 0.5914478927801767, + "learning_rate": 1.8481241325305555e-05, + "loss": 0.8757, + "num_tokens": 14916860035.0, + "step": 3569 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 0.597619167269834, + "learning_rate": 1.8480278386902993e-05, + "loss": 0.877, + "num_tokens": 14921048518.0, + "step": 3570 + }, + { + "epoch": 0.4243612596553773, + "grad_norm": 0.47439937778769836, + "learning_rate": 1.8479315171480433e-05, + "loss": 0.8626, + "num_tokens": 14925212320.0, + "step": 3571 + }, + { + "epoch": 0.42448009506833034, + "grad_norm": 0.5424660508828998, + "learning_rate": 1.8478351679073546e-05, + "loss": 0.9311, + "num_tokens": 14929387466.0, + "step": 3572 + }, + { + "epoch": 0.42459893048128344, + "grad_norm": 0.45370343322685125, + "learning_rate": 1.8477387909718008e-05, + "loss": 0.858, + "num_tokens": 14933568816.0, + "step": 3573 + }, + { + "epoch": 0.4247177658942365, + "grad_norm": 0.5250055304457826, + "learning_rate": 1.8476423863449514e-05, + "loss": 0.9101, + "num_tokens": 14937757454.0, + "step": 3574 + }, + { + "epoch": 0.42483660130718953, + "grad_norm": 0.5815121915678333, + "learning_rate": 1.8475459540303764e-05, + "loss": 0.8391, + "num_tokens": 14941902498.0, + "step": 3575 + }, + { + "epoch": 0.4249554367201426, + "grad_norm": 0.5543280610570152, + "learning_rate": 1.8474494940316477e-05, + "loss": 0.8707, + "num_tokens": 14946092735.0, + "step": 3576 + }, + { + "epoch": 0.4250742721330957, + "grad_norm": 0.5362336665834868, + "learning_rate": 1.8473530063523365e-05, + "loss": 0.851, + "num_tokens": 14950265039.0, + "step": 3577 + }, + { + "epoch": 0.4251931075460487, + "grad_norm": 0.536743512398047, + "learning_rate": 1.8472564909960168e-05, + "loss": 0.8437, + "num_tokens": 14954455313.0, + "step": 3578 + }, + { + "epoch": 0.42531194295900177, + "grad_norm": 0.45491765614043134, + "learning_rate": 1.847159947966263e-05, + "loss": 0.8655, + "num_tokens": 14958644571.0, + "step": 3579 + }, + { + "epoch": 0.42543077837195487, + "grad_norm": 0.5392559300260056, + "learning_rate": 1.8470633772666496e-05, + "loss": 0.8992, + "num_tokens": 14962814565.0, + "step": 3580 + }, + { + "epoch": 0.4255496137849079, + "grad_norm": 0.6092717899307306, + "learning_rate": 1.8469667789007538e-05, + "loss": 0.8681, + "num_tokens": 14967004194.0, + "step": 3581 + }, + { + "epoch": 0.42566844919786095, + "grad_norm": 0.5353900312729347, + "learning_rate": 1.8468701528721528e-05, + "loss": 0.8884, + "num_tokens": 14971163784.0, + "step": 3582 + }, + { + "epoch": 0.425787284610814, + "grad_norm": 0.620127485533714, + "learning_rate": 1.8467734991844247e-05, + "loss": 0.8524, + "num_tokens": 14975331692.0, + "step": 3583 + }, + { + "epoch": 0.4259061200237671, + "grad_norm": 0.4497959370040303, + "learning_rate": 1.8466768178411488e-05, + "loss": 0.8932, + "num_tokens": 14979477154.0, + "step": 3584 + }, + { + "epoch": 0.42602495543672014, + "grad_norm": 0.6140037312046152, + "learning_rate": 1.8465801088459057e-05, + "loss": 0.914, + "num_tokens": 14983665901.0, + "step": 3585 + }, + { + "epoch": 0.4261437908496732, + "grad_norm": 0.5746143277581992, + "learning_rate": 1.8464833722022773e-05, + "loss": 0.8737, + "num_tokens": 14987853618.0, + "step": 3586 + }, + { + "epoch": 0.4262626262626263, + "grad_norm": 0.5743405935253683, + "learning_rate": 1.8463866079138456e-05, + "loss": 0.8607, + "num_tokens": 14992042093.0, + "step": 3587 + }, + { + "epoch": 0.42638146167557933, + "grad_norm": 0.4879404227596621, + "learning_rate": 1.846289815984194e-05, + "loss": 0.8718, + "num_tokens": 14996211178.0, + "step": 3588 + }, + { + "epoch": 0.4265002970885324, + "grad_norm": 0.5082465164352882, + "learning_rate": 1.8461929964169075e-05, + "loss": 0.8863, + "num_tokens": 15000400578.0, + "step": 3589 + }, + { + "epoch": 0.4266191325014854, + "grad_norm": 0.4833833358225661, + "learning_rate": 1.8460961492155712e-05, + "loss": 0.8941, + "num_tokens": 15004589860.0, + "step": 3590 + }, + { + "epoch": 0.4267379679144385, + "grad_norm": 0.6141583960963329, + "learning_rate": 1.845999274383772e-05, + "loss": 0.8767, + "num_tokens": 15008760267.0, + "step": 3591 + }, + { + "epoch": 0.42685680332739157, + "grad_norm": 0.4692521359126843, + "learning_rate": 1.8459023719250973e-05, + "loss": 0.8331, + "num_tokens": 15012951208.0, + "step": 3592 + }, + { + "epoch": 0.4269756387403446, + "grad_norm": 0.6058890029000206, + "learning_rate": 1.845805441843136e-05, + "loss": 0.8729, + "num_tokens": 15017132852.0, + "step": 3593 + }, + { + "epoch": 0.4270944741532977, + "grad_norm": 0.5166552909153035, + "learning_rate": 1.8457084841414774e-05, + "loss": 0.9243, + "num_tokens": 15021312729.0, + "step": 3594 + }, + { + "epoch": 0.42721330956625075, + "grad_norm": 0.41884759729955395, + "learning_rate": 1.845611498823712e-05, + "loss": 0.8607, + "num_tokens": 15025488154.0, + "step": 3595 + }, + { + "epoch": 0.4273321449792038, + "grad_norm": 0.514763394586817, + "learning_rate": 1.8455144858934325e-05, + "loss": 0.8812, + "num_tokens": 15029664333.0, + "step": 3596 + }, + { + "epoch": 0.42745098039215684, + "grad_norm": 0.48646656617507167, + "learning_rate": 1.8454174453542307e-05, + "loss": 0.8482, + "num_tokens": 15033854123.0, + "step": 3597 + }, + { + "epoch": 0.42756981580510994, + "grad_norm": 0.5732982094785342, + "learning_rate": 1.8453203772097003e-05, + "loss": 0.9061, + "num_tokens": 15038026616.0, + "step": 3598 + }, + { + "epoch": 0.427688651218063, + "grad_norm": 0.49927106758577133, + "learning_rate": 1.8452232814634367e-05, + "loss": 0.876, + "num_tokens": 15042215797.0, + "step": 3599 + }, + { + "epoch": 0.42780748663101603, + "grad_norm": 0.49571825863421076, + "learning_rate": 1.8451261581190353e-05, + "loss": 0.896, + "num_tokens": 15046394738.0, + "step": 3600 + }, + { + "epoch": 0.4279263220439691, + "grad_norm": 0.49961860287832405, + "learning_rate": 1.8450290071800928e-05, + "loss": 0.8888, + "num_tokens": 15050532450.0, + "step": 3601 + }, + { + "epoch": 0.4280451574569222, + "grad_norm": 0.49826090355036967, + "learning_rate": 1.8449318286502073e-05, + "loss": 0.8517, + "num_tokens": 15054722037.0, + "step": 3602 + }, + { + "epoch": 0.4281639928698752, + "grad_norm": 0.4948306701328999, + "learning_rate": 1.844834622532977e-05, + "loss": 0.8614, + "num_tokens": 15058882886.0, + "step": 3603 + }, + { + "epoch": 0.42828282828282827, + "grad_norm": 0.6846965488182836, + "learning_rate": 1.844737388832003e-05, + "loss": 0.9009, + "num_tokens": 15063070383.0, + "step": 3604 + }, + { + "epoch": 0.42840166369578137, + "grad_norm": 0.5246033581473739, + "learning_rate": 1.844640127550885e-05, + "loss": 0.8796, + "num_tokens": 15067259197.0, + "step": 3605 + }, + { + "epoch": 0.4285204991087344, + "grad_norm": 0.547015358591355, + "learning_rate": 1.8445428386932255e-05, + "loss": 0.8641, + "num_tokens": 15071448135.0, + "step": 3606 + }, + { + "epoch": 0.42863933452168745, + "grad_norm": 0.5913705750029, + "learning_rate": 1.8444455222626276e-05, + "loss": 0.8603, + "num_tokens": 15075635521.0, + "step": 3607 + }, + { + "epoch": 0.4287581699346405, + "grad_norm": 0.4813778347024982, + "learning_rate": 1.8443481782626946e-05, + "loss": 0.8496, + "num_tokens": 15079824445.0, + "step": 3608 + }, + { + "epoch": 0.4288770053475936, + "grad_norm": 0.545033113777661, + "learning_rate": 1.8442508066970318e-05, + "loss": 0.9038, + "num_tokens": 15084012630.0, + "step": 3609 + }, + { + "epoch": 0.42899584076054664, + "grad_norm": 0.5274280095651563, + "learning_rate": 1.844153407569246e-05, + "loss": 0.8505, + "num_tokens": 15088201435.0, + "step": 3610 + }, + { + "epoch": 0.4291146761734997, + "grad_norm": 0.47123302352702123, + "learning_rate": 1.8440559808829426e-05, + "loss": 0.9129, + "num_tokens": 15092391129.0, + "step": 3611 + }, + { + "epoch": 0.4292335115864528, + "grad_norm": 0.5010165185062146, + "learning_rate": 1.8439585266417308e-05, + "loss": 0.8973, + "num_tokens": 15096547748.0, + "step": 3612 + }, + { + "epoch": 0.42935234699940583, + "grad_norm": 0.4821791566717664, + "learning_rate": 1.8438610448492196e-05, + "loss": 0.8774, + "num_tokens": 15100719257.0, + "step": 3613 + }, + { + "epoch": 0.4294711824123589, + "grad_norm": 0.5896301710719443, + "learning_rate": 1.8437635355090184e-05, + "loss": 0.9115, + "num_tokens": 15104906036.0, + "step": 3614 + }, + { + "epoch": 0.4295900178253119, + "grad_norm": 0.44217001256500776, + "learning_rate": 1.8436659986247393e-05, + "loss": 0.9012, + "num_tokens": 15109091671.0, + "step": 3615 + }, + { + "epoch": 0.429708853238265, + "grad_norm": 0.571892321968431, + "learning_rate": 1.8435684341999937e-05, + "loss": 0.8766, + "num_tokens": 15113237626.0, + "step": 3616 + }, + { + "epoch": 0.42982768865121807, + "grad_norm": 0.4780363193188074, + "learning_rate": 1.8434708422383943e-05, + "loss": 0.8913, + "num_tokens": 15117422565.0, + "step": 3617 + }, + { + "epoch": 0.4299465240641711, + "grad_norm": 0.5816160306393784, + "learning_rate": 1.8433732227435567e-05, + "loss": 0.859, + "num_tokens": 15121612533.0, + "step": 3618 + }, + { + "epoch": 0.4300653594771242, + "grad_norm": 0.46318582521873836, + "learning_rate": 1.843275575719095e-05, + "loss": 0.882, + "num_tokens": 15125802205.0, + "step": 3619 + }, + { + "epoch": 0.43018419489007725, + "grad_norm": 0.5423167883023133, + "learning_rate": 1.8431779011686257e-05, + "loss": 0.8884, + "num_tokens": 15129989724.0, + "step": 3620 + }, + { + "epoch": 0.4303030303030303, + "grad_norm": 0.47677751447705724, + "learning_rate": 1.8430801990957658e-05, + "loss": 0.8929, + "num_tokens": 15134174760.0, + "step": 3621 + }, + { + "epoch": 0.43042186571598334, + "grad_norm": 0.5763834565696583, + "learning_rate": 1.8429824695041338e-05, + "loss": 0.9011, + "num_tokens": 15138364587.0, + "step": 3622 + }, + { + "epoch": 0.43054070112893644, + "grad_norm": 0.5031083644810369, + "learning_rate": 1.8428847123973488e-05, + "loss": 0.8693, + "num_tokens": 15142554562.0, + "step": 3623 + }, + { + "epoch": 0.4306595365418895, + "grad_norm": 0.47972166868416727, + "learning_rate": 1.8427869277790312e-05, + "loss": 0.8783, + "num_tokens": 15146744191.0, + "step": 3624 + }, + { + "epoch": 0.43077837195484253, + "grad_norm": 0.6469339754731347, + "learning_rate": 1.8426891156528023e-05, + "loss": 0.8763, + "num_tokens": 15150932971.0, + "step": 3625 + }, + { + "epoch": 0.4308972073677956, + "grad_norm": 0.49007450710030864, + "learning_rate": 1.8425912760222843e-05, + "loss": 0.8516, + "num_tokens": 15155086214.0, + "step": 3626 + }, + { + "epoch": 0.4310160427807487, + "grad_norm": 0.4927419403805133, + "learning_rate": 1.8424934088911006e-05, + "loss": 0.8684, + "num_tokens": 15159275115.0, + "step": 3627 + }, + { + "epoch": 0.4311348781937017, + "grad_norm": 0.5248088656321049, + "learning_rate": 1.8423955142628756e-05, + "loss": 0.8998, + "num_tokens": 15163453831.0, + "step": 3628 + }, + { + "epoch": 0.43125371360665476, + "grad_norm": 0.4933571384285576, + "learning_rate": 1.8422975921412348e-05, + "loss": 0.8784, + "num_tokens": 15167590318.0, + "step": 3629 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.4617010876720787, + "learning_rate": 1.842199642529804e-05, + "loss": 0.9038, + "num_tokens": 15171779162.0, + "step": 3630 + }, + { + "epoch": 0.4314913844325609, + "grad_norm": 0.5944360291319539, + "learning_rate": 1.842101665432211e-05, + "loss": 0.8814, + "num_tokens": 15175968609.0, + "step": 3631 + }, + { + "epoch": 0.43161021984551395, + "grad_norm": 0.5415567125539992, + "learning_rate": 1.8420036608520844e-05, + "loss": 0.8636, + "num_tokens": 15180157766.0, + "step": 3632 + }, + { + "epoch": 0.431729055258467, + "grad_norm": 0.49384768823779096, + "learning_rate": 1.841905628793054e-05, + "loss": 0.85, + "num_tokens": 15184337343.0, + "step": 3633 + }, + { + "epoch": 0.4318478906714201, + "grad_norm": 0.4719699388914426, + "learning_rate": 1.8418075692587488e-05, + "loss": 0.8738, + "num_tokens": 15188526776.0, + "step": 3634 + }, + { + "epoch": 0.43196672608437314, + "grad_norm": 0.6014015616783166, + "learning_rate": 1.8417094822528017e-05, + "loss": 0.8909, + "num_tokens": 15192695812.0, + "step": 3635 + }, + { + "epoch": 0.4320855614973262, + "grad_norm": 0.5055667773756163, + "learning_rate": 1.8416113677788443e-05, + "loss": 0.89, + "num_tokens": 15196885844.0, + "step": 3636 + }, + { + "epoch": 0.4322043969102793, + "grad_norm": 0.5193130931677239, + "learning_rate": 1.8415132258405107e-05, + "loss": 0.8807, + "num_tokens": 15201052240.0, + "step": 3637 + }, + { + "epoch": 0.43232323232323233, + "grad_norm": 0.49588132748940816, + "learning_rate": 1.8414150564414354e-05, + "loss": 0.8684, + "num_tokens": 15205239453.0, + "step": 3638 + }, + { + "epoch": 0.4324420677361854, + "grad_norm": 0.5198947121496978, + "learning_rate": 1.8413168595852538e-05, + "loss": 0.9215, + "num_tokens": 15209402209.0, + "step": 3639 + }, + { + "epoch": 0.4325609031491384, + "grad_norm": 0.5070467540673942, + "learning_rate": 1.8412186352756024e-05, + "loss": 0.8727, + "num_tokens": 15213575479.0, + "step": 3640 + }, + { + "epoch": 0.4326797385620915, + "grad_norm": 0.46865753978648134, + "learning_rate": 1.8411203835161183e-05, + "loss": 0.9137, + "num_tokens": 15217764051.0, + "step": 3641 + }, + { + "epoch": 0.43279857397504456, + "grad_norm": 0.4856614140029492, + "learning_rate": 1.8410221043104412e-05, + "loss": 0.9013, + "num_tokens": 15221893322.0, + "step": 3642 + }, + { + "epoch": 0.4329174093879976, + "grad_norm": 0.5262557293275106, + "learning_rate": 1.84092379766221e-05, + "loss": 0.901, + "num_tokens": 15226078881.0, + "step": 3643 + }, + { + "epoch": 0.4330362448009507, + "grad_norm": 0.5195789961506526, + "learning_rate": 1.8408254635750654e-05, + "loss": 0.9255, + "num_tokens": 15230267646.0, + "step": 3644 + }, + { + "epoch": 0.43315508021390375, + "grad_norm": 0.4671706088988373, + "learning_rate": 1.8407271020526488e-05, + "loss": 0.9102, + "num_tokens": 15234455904.0, + "step": 3645 + }, + { + "epoch": 0.4332739156268568, + "grad_norm": 0.5620816607123935, + "learning_rate": 1.8406287130986037e-05, + "loss": 0.8789, + "num_tokens": 15238631460.0, + "step": 3646 + }, + { + "epoch": 0.43339275103980984, + "grad_norm": 0.5101918304620942, + "learning_rate": 1.840530296716573e-05, + "loss": 0.8774, + "num_tokens": 15242820456.0, + "step": 3647 + }, + { + "epoch": 0.43351158645276294, + "grad_norm": 0.5118777795657266, + "learning_rate": 1.8404318529102016e-05, + "loss": 0.8408, + "num_tokens": 15247009257.0, + "step": 3648 + }, + { + "epoch": 0.433630421865716, + "grad_norm": 0.5038677227955212, + "learning_rate": 1.840333381683135e-05, + "loss": 0.8673, + "num_tokens": 15251176092.0, + "step": 3649 + }, + { + "epoch": 0.43374925727866903, + "grad_norm": 0.5048600393660967, + "learning_rate": 1.8402348830390206e-05, + "loss": 0.884, + "num_tokens": 15255364558.0, + "step": 3650 + }, + { + "epoch": 0.4338680926916221, + "grad_norm": 0.6702255095965814, + "learning_rate": 1.8401363569815057e-05, + "loss": 0.9089, + "num_tokens": 15259554029.0, + "step": 3651 + }, + { + "epoch": 0.4339869281045752, + "grad_norm": 0.4356372001274059, + "learning_rate": 1.8400378035142385e-05, + "loss": 0.9188, + "num_tokens": 15263735643.0, + "step": 3652 + }, + { + "epoch": 0.4341057635175282, + "grad_norm": 0.5008122640000248, + "learning_rate": 1.8399392226408695e-05, + "loss": 0.8888, + "num_tokens": 15267914862.0, + "step": 3653 + }, + { + "epoch": 0.43422459893048126, + "grad_norm": 0.6005494950780413, + "learning_rate": 1.83984061436505e-05, + "loss": 0.8866, + "num_tokens": 15272105032.0, + "step": 3654 + }, + { + "epoch": 0.43434343434343436, + "grad_norm": 0.47486337155746206, + "learning_rate": 1.83974197869043e-05, + "loss": 0.8528, + "num_tokens": 15276292073.0, + "step": 3655 + }, + { + "epoch": 0.4344622697563874, + "grad_norm": 0.5805508859157132, + "learning_rate": 1.839643315620664e-05, + "loss": 0.8741, + "num_tokens": 15280434732.0, + "step": 3656 + }, + { + "epoch": 0.43458110516934045, + "grad_norm": 0.4884908594487346, + "learning_rate": 1.839544625159405e-05, + "loss": 0.9052, + "num_tokens": 15284623295.0, + "step": 3657 + }, + { + "epoch": 0.4346999405822935, + "grad_norm": 0.6244701764589707, + "learning_rate": 1.8394459073103083e-05, + "loss": 0.8777, + "num_tokens": 15288812557.0, + "step": 3658 + }, + { + "epoch": 0.4348187759952466, + "grad_norm": 0.46671519483168267, + "learning_rate": 1.8393471620770294e-05, + "loss": 0.8561, + "num_tokens": 15292953188.0, + "step": 3659 + }, + { + "epoch": 0.43493761140819964, + "grad_norm": 0.4925359239506485, + "learning_rate": 1.8392483894632252e-05, + "loss": 0.9044, + "num_tokens": 15297131652.0, + "step": 3660 + }, + { + "epoch": 0.4350564468211527, + "grad_norm": 0.54218082514677, + "learning_rate": 1.8391495894725542e-05, + "loss": 0.847, + "num_tokens": 15301320793.0, + "step": 3661 + }, + { + "epoch": 0.4351752822341058, + "grad_norm": 0.6829286854586389, + "learning_rate": 1.839050762108674e-05, + "loss": 0.9198, + "num_tokens": 15305497600.0, + "step": 3662 + }, + { + "epoch": 0.43529411764705883, + "grad_norm": 0.4691295951950543, + "learning_rate": 1.838951907375246e-05, + "loss": 0.8513, + "num_tokens": 15309687816.0, + "step": 3663 + }, + { + "epoch": 0.4354129530600119, + "grad_norm": 0.5635121148240626, + "learning_rate": 1.83885302527593e-05, + "loss": 0.8783, + "num_tokens": 15313878011.0, + "step": 3664 + }, + { + "epoch": 0.4355317884729649, + "grad_norm": 0.5242935330539175, + "learning_rate": 1.8387541158143886e-05, + "loss": 0.8652, + "num_tokens": 15318067897.0, + "step": 3665 + }, + { + "epoch": 0.435650623885918, + "grad_norm": 0.4443290316923085, + "learning_rate": 1.8386551789942842e-05, + "loss": 0.8587, + "num_tokens": 15322256420.0, + "step": 3666 + }, + { + "epoch": 0.43576945929887106, + "grad_norm": 0.663387243061874, + "learning_rate": 1.8385562148192814e-05, + "loss": 0.9052, + "num_tokens": 15326416561.0, + "step": 3667 + }, + { + "epoch": 0.4358882947118241, + "grad_norm": 0.46034274133748126, + "learning_rate": 1.8384572232930446e-05, + "loss": 0.9007, + "num_tokens": 15330573084.0, + "step": 3668 + }, + { + "epoch": 0.4360071301247772, + "grad_norm": 0.5573903862296994, + "learning_rate": 1.8383582044192398e-05, + "loss": 0.8949, + "num_tokens": 15334748665.0, + "step": 3669 + }, + { + "epoch": 0.43612596553773025, + "grad_norm": 0.5153658177971036, + "learning_rate": 1.8382591582015347e-05, + "loss": 0.8412, + "num_tokens": 15338887793.0, + "step": 3670 + }, + { + "epoch": 0.4362448009506833, + "grad_norm": 0.5017878651623208, + "learning_rate": 1.838160084643597e-05, + "loss": 0.8508, + "num_tokens": 15343057329.0, + "step": 3671 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 0.5056065467076593, + "learning_rate": 1.838060983749095e-05, + "loss": 0.8686, + "num_tokens": 15347234969.0, + "step": 3672 + }, + { + "epoch": 0.43648247177658944, + "grad_norm": 0.5741016371403019, + "learning_rate": 1.8379618555216997e-05, + "loss": 0.9061, + "num_tokens": 15351425353.0, + "step": 3673 + }, + { + "epoch": 0.4366013071895425, + "grad_norm": 0.45052502255834165, + "learning_rate": 1.8378626999650813e-05, + "loss": 0.8684, + "num_tokens": 15355615263.0, + "step": 3674 + }, + { + "epoch": 0.43672014260249553, + "grad_norm": 0.5138596697372202, + "learning_rate": 1.837763517082913e-05, + "loss": 0.8885, + "num_tokens": 15359804612.0, + "step": 3675 + }, + { + "epoch": 0.43683897801544863, + "grad_norm": 0.5216547163477294, + "learning_rate": 1.8376643068788668e-05, + "loss": 0.8969, + "num_tokens": 15363993434.0, + "step": 3676 + }, + { + "epoch": 0.4369578134284017, + "grad_norm": 0.4429294142326443, + "learning_rate": 1.8375650693566177e-05, + "loss": 0.8571, + "num_tokens": 15368158082.0, + "step": 3677 + }, + { + "epoch": 0.4370766488413547, + "grad_norm": 0.5240485129652784, + "learning_rate": 1.83746580451984e-05, + "loss": 0.9052, + "num_tokens": 15372347078.0, + "step": 3678 + }, + { + "epoch": 0.43719548425430776, + "grad_norm": 0.6253885873886272, + "learning_rate": 1.8373665123722102e-05, + "loss": 0.8918, + "num_tokens": 15376537225.0, + "step": 3679 + }, + { + "epoch": 0.43731431966726086, + "grad_norm": 0.42007397784646944, + "learning_rate": 1.8372671929174054e-05, + "loss": 0.912, + "num_tokens": 15380699342.0, + "step": 3680 + }, + { + "epoch": 0.4374331550802139, + "grad_norm": 0.5728247165231058, + "learning_rate": 1.8371678461591034e-05, + "loss": 0.8591, + "num_tokens": 15384889125.0, + "step": 3681 + }, + { + "epoch": 0.43755199049316695, + "grad_norm": 0.5809818077076633, + "learning_rate": 1.8370684721009837e-05, + "loss": 0.8628, + "num_tokens": 15389049097.0, + "step": 3682 + }, + { + "epoch": 0.43767082590612, + "grad_norm": 0.5036662162708755, + "learning_rate": 1.8369690707467265e-05, + "loss": 0.8602, + "num_tokens": 15393239573.0, + "step": 3683 + }, + { + "epoch": 0.4377896613190731, + "grad_norm": 0.5516430595367952, + "learning_rate": 1.836869642100013e-05, + "loss": 0.8716, + "num_tokens": 15397427888.0, + "step": 3684 + }, + { + "epoch": 0.43790849673202614, + "grad_norm": 0.45819990976090175, + "learning_rate": 1.8367701861645248e-05, + "loss": 0.9089, + "num_tokens": 15401617264.0, + "step": 3685 + }, + { + "epoch": 0.4380273321449792, + "grad_norm": 0.578576762979747, + "learning_rate": 1.8366707029439457e-05, + "loss": 0.8588, + "num_tokens": 15405806402.0, + "step": 3686 + }, + { + "epoch": 0.4381461675579323, + "grad_norm": 0.4636133135818963, + "learning_rate": 1.83657119244196e-05, + "loss": 0.8962, + "num_tokens": 15409995096.0, + "step": 3687 + }, + { + "epoch": 0.43826500297088533, + "grad_norm": 0.459151205864276, + "learning_rate": 1.8364716546622525e-05, + "loss": 0.8676, + "num_tokens": 15414183285.0, + "step": 3688 + }, + { + "epoch": 0.4383838383838384, + "grad_norm": 0.6603599100933475, + "learning_rate": 1.8363720896085096e-05, + "loss": 0.8507, + "num_tokens": 15418371133.0, + "step": 3689 + }, + { + "epoch": 0.4385026737967914, + "grad_norm": 0.41817660541660506, + "learning_rate": 1.8362724972844183e-05, + "loss": 0.9275, + "num_tokens": 15422562176.0, + "step": 3690 + }, + { + "epoch": 0.4386215092097445, + "grad_norm": 0.5728736541654613, + "learning_rate": 1.8361728776936672e-05, + "loss": 0.8708, + "num_tokens": 15426751014.0, + "step": 3691 + }, + { + "epoch": 0.43874034462269756, + "grad_norm": 0.5106462849098033, + "learning_rate": 1.8360732308399454e-05, + "loss": 0.8713, + "num_tokens": 15430926365.0, + "step": 3692 + }, + { + "epoch": 0.4388591800356506, + "grad_norm": 0.4807382024758256, + "learning_rate": 1.835973556726943e-05, + "loss": 0.8527, + "num_tokens": 15435069733.0, + "step": 3693 + }, + { + "epoch": 0.4389780154486037, + "grad_norm": 0.4915550657946906, + "learning_rate": 1.8358738553583514e-05, + "loss": 0.8802, + "num_tokens": 15439259554.0, + "step": 3694 + }, + { + "epoch": 0.43909685086155675, + "grad_norm": 0.5229555536988062, + "learning_rate": 1.8357741267378633e-05, + "loss": 0.8586, + "num_tokens": 15443417752.0, + "step": 3695 + }, + { + "epoch": 0.4392156862745098, + "grad_norm": 0.6072580353989185, + "learning_rate": 1.8356743708691712e-05, + "loss": 0.89, + "num_tokens": 15447584818.0, + "step": 3696 + }, + { + "epoch": 0.43933452168746284, + "grad_norm": 0.5119559367131855, + "learning_rate": 1.8355745877559697e-05, + "loss": 0.8758, + "num_tokens": 15451764013.0, + "step": 3697 + }, + { + "epoch": 0.43945335710041594, + "grad_norm": 0.44143863405937955, + "learning_rate": 1.8354747774019545e-05, + "loss": 0.9005, + "num_tokens": 15455924520.0, + "step": 3698 + }, + { + "epoch": 0.439572192513369, + "grad_norm": 0.5500459578786708, + "learning_rate": 1.835374939810821e-05, + "loss": 0.8813, + "num_tokens": 15460085929.0, + "step": 3699 + }, + { + "epoch": 0.43969102792632203, + "grad_norm": 0.6905251882968528, + "learning_rate": 1.8352750749862677e-05, + "loss": 0.8689, + "num_tokens": 15464275696.0, + "step": 3700 + }, + { + "epoch": 0.43980986333927513, + "grad_norm": 0.4539317042954327, + "learning_rate": 1.835175182931992e-05, + "loss": 0.8741, + "num_tokens": 15468466137.0, + "step": 3701 + }, + { + "epoch": 0.4399286987522282, + "grad_norm": 0.6541851725512415, + "learning_rate": 1.835075263651694e-05, + "loss": 0.9185, + "num_tokens": 15472654730.0, + "step": 3702 + }, + { + "epoch": 0.4400475341651812, + "grad_norm": 0.5347371576415296, + "learning_rate": 1.834975317149073e-05, + "loss": 0.8753, + "num_tokens": 15476816515.0, + "step": 3703 + }, + { + "epoch": 0.44016636957813426, + "grad_norm": 0.6338441208783675, + "learning_rate": 1.8348753434278316e-05, + "loss": 0.8424, + "num_tokens": 15481005246.0, + "step": 3704 + }, + { + "epoch": 0.44028520499108736, + "grad_norm": 0.4311761985343555, + "learning_rate": 1.8347753424916713e-05, + "loss": 0.8764, + "num_tokens": 15485192213.0, + "step": 3705 + }, + { + "epoch": 0.4404040404040404, + "grad_norm": 0.6568943220166632, + "learning_rate": 1.8346753143442956e-05, + "loss": 0.8755, + "num_tokens": 15489382353.0, + "step": 3706 + }, + { + "epoch": 0.44052287581699345, + "grad_norm": 0.5078767551128822, + "learning_rate": 1.8345752589894095e-05, + "loss": 0.8803, + "num_tokens": 15493570879.0, + "step": 3707 + }, + { + "epoch": 0.4406417112299465, + "grad_norm": 0.6872083174567445, + "learning_rate": 1.8344751764307177e-05, + "loss": 0.8598, + "num_tokens": 15497760498.0, + "step": 3708 + }, + { + "epoch": 0.4407605466428996, + "grad_norm": 0.5055792636578873, + "learning_rate": 1.8343750666719266e-05, + "loss": 0.8864, + "num_tokens": 15501948601.0, + "step": 3709 + }, + { + "epoch": 0.44087938205585264, + "grad_norm": 0.5604428955716911, + "learning_rate": 1.834274929716744e-05, + "loss": 0.8686, + "num_tokens": 15506133483.0, + "step": 3710 + }, + { + "epoch": 0.4409982174688057, + "grad_norm": 0.584388294463331, + "learning_rate": 1.8341747655688777e-05, + "loss": 0.8834, + "num_tokens": 15510323082.0, + "step": 3711 + }, + { + "epoch": 0.4411170528817588, + "grad_norm": 0.6208528224689229, + "learning_rate": 1.8340745742320378e-05, + "loss": 0.8841, + "num_tokens": 15514485908.0, + "step": 3712 + }, + { + "epoch": 0.44123588829471183, + "grad_norm": 0.4727061088291678, + "learning_rate": 1.8339743557099347e-05, + "loss": 0.8838, + "num_tokens": 15518675746.0, + "step": 3713 + }, + { + "epoch": 0.4413547237076649, + "grad_norm": 0.6566803616715935, + "learning_rate": 1.833874110006279e-05, + "loss": 0.8746, + "num_tokens": 15522865099.0, + "step": 3714 + }, + { + "epoch": 0.4414735591206179, + "grad_norm": 0.5705444524228692, + "learning_rate": 1.8337738371247845e-05, + "loss": 0.8559, + "num_tokens": 15527056774.0, + "step": 3715 + }, + { + "epoch": 0.441592394533571, + "grad_norm": 0.5818805059039726, + "learning_rate": 1.8336735370691632e-05, + "loss": 0.8855, + "num_tokens": 15531221742.0, + "step": 3716 + }, + { + "epoch": 0.44171122994652406, + "grad_norm": 0.6513187557974598, + "learning_rate": 1.8335732098431306e-05, + "loss": 0.8721, + "num_tokens": 15535405850.0, + "step": 3717 + }, + { + "epoch": 0.4418300653594771, + "grad_norm": 0.48176206264536364, + "learning_rate": 1.8334728554504016e-05, + "loss": 0.8733, + "num_tokens": 15539594505.0, + "step": 3718 + }, + { + "epoch": 0.4419489007724302, + "grad_norm": 0.6841904866597109, + "learning_rate": 1.8333724738946928e-05, + "loss": 0.8709, + "num_tokens": 15543784725.0, + "step": 3719 + }, + { + "epoch": 0.44206773618538325, + "grad_norm": 0.5204470922591244, + "learning_rate": 1.833272065179722e-05, + "loss": 0.9035, + "num_tokens": 15547950502.0, + "step": 3720 + }, + { + "epoch": 0.4421865715983363, + "grad_norm": 0.5549728249782032, + "learning_rate": 1.8331716293092068e-05, + "loss": 0.8575, + "num_tokens": 15552139046.0, + "step": 3721 + }, + { + "epoch": 0.44230540701128934, + "grad_norm": 0.5930065431796324, + "learning_rate": 1.8330711662868676e-05, + "loss": 0.9099, + "num_tokens": 15556313748.0, + "step": 3722 + }, + { + "epoch": 0.44242424242424244, + "grad_norm": 0.5658887227001985, + "learning_rate": 1.832970676116425e-05, + "loss": 0.834, + "num_tokens": 15560484342.0, + "step": 3723 + }, + { + "epoch": 0.4425430778371955, + "grad_norm": 0.5955977819572729, + "learning_rate": 1.832870158801599e-05, + "loss": 0.8741, + "num_tokens": 15564673511.0, + "step": 3724 + }, + { + "epoch": 0.44266191325014853, + "grad_norm": 0.5175877040672421, + "learning_rate": 1.8327696143461137e-05, + "loss": 0.8617, + "num_tokens": 15568863171.0, + "step": 3725 + }, + { + "epoch": 0.44278074866310163, + "grad_norm": 0.6002008839193701, + "learning_rate": 1.832669042753692e-05, + "loss": 0.8602, + "num_tokens": 15573052155.0, + "step": 3726 + }, + { + "epoch": 0.4428995840760547, + "grad_norm": 0.5524987381980107, + "learning_rate": 1.8325684440280586e-05, + "loss": 0.8931, + "num_tokens": 15577209865.0, + "step": 3727 + }, + { + "epoch": 0.4430184194890077, + "grad_norm": 0.48535455276180417, + "learning_rate": 1.8324678181729388e-05, + "loss": 0.8914, + "num_tokens": 15581400256.0, + "step": 3728 + }, + { + "epoch": 0.44313725490196076, + "grad_norm": 0.5014743441770387, + "learning_rate": 1.8323671651920588e-05, + "loss": 0.8698, + "num_tokens": 15585589671.0, + "step": 3729 + }, + { + "epoch": 0.44325609031491386, + "grad_norm": 0.5596707479438244, + "learning_rate": 1.832266485089147e-05, + "loss": 0.8452, + "num_tokens": 15589778741.0, + "step": 3730 + }, + { + "epoch": 0.4433749257278669, + "grad_norm": 0.5645457015830891, + "learning_rate": 1.832165777867931e-05, + "loss": 0.8845, + "num_tokens": 15593968639.0, + "step": 3731 + }, + { + "epoch": 0.44349376114081995, + "grad_norm": 0.46565900082629114, + "learning_rate": 1.832065043532141e-05, + "loss": 0.8995, + "num_tokens": 15598159211.0, + "step": 3732 + }, + { + "epoch": 0.443612596553773, + "grad_norm": 0.5967482295888, + "learning_rate": 1.831964282085507e-05, + "loss": 0.8715, + "num_tokens": 15602346810.0, + "step": 3733 + }, + { + "epoch": 0.4437314319667261, + "grad_norm": 0.6032240797304578, + "learning_rate": 1.8318634935317606e-05, + "loss": 0.8573, + "num_tokens": 15606511985.0, + "step": 3734 + }, + { + "epoch": 0.44385026737967914, + "grad_norm": 0.4143360134655602, + "learning_rate": 1.831762677874635e-05, + "loss": 0.8924, + "num_tokens": 15610696897.0, + "step": 3735 + }, + { + "epoch": 0.4439691027926322, + "grad_norm": 0.5438065110898691, + "learning_rate": 1.8316618351178633e-05, + "loss": 0.9005, + "num_tokens": 15614877535.0, + "step": 3736 + }, + { + "epoch": 0.4440879382055853, + "grad_norm": 0.5615221665742087, + "learning_rate": 1.83156096526518e-05, + "loss": 0.8761, + "num_tokens": 15619053037.0, + "step": 3737 + }, + { + "epoch": 0.44420677361853833, + "grad_norm": 0.5171451780119924, + "learning_rate": 1.8314600683203205e-05, + "loss": 0.8833, + "num_tokens": 15623243237.0, + "step": 3738 + }, + { + "epoch": 0.4443256090314914, + "grad_norm": 0.6066852744466706, + "learning_rate": 1.8313591442870217e-05, + "loss": 0.8223, + "num_tokens": 15627384978.0, + "step": 3739 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.4516220547958209, + "learning_rate": 1.831258193169021e-05, + "loss": 0.8489, + "num_tokens": 15631573925.0, + "step": 3740 + }, + { + "epoch": 0.4445632798573975, + "grad_norm": 0.5957419755125485, + "learning_rate": 1.831157214970057e-05, + "loss": 0.8868, + "num_tokens": 15635743874.0, + "step": 3741 + }, + { + "epoch": 0.44468211527035056, + "grad_norm": 0.46839912967069003, + "learning_rate": 1.8310562096938692e-05, + "loss": 0.8788, + "num_tokens": 15639907017.0, + "step": 3742 + }, + { + "epoch": 0.4448009506833036, + "grad_norm": 0.6400065362957653, + "learning_rate": 1.830955177344198e-05, + "loss": 0.8747, + "num_tokens": 15644096849.0, + "step": 3743 + }, + { + "epoch": 0.4449197860962567, + "grad_norm": 0.4562125395436335, + "learning_rate": 1.8308541179247853e-05, + "loss": 0.8706, + "num_tokens": 15648286473.0, + "step": 3744 + }, + { + "epoch": 0.44503862150920975, + "grad_norm": 0.5521353183136195, + "learning_rate": 1.8307530314393737e-05, + "loss": 0.8971, + "num_tokens": 15652459597.0, + "step": 3745 + }, + { + "epoch": 0.4451574569221628, + "grad_norm": 0.4465785186792986, + "learning_rate": 1.8306519178917062e-05, + "loss": 0.9256, + "num_tokens": 15656649162.0, + "step": 3746 + }, + { + "epoch": 0.44527629233511584, + "grad_norm": 0.599376407857127, + "learning_rate": 1.830550777285528e-05, + "loss": 0.8408, + "num_tokens": 15660828784.0, + "step": 3747 + }, + { + "epoch": 0.44539512774806894, + "grad_norm": 0.4255892470465542, + "learning_rate": 1.8304496096245846e-05, + "loss": 0.911, + "num_tokens": 15665000642.0, + "step": 3748 + }, + { + "epoch": 0.445513963161022, + "grad_norm": 0.4960601055071089, + "learning_rate": 1.8303484149126223e-05, + "loss": 0.9158, + "num_tokens": 15669190538.0, + "step": 3749 + }, + { + "epoch": 0.44563279857397503, + "grad_norm": 0.5076535758941729, + "learning_rate": 1.830247193153389e-05, + "loss": 0.9063, + "num_tokens": 15673379703.0, + "step": 3750 + }, + { + "epoch": 0.44575163398692813, + "grad_norm": 0.5029205634238134, + "learning_rate": 1.830145944350633e-05, + "loss": 0.899, + "num_tokens": 15677570269.0, + "step": 3751 + }, + { + "epoch": 0.4458704693998812, + "grad_norm": 0.4359934632895124, + "learning_rate": 1.8300446685081038e-05, + "loss": 0.9115, + "num_tokens": 15681758303.0, + "step": 3752 + }, + { + "epoch": 0.4459893048128342, + "grad_norm": 0.535308348192851, + "learning_rate": 1.8299433656295527e-05, + "loss": 0.8775, + "num_tokens": 15685926754.0, + "step": 3753 + }, + { + "epoch": 0.44610814022578726, + "grad_norm": 0.5207065461847983, + "learning_rate": 1.8298420357187304e-05, + "loss": 0.8658, + "num_tokens": 15690075599.0, + "step": 3754 + }, + { + "epoch": 0.44622697563874036, + "grad_norm": 0.53502542855917, + "learning_rate": 1.8297406787793897e-05, + "loss": 0.8704, + "num_tokens": 15694264456.0, + "step": 3755 + }, + { + "epoch": 0.4463458110516934, + "grad_norm": 0.542545181310368, + "learning_rate": 1.8296392948152846e-05, + "loss": 0.8866, + "num_tokens": 15698453711.0, + "step": 3756 + }, + { + "epoch": 0.44646464646464645, + "grad_norm": 0.47321460776898105, + "learning_rate": 1.8295378838301695e-05, + "loss": 0.8778, + "num_tokens": 15702643484.0, + "step": 3757 + }, + { + "epoch": 0.4465834818775995, + "grad_norm": 0.49702576436510426, + "learning_rate": 1.8294364458278e-05, + "loss": 0.8652, + "num_tokens": 15706833400.0, + "step": 3758 + }, + { + "epoch": 0.4467023172905526, + "grad_norm": 0.47631305030387905, + "learning_rate": 1.8293349808119326e-05, + "loss": 0.8649, + "num_tokens": 15710990687.0, + "step": 3759 + }, + { + "epoch": 0.44682115270350564, + "grad_norm": 0.49748305932769404, + "learning_rate": 1.8292334887863246e-05, + "loss": 0.8836, + "num_tokens": 15715154126.0, + "step": 3760 + }, + { + "epoch": 0.4469399881164587, + "grad_norm": 0.6131915751180492, + "learning_rate": 1.829131969754735e-05, + "loss": 0.9207, + "num_tokens": 15719344314.0, + "step": 3761 + }, + { + "epoch": 0.4470588235294118, + "grad_norm": 0.4198228708496172, + "learning_rate": 1.8290304237209237e-05, + "loss": 0.8717, + "num_tokens": 15723489428.0, + "step": 3762 + }, + { + "epoch": 0.44717765894236483, + "grad_norm": 0.5376595999800442, + "learning_rate": 1.8289288506886503e-05, + "loss": 0.8996, + "num_tokens": 15727655542.0, + "step": 3763 + }, + { + "epoch": 0.4472964943553179, + "grad_norm": 0.4760053416239726, + "learning_rate": 1.8288272506616775e-05, + "loss": 0.8895, + "num_tokens": 15731840866.0, + "step": 3764 + }, + { + "epoch": 0.4474153297682709, + "grad_norm": 0.44701074992729295, + "learning_rate": 1.828725623643767e-05, + "loss": 0.9239, + "num_tokens": 15736009602.0, + "step": 3765 + }, + { + "epoch": 0.447534165181224, + "grad_norm": 0.5473403673283027, + "learning_rate": 1.828623969638683e-05, + "loss": 0.902, + "num_tokens": 15740199142.0, + "step": 3766 + }, + { + "epoch": 0.44765300059417706, + "grad_norm": 0.5202873298644399, + "learning_rate": 1.82852228865019e-05, + "loss": 0.8699, + "num_tokens": 15744381182.0, + "step": 3767 + }, + { + "epoch": 0.4477718360071301, + "grad_norm": 0.47612876941047494, + "learning_rate": 1.8284205806820533e-05, + "loss": 0.8408, + "num_tokens": 15748570616.0, + "step": 3768 + }, + { + "epoch": 0.4478906714200832, + "grad_norm": 0.503639633682219, + "learning_rate": 1.8283188457380395e-05, + "loss": 0.8944, + "num_tokens": 15752728909.0, + "step": 3769 + }, + { + "epoch": 0.44800950683303625, + "grad_norm": 0.4463454073909981, + "learning_rate": 1.8282170838219167e-05, + "loss": 0.8769, + "num_tokens": 15756917565.0, + "step": 3770 + }, + { + "epoch": 0.4481283422459893, + "grad_norm": 0.4617858471974359, + "learning_rate": 1.8281152949374527e-05, + "loss": 0.8717, + "num_tokens": 15761107298.0, + "step": 3771 + }, + { + "epoch": 0.44824717765894234, + "grad_norm": 0.5054164423179864, + "learning_rate": 1.8280134790884178e-05, + "loss": 0.8707, + "num_tokens": 15765296622.0, + "step": 3772 + }, + { + "epoch": 0.44836601307189544, + "grad_norm": 0.4531037859065886, + "learning_rate": 1.8279116362785824e-05, + "loss": 0.879, + "num_tokens": 15769448148.0, + "step": 3773 + }, + { + "epoch": 0.4484848484848485, + "grad_norm": 0.43732842558962737, + "learning_rate": 1.8278097665117178e-05, + "loss": 0.886, + "num_tokens": 15773623463.0, + "step": 3774 + }, + { + "epoch": 0.44860368389780153, + "grad_norm": 0.4716412973387505, + "learning_rate": 1.8277078697915966e-05, + "loss": 0.9058, + "num_tokens": 15777812077.0, + "step": 3775 + }, + { + "epoch": 0.44872251931075463, + "grad_norm": 0.5488187535287786, + "learning_rate": 1.8276059461219927e-05, + "loss": 0.8926, + "num_tokens": 15781999949.0, + "step": 3776 + }, + { + "epoch": 0.4488413547237077, + "grad_norm": 0.4531294735663819, + "learning_rate": 1.8275039955066806e-05, + "loss": 0.8925, + "num_tokens": 15786186812.0, + "step": 3777 + }, + { + "epoch": 0.4489601901366607, + "grad_norm": 0.5225531138815448, + "learning_rate": 1.8274020179494355e-05, + "loss": 0.9321, + "num_tokens": 15790376616.0, + "step": 3778 + }, + { + "epoch": 0.44907902554961376, + "grad_norm": 0.5899363355175314, + "learning_rate": 1.8273000134540348e-05, + "loss": 0.8766, + "num_tokens": 15794558510.0, + "step": 3779 + }, + { + "epoch": 0.44919786096256686, + "grad_norm": 0.425089878593, + "learning_rate": 1.8271979820242552e-05, + "loss": 0.8755, + "num_tokens": 15798747331.0, + "step": 3780 + }, + { + "epoch": 0.4493166963755199, + "grad_norm": 0.5112603495169293, + "learning_rate": 1.8270959236638755e-05, + "loss": 0.8682, + "num_tokens": 15802903353.0, + "step": 3781 + }, + { + "epoch": 0.44943553178847295, + "grad_norm": 0.4623756115496136, + "learning_rate": 1.8269938383766756e-05, + "loss": 0.8627, + "num_tokens": 15807063654.0, + "step": 3782 + }, + { + "epoch": 0.449554367201426, + "grad_norm": 0.5015847788899367, + "learning_rate": 1.8268917261664358e-05, + "loss": 0.8651, + "num_tokens": 15811251256.0, + "step": 3783 + }, + { + "epoch": 0.4496732026143791, + "grad_norm": 0.5452402883163325, + "learning_rate": 1.8267895870369377e-05, + "loss": 0.8569, + "num_tokens": 15815429427.0, + "step": 3784 + }, + { + "epoch": 0.44979203802733214, + "grad_norm": 0.431545902983992, + "learning_rate": 1.826687420991964e-05, + "loss": 0.8897, + "num_tokens": 15819583821.0, + "step": 3785 + }, + { + "epoch": 0.4499108734402852, + "grad_norm": 0.5300922829742907, + "learning_rate": 1.8265852280352977e-05, + "loss": 0.8949, + "num_tokens": 15823772989.0, + "step": 3786 + }, + { + "epoch": 0.4500297088532383, + "grad_norm": 0.48293714380474406, + "learning_rate": 1.826483008170724e-05, + "loss": 0.9296, + "num_tokens": 15827962122.0, + "step": 3787 + }, + { + "epoch": 0.45014854426619133, + "grad_norm": 0.49642185554938517, + "learning_rate": 1.8263807614020284e-05, + "loss": 0.8635, + "num_tokens": 15832150016.0, + "step": 3788 + }, + { + "epoch": 0.4502673796791444, + "grad_norm": 0.5856051039980941, + "learning_rate": 1.8262784877329968e-05, + "loss": 0.9036, + "num_tokens": 15836339341.0, + "step": 3789 + }, + { + "epoch": 0.4503862150920974, + "grad_norm": 0.47898342450638093, + "learning_rate": 1.8261761871674178e-05, + "loss": 0.8829, + "num_tokens": 15840474074.0, + "step": 3790 + }, + { + "epoch": 0.4505050505050505, + "grad_norm": 0.5116189151660632, + "learning_rate": 1.826073859709079e-05, + "loss": 0.8797, + "num_tokens": 15844659412.0, + "step": 3791 + }, + { + "epoch": 0.45062388591800356, + "grad_norm": 0.49563145154078775, + "learning_rate": 1.82597150536177e-05, + "loss": 0.8453, + "num_tokens": 15848849918.0, + "step": 3792 + }, + { + "epoch": 0.4507427213309566, + "grad_norm": 0.5170190816353134, + "learning_rate": 1.825869124129282e-05, + "loss": 0.8586, + "num_tokens": 15853038728.0, + "step": 3793 + }, + { + "epoch": 0.4508615567439097, + "grad_norm": 0.4730937835844594, + "learning_rate": 1.825766716015406e-05, + "loss": 0.9275, + "num_tokens": 15857201583.0, + "step": 3794 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 0.5170976082250136, + "learning_rate": 1.8256642810239346e-05, + "loss": 0.8378, + "num_tokens": 15861391812.0, + "step": 3795 + }, + { + "epoch": 0.4510992275698158, + "grad_norm": 0.6093246527857359, + "learning_rate": 1.825561819158661e-05, + "loss": 0.9167, + "num_tokens": 15865580750.0, + "step": 3796 + }, + { + "epoch": 0.45121806298276884, + "grad_norm": 0.4320987884904431, + "learning_rate": 1.8254593304233805e-05, + "loss": 0.8764, + "num_tokens": 15869766621.0, + "step": 3797 + }, + { + "epoch": 0.45133689839572194, + "grad_norm": 0.5406010005837026, + "learning_rate": 1.825356814821888e-05, + "loss": 0.8732, + "num_tokens": 15873936448.0, + "step": 3798 + }, + { + "epoch": 0.451455733808675, + "grad_norm": 0.5924433329536997, + "learning_rate": 1.8252542723579803e-05, + "loss": 0.8668, + "num_tokens": 15878126242.0, + "step": 3799 + }, + { + "epoch": 0.45157456922162803, + "grad_norm": 0.47242724577663286, + "learning_rate": 1.8251517030354547e-05, + "loss": 0.8935, + "num_tokens": 15882284069.0, + "step": 3800 + }, + { + "epoch": 0.45169340463458113, + "grad_norm": 0.6102863421079805, + "learning_rate": 1.8250491068581096e-05, + "loss": 0.8849, + "num_tokens": 15886473978.0, + "step": 3801 + }, + { + "epoch": 0.4518122400475342, + "grad_norm": 0.515788344860929, + "learning_rate": 1.8249464838297446e-05, + "loss": 0.8731, + "num_tokens": 15890663598.0, + "step": 3802 + }, + { + "epoch": 0.4519310754604872, + "grad_norm": 0.5352631566965664, + "learning_rate": 1.8248438339541604e-05, + "loss": 0.8949, + "num_tokens": 15894852221.0, + "step": 3803 + }, + { + "epoch": 0.45204991087344026, + "grad_norm": 0.4745904392509818, + "learning_rate": 1.8247411572351585e-05, + "loss": 0.8839, + "num_tokens": 15899041307.0, + "step": 3804 + }, + { + "epoch": 0.45216874628639336, + "grad_norm": 0.5824889400437342, + "learning_rate": 1.8246384536765404e-05, + "loss": 0.8903, + "num_tokens": 15903231456.0, + "step": 3805 + }, + { + "epoch": 0.4522875816993464, + "grad_norm": 0.4834699901028532, + "learning_rate": 1.824535723282111e-05, + "loss": 0.9034, + "num_tokens": 15907392972.0, + "step": 3806 + }, + { + "epoch": 0.45240641711229945, + "grad_norm": 0.5379403749903074, + "learning_rate": 1.8244329660556737e-05, + "loss": 0.878, + "num_tokens": 15911552097.0, + "step": 3807 + }, + { + "epoch": 0.45252525252525255, + "grad_norm": 0.48110898721050316, + "learning_rate": 1.824330182001034e-05, + "loss": 0.9004, + "num_tokens": 15915740964.0, + "step": 3808 + }, + { + "epoch": 0.4526440879382056, + "grad_norm": 0.5698875219700138, + "learning_rate": 1.8242273711219992e-05, + "loss": 0.8569, + "num_tokens": 15919904179.0, + "step": 3809 + }, + { + "epoch": 0.45276292335115864, + "grad_norm": 0.4108465821833891, + "learning_rate": 1.8241245334223762e-05, + "loss": 0.8709, + "num_tokens": 15924091040.0, + "step": 3810 + }, + { + "epoch": 0.4528817587641117, + "grad_norm": 0.5688676914763239, + "learning_rate": 1.824021668905973e-05, + "loss": 0.8549, + "num_tokens": 15928248559.0, + "step": 3811 + }, + { + "epoch": 0.4530005941770648, + "grad_norm": 0.47351621427541163, + "learning_rate": 1.8239187775765998e-05, + "loss": 0.8553, + "num_tokens": 15932410622.0, + "step": 3812 + }, + { + "epoch": 0.45311942959001783, + "grad_norm": 0.4645540883820255, + "learning_rate": 1.8238158594380667e-05, + "loss": 0.9207, + "num_tokens": 15936572471.0, + "step": 3813 + }, + { + "epoch": 0.4532382650029709, + "grad_norm": 0.4915904427462432, + "learning_rate": 1.8237129144941847e-05, + "loss": 0.8093, + "num_tokens": 15940761984.0, + "step": 3814 + }, + { + "epoch": 0.4533571004159239, + "grad_norm": 0.5942819614242071, + "learning_rate": 1.8236099427487666e-05, + "loss": 0.9122, + "num_tokens": 15944913605.0, + "step": 3815 + }, + { + "epoch": 0.453475935828877, + "grad_norm": 0.42797738643978267, + "learning_rate": 1.823506944205626e-05, + "loss": 0.8608, + "num_tokens": 15949104733.0, + "step": 3816 + }, + { + "epoch": 0.45359477124183006, + "grad_norm": 0.5297920835878226, + "learning_rate": 1.8234039188685766e-05, + "loss": 0.8833, + "num_tokens": 15953287670.0, + "step": 3817 + }, + { + "epoch": 0.4537136066547831, + "grad_norm": 0.5556397065885181, + "learning_rate": 1.8233008667414344e-05, + "loss": 0.8673, + "num_tokens": 15957477218.0, + "step": 3818 + }, + { + "epoch": 0.4538324420677362, + "grad_norm": 0.5243878959610326, + "learning_rate": 1.8231977878280153e-05, + "loss": 0.883, + "num_tokens": 15961625171.0, + "step": 3819 + }, + { + "epoch": 0.45395127748068925, + "grad_norm": 0.5233005155659501, + "learning_rate": 1.8230946821321375e-05, + "loss": 0.8748, + "num_tokens": 15965814069.0, + "step": 3820 + }, + { + "epoch": 0.4540701128936423, + "grad_norm": 0.5346136041114989, + "learning_rate": 1.8229915496576183e-05, + "loss": 0.8841, + "num_tokens": 15970003074.0, + "step": 3821 + }, + { + "epoch": 0.45418894830659534, + "grad_norm": 0.47987409194077596, + "learning_rate": 1.822888390408278e-05, + "loss": 0.874, + "num_tokens": 15974191992.0, + "step": 3822 + }, + { + "epoch": 0.45430778371954844, + "grad_norm": 0.4895255690141206, + "learning_rate": 1.822785204387936e-05, + "loss": 0.8415, + "num_tokens": 15978381789.0, + "step": 3823 + }, + { + "epoch": 0.4544266191325015, + "grad_norm": 0.5482395137495971, + "learning_rate": 1.822681991600414e-05, + "loss": 0.8755, + "num_tokens": 15982564656.0, + "step": 3824 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.47068453700597995, + "learning_rate": 1.8225787520495348e-05, + "loss": 0.901, + "num_tokens": 15986728446.0, + "step": 3825 + }, + { + "epoch": 0.45466428995840763, + "grad_norm": 0.4484405995417654, + "learning_rate": 1.8224754857391214e-05, + "loss": 0.8855, + "num_tokens": 15990903658.0, + "step": 3826 + }, + { + "epoch": 0.4547831253713607, + "grad_norm": 0.5174317530734792, + "learning_rate": 1.822372192672998e-05, + "loss": 0.8679, + "num_tokens": 15995090623.0, + "step": 3827 + }, + { + "epoch": 0.4549019607843137, + "grad_norm": 0.5978787275540366, + "learning_rate": 1.8222688728549895e-05, + "loss": 0.8479, + "num_tokens": 15999242182.0, + "step": 3828 + }, + { + "epoch": 0.45502079619726676, + "grad_norm": 0.47381689864671267, + "learning_rate": 1.8221655262889227e-05, + "loss": 0.9007, + "num_tokens": 16003431073.0, + "step": 3829 + }, + { + "epoch": 0.45513963161021986, + "grad_norm": 0.5406223072105301, + "learning_rate": 1.8220621529786247e-05, + "loss": 0.9093, + "num_tokens": 16007619335.0, + "step": 3830 + }, + { + "epoch": 0.4552584670231729, + "grad_norm": 0.5301959380860931, + "learning_rate": 1.8219587529279242e-05, + "loss": 0.8727, + "num_tokens": 16011777722.0, + "step": 3831 + }, + { + "epoch": 0.45537730243612595, + "grad_norm": 0.5192567960744373, + "learning_rate": 1.8218553261406497e-05, + "loss": 0.865, + "num_tokens": 16015966520.0, + "step": 3832 + }, + { + "epoch": 0.45549613784907905, + "grad_norm": 0.502574228295586, + "learning_rate": 1.821751872620632e-05, + "loss": 0.8761, + "num_tokens": 16020154651.0, + "step": 3833 + }, + { + "epoch": 0.4556149732620321, + "grad_norm": 0.5219606428966443, + "learning_rate": 1.8216483923717023e-05, + "loss": 0.8875, + "num_tokens": 16024344043.0, + "step": 3834 + }, + { + "epoch": 0.45573380867498514, + "grad_norm": 0.5600463794858759, + "learning_rate": 1.8215448853976923e-05, + "loss": 0.8657, + "num_tokens": 16028520547.0, + "step": 3835 + }, + { + "epoch": 0.4558526440879382, + "grad_norm": 0.46894713739490484, + "learning_rate": 1.821441351702436e-05, + "loss": 0.8758, + "num_tokens": 16032708960.0, + "step": 3836 + }, + { + "epoch": 0.4559714795008913, + "grad_norm": 0.49379528765694686, + "learning_rate": 1.8213377912897668e-05, + "loss": 0.8785, + "num_tokens": 16036898231.0, + "step": 3837 + }, + { + "epoch": 0.45609031491384433, + "grad_norm": 0.4814263370401866, + "learning_rate": 1.8212342041635203e-05, + "loss": 0.8495, + "num_tokens": 16041043728.0, + "step": 3838 + }, + { + "epoch": 0.4562091503267974, + "grad_norm": 0.5112740445340979, + "learning_rate": 1.8211305903275332e-05, + "loss": 0.8964, + "num_tokens": 16045232173.0, + "step": 3839 + }, + { + "epoch": 0.4563279857397504, + "grad_norm": 0.43716726424674124, + "learning_rate": 1.8210269497856418e-05, + "loss": 0.9013, + "num_tokens": 16049421905.0, + "step": 3840 + }, + { + "epoch": 0.4564468211527035, + "grad_norm": 0.6674965648416705, + "learning_rate": 1.8209232825416845e-05, + "loss": 0.8692, + "num_tokens": 16053563266.0, + "step": 3841 + }, + { + "epoch": 0.45656565656565656, + "grad_norm": 0.3751645472600462, + "learning_rate": 1.820819588599501e-05, + "loss": 0.8784, + "num_tokens": 16057747329.0, + "step": 3842 + }, + { + "epoch": 0.4566844919786096, + "grad_norm": 0.5491995830492988, + "learning_rate": 1.8207158679629305e-05, + "loss": 0.9184, + "num_tokens": 16061912753.0, + "step": 3843 + }, + { + "epoch": 0.4568033273915627, + "grad_norm": 0.5247931353498105, + "learning_rate": 1.8206121206358143e-05, + "loss": 0.8974, + "num_tokens": 16066102311.0, + "step": 3844 + }, + { + "epoch": 0.45692216280451575, + "grad_norm": 0.4789176409489225, + "learning_rate": 1.8205083466219958e-05, + "loss": 0.8827, + "num_tokens": 16070286778.0, + "step": 3845 + }, + { + "epoch": 0.4570409982174688, + "grad_norm": 0.5170579629444676, + "learning_rate": 1.8204045459253163e-05, + "loss": 0.8887, + "num_tokens": 16074467035.0, + "step": 3846 + }, + { + "epoch": 0.45715983363042184, + "grad_norm": 0.5966271899970124, + "learning_rate": 1.820300718549621e-05, + "loss": 0.8512, + "num_tokens": 16078647563.0, + "step": 3847 + }, + { + "epoch": 0.45727866904337494, + "grad_norm": 0.530796240120749, + "learning_rate": 1.8201968644987548e-05, + "loss": 0.8778, + "num_tokens": 16082815721.0, + "step": 3848 + }, + { + "epoch": 0.457397504456328, + "grad_norm": 0.5373693464882391, + "learning_rate": 1.8200929837765634e-05, + "loss": 0.866, + "num_tokens": 16086982452.0, + "step": 3849 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 0.5111170712291067, + "learning_rate": 1.8199890763868946e-05, + "loss": 0.85, + "num_tokens": 16091172662.0, + "step": 3850 + }, + { + "epoch": 0.45763517528223413, + "grad_norm": 0.4863286292236825, + "learning_rate": 1.8198851423335956e-05, + "loss": 0.882, + "num_tokens": 16095361565.0, + "step": 3851 + }, + { + "epoch": 0.4577540106951872, + "grad_norm": 0.46599623346240876, + "learning_rate": 1.8197811816205157e-05, + "loss": 0.8989, + "num_tokens": 16099550879.0, + "step": 3852 + }, + { + "epoch": 0.4578728461081402, + "grad_norm": 0.44761889442280756, + "learning_rate": 1.8196771942515054e-05, + "loss": 0.898, + "num_tokens": 16103717835.0, + "step": 3853 + }, + { + "epoch": 0.45799168152109326, + "grad_norm": 0.48864757073258375, + "learning_rate": 1.819573180230415e-05, + "loss": 0.9101, + "num_tokens": 16107906425.0, + "step": 3854 + }, + { + "epoch": 0.45811051693404636, + "grad_norm": 0.49848787610109596, + "learning_rate": 1.819469139561097e-05, + "loss": 0.879, + "num_tokens": 16112094564.0, + "step": 3855 + }, + { + "epoch": 0.4582293523469994, + "grad_norm": 0.5295182926773552, + "learning_rate": 1.819365072247404e-05, + "loss": 0.8531, + "num_tokens": 16116284921.0, + "step": 3856 + }, + { + "epoch": 0.45834818775995245, + "grad_norm": 0.41860483645443836, + "learning_rate": 1.81926097829319e-05, + "loss": 0.8909, + "num_tokens": 16120451643.0, + "step": 3857 + }, + { + "epoch": 0.45846702317290555, + "grad_norm": 0.6768303714666394, + "learning_rate": 1.8191568577023102e-05, + "loss": 0.8643, + "num_tokens": 16124641455.0, + "step": 3858 + }, + { + "epoch": 0.4585858585858586, + "grad_norm": 0.4201317081952136, + "learning_rate": 1.8190527104786204e-05, + "loss": 0.9051, + "num_tokens": 16128830167.0, + "step": 3859 + }, + { + "epoch": 0.45870469399881164, + "grad_norm": 0.5792199776993472, + "learning_rate": 1.818948536625978e-05, + "loss": 0.8965, + "num_tokens": 16133018445.0, + "step": 3860 + }, + { + "epoch": 0.4588235294117647, + "grad_norm": 0.47103560488485835, + "learning_rate": 1.81884433614824e-05, + "loss": 0.8815, + "num_tokens": 16137157438.0, + "step": 3861 + }, + { + "epoch": 0.4589423648247178, + "grad_norm": 0.49153516465451225, + "learning_rate": 1.8187401090492657e-05, + "loss": 0.8577, + "num_tokens": 16141344127.0, + "step": 3862 + }, + { + "epoch": 0.45906120023767083, + "grad_norm": 0.5868635236567494, + "learning_rate": 1.8186358553329156e-05, + "loss": 0.8845, + "num_tokens": 16145516901.0, + "step": 3863 + }, + { + "epoch": 0.4591800356506239, + "grad_norm": 0.5830511979582729, + "learning_rate": 1.8185315750030495e-05, + "loss": 0.9043, + "num_tokens": 16149706596.0, + "step": 3864 + }, + { + "epoch": 0.4592988710635769, + "grad_norm": 0.5274179732571473, + "learning_rate": 1.8184272680635298e-05, + "loss": 0.8953, + "num_tokens": 16153896078.0, + "step": 3865 + }, + { + "epoch": 0.45941770647653, + "grad_norm": 0.4343425827587218, + "learning_rate": 1.8183229345182194e-05, + "loss": 0.884, + "num_tokens": 16158085978.0, + "step": 3866 + }, + { + "epoch": 0.45953654188948306, + "grad_norm": 0.5063535228959043, + "learning_rate": 1.8182185743709824e-05, + "loss": 0.8526, + "num_tokens": 16162274693.0, + "step": 3867 + }, + { + "epoch": 0.4596553773024361, + "grad_norm": 0.5515406790382846, + "learning_rate": 1.8181141876256827e-05, + "loss": 0.8645, + "num_tokens": 16166463237.0, + "step": 3868 + }, + { + "epoch": 0.4597742127153892, + "grad_norm": 0.5253104593165762, + "learning_rate": 1.8180097742861868e-05, + "loss": 0.8379, + "num_tokens": 16170639719.0, + "step": 3869 + }, + { + "epoch": 0.45989304812834225, + "grad_norm": 0.5048134270480432, + "learning_rate": 1.8179053343563612e-05, + "loss": 0.8824, + "num_tokens": 16174814424.0, + "step": 3870 + }, + { + "epoch": 0.4600118835412953, + "grad_norm": 0.5758380111265113, + "learning_rate": 1.8178008678400742e-05, + "loss": 0.86, + "num_tokens": 16179000453.0, + "step": 3871 + }, + { + "epoch": 0.46013071895424834, + "grad_norm": 0.5007182936642763, + "learning_rate": 1.8176963747411936e-05, + "loss": 0.9196, + "num_tokens": 16183173459.0, + "step": 3872 + }, + { + "epoch": 0.46024955436720144, + "grad_norm": 0.5308269285085541, + "learning_rate": 1.8175918550635898e-05, + "loss": 0.9176, + "num_tokens": 16187361309.0, + "step": 3873 + }, + { + "epoch": 0.4603683897801545, + "grad_norm": 0.4391077198941601, + "learning_rate": 1.8174873088111336e-05, + "loss": 0.8903, + "num_tokens": 16191549835.0, + "step": 3874 + }, + { + "epoch": 0.46048722519310753, + "grad_norm": 0.4991061522659363, + "learning_rate": 1.8173827359876963e-05, + "loss": 0.8634, + "num_tokens": 16195740152.0, + "step": 3875 + }, + { + "epoch": 0.46060606060606063, + "grad_norm": 0.46032044901593916, + "learning_rate": 1.817278136597151e-05, + "loss": 0.8685, + "num_tokens": 16199883458.0, + "step": 3876 + }, + { + "epoch": 0.4607248960190137, + "grad_norm": 0.5150894630854949, + "learning_rate": 1.8171735106433705e-05, + "loss": 0.843, + "num_tokens": 16204061817.0, + "step": 3877 + }, + { + "epoch": 0.4608437314319667, + "grad_norm": 0.5098245044681637, + "learning_rate": 1.8170688581302306e-05, + "loss": 0.9064, + "num_tokens": 16208233439.0, + "step": 3878 + }, + { + "epoch": 0.46096256684491976, + "grad_norm": 0.611234927514499, + "learning_rate": 1.8169641790616062e-05, + "loss": 0.8382, + "num_tokens": 16212423957.0, + "step": 3879 + }, + { + "epoch": 0.46108140225787286, + "grad_norm": 0.5315180137012464, + "learning_rate": 1.816859473441374e-05, + "loss": 0.9158, + "num_tokens": 16216613750.0, + "step": 3880 + }, + { + "epoch": 0.4612002376708259, + "grad_norm": 0.5075596244436331, + "learning_rate": 1.8167547412734124e-05, + "loss": 0.8811, + "num_tokens": 16220801247.0, + "step": 3881 + }, + { + "epoch": 0.46131907308377895, + "grad_norm": 0.5185281075740027, + "learning_rate": 1.8166499825615987e-05, + "loss": 0.8934, + "num_tokens": 16224990149.0, + "step": 3882 + }, + { + "epoch": 0.46143790849673205, + "grad_norm": 0.5683364836295971, + "learning_rate": 1.8165451973098135e-05, + "loss": 0.8773, + "num_tokens": 16229179186.0, + "step": 3883 + }, + { + "epoch": 0.4615567439096851, + "grad_norm": 0.5341619671496382, + "learning_rate": 1.8164403855219365e-05, + "loss": 0.8704, + "num_tokens": 16233369748.0, + "step": 3884 + }, + { + "epoch": 0.46167557932263814, + "grad_norm": 0.514769974382884, + "learning_rate": 1.81633554720185e-05, + "loss": 0.8594, + "num_tokens": 16237538208.0, + "step": 3885 + }, + { + "epoch": 0.4617944147355912, + "grad_norm": 0.599642583613242, + "learning_rate": 1.8162306823534363e-05, + "loss": 0.8774, + "num_tokens": 16241725182.0, + "step": 3886 + }, + { + "epoch": 0.4619132501485443, + "grad_norm": 0.48191942088728323, + "learning_rate": 1.8161257909805784e-05, + "loss": 0.9163, + "num_tokens": 16245914200.0, + "step": 3887 + }, + { + "epoch": 0.46203208556149733, + "grad_norm": 0.5004599066652026, + "learning_rate": 1.8160208730871616e-05, + "loss": 0.84, + "num_tokens": 16250103369.0, + "step": 3888 + }, + { + "epoch": 0.4621509209744504, + "grad_norm": 0.5378055220075152, + "learning_rate": 1.8159159286770707e-05, + "loss": 0.8328, + "num_tokens": 16254291981.0, + "step": 3889 + }, + { + "epoch": 0.4622697563874034, + "grad_norm": 0.5060867647242644, + "learning_rate": 1.8158109577541923e-05, + "loss": 0.8884, + "num_tokens": 16258481765.0, + "step": 3890 + }, + { + "epoch": 0.4623885918003565, + "grad_norm": 0.5690075004617354, + "learning_rate": 1.8157059603224143e-05, + "loss": 0.8388, + "num_tokens": 16262671573.0, + "step": 3891 + }, + { + "epoch": 0.46250742721330956, + "grad_norm": 0.4780553569545772, + "learning_rate": 1.8156009363856245e-05, + "loss": 0.8655, + "num_tokens": 16266861214.0, + "step": 3892 + }, + { + "epoch": 0.4626262626262626, + "grad_norm": 0.4715074668003461, + "learning_rate": 1.8154958859477125e-05, + "loss": 0.8759, + "num_tokens": 16271048755.0, + "step": 3893 + }, + { + "epoch": 0.4627450980392157, + "grad_norm": 0.48312421521868154, + "learning_rate": 1.8153908090125686e-05, + "loss": 0.9185, + "num_tokens": 16275239257.0, + "step": 3894 + }, + { + "epoch": 0.46286393345216875, + "grad_norm": 0.5366940546481261, + "learning_rate": 1.8152857055840843e-05, + "loss": 0.8961, + "num_tokens": 16279417514.0, + "step": 3895 + }, + { + "epoch": 0.4629827688651218, + "grad_norm": 0.5253494409180948, + "learning_rate": 1.815180575666152e-05, + "loss": 0.8681, + "num_tokens": 16283606080.0, + "step": 3896 + }, + { + "epoch": 0.46310160427807484, + "grad_norm": 0.42499963596601087, + "learning_rate": 1.815075419262665e-05, + "loss": 0.9173, + "num_tokens": 16287794892.0, + "step": 3897 + }, + { + "epoch": 0.46322043969102794, + "grad_norm": 0.5431522712546091, + "learning_rate": 1.8149702363775174e-05, + "loss": 0.8544, + "num_tokens": 16291955853.0, + "step": 3898 + }, + { + "epoch": 0.463339275103981, + "grad_norm": 0.4765488331716392, + "learning_rate": 1.8148650270146045e-05, + "loss": 0.8601, + "num_tokens": 16296114483.0, + "step": 3899 + }, + { + "epoch": 0.46345811051693403, + "grad_norm": 0.5245331684180496, + "learning_rate": 1.8147597911778225e-05, + "loss": 0.8873, + "num_tokens": 16300304936.0, + "step": 3900 + }, + { + "epoch": 0.46357694592988713, + "grad_norm": 0.4744049052119865, + "learning_rate": 1.8146545288710692e-05, + "loss": 0.8977, + "num_tokens": 16304489425.0, + "step": 3901 + }, + { + "epoch": 0.4636957813428402, + "grad_norm": 0.46409550361252266, + "learning_rate": 1.8145492400982418e-05, + "loss": 0.8753, + "num_tokens": 16308679115.0, + "step": 3902 + }, + { + "epoch": 0.4638146167557932, + "grad_norm": 0.5147869752234426, + "learning_rate": 1.8144439248632405e-05, + "loss": 0.8817, + "num_tokens": 16312868110.0, + "step": 3903 + }, + { + "epoch": 0.46393345216874626, + "grad_norm": 0.4931639389639365, + "learning_rate": 1.8143385831699647e-05, + "loss": 0.8643, + "num_tokens": 16317056056.0, + "step": 3904 + }, + { + "epoch": 0.46405228758169936, + "grad_norm": 0.5312246820828151, + "learning_rate": 1.8142332150223165e-05, + "loss": 0.8745, + "num_tokens": 16321197975.0, + "step": 3905 + }, + { + "epoch": 0.4641711229946524, + "grad_norm": 0.4185055842843022, + "learning_rate": 1.814127820424197e-05, + "loss": 0.8876, + "num_tokens": 16325387503.0, + "step": 3906 + }, + { + "epoch": 0.46428995840760545, + "grad_norm": 0.5200478567136998, + "learning_rate": 1.81402239937951e-05, + "loss": 0.8963, + "num_tokens": 16329576564.0, + "step": 3907 + }, + { + "epoch": 0.46440879382055855, + "grad_norm": 0.5746115484528981, + "learning_rate": 1.8139169518921592e-05, + "loss": 0.8501, + "num_tokens": 16333703792.0, + "step": 3908 + }, + { + "epoch": 0.4645276292335116, + "grad_norm": 0.41461598543794415, + "learning_rate": 1.8138114779660502e-05, + "loss": 0.9057, + "num_tokens": 16337868503.0, + "step": 3909 + }, + { + "epoch": 0.46464646464646464, + "grad_norm": 0.5315504109728318, + "learning_rate": 1.8137059776050886e-05, + "loss": 0.9139, + "num_tokens": 16342057176.0, + "step": 3910 + }, + { + "epoch": 0.4647653000594177, + "grad_norm": 0.4417273914327511, + "learning_rate": 1.8136004508131814e-05, + "loss": 0.8832, + "num_tokens": 16346245380.0, + "step": 3911 + }, + { + "epoch": 0.4648841354723708, + "grad_norm": 0.5168558265004279, + "learning_rate": 1.813494897594237e-05, + "loss": 0.8995, + "num_tokens": 16350434456.0, + "step": 3912 + }, + { + "epoch": 0.46500297088532383, + "grad_norm": 0.45685670003925627, + "learning_rate": 1.8133893179521637e-05, + "loss": 0.8535, + "num_tokens": 16354623706.0, + "step": 3913 + }, + { + "epoch": 0.4651218062982769, + "grad_norm": 0.5067827551078868, + "learning_rate": 1.8132837118908723e-05, + "loss": 0.8948, + "num_tokens": 16358813071.0, + "step": 3914 + }, + { + "epoch": 0.46524064171123, + "grad_norm": 0.5587792254544895, + "learning_rate": 1.8131780794142733e-05, + "loss": 0.9058, + "num_tokens": 16363003200.0, + "step": 3915 + }, + { + "epoch": 0.465359477124183, + "grad_norm": 0.5260752488619262, + "learning_rate": 1.813072420526279e-05, + "loss": 0.8818, + "num_tokens": 16367169459.0, + "step": 3916 + }, + { + "epoch": 0.46547831253713606, + "grad_norm": 0.5599530005011455, + "learning_rate": 1.8129667352308013e-05, + "loss": 0.8753, + "num_tokens": 16371360095.0, + "step": 3917 + }, + { + "epoch": 0.4655971479500891, + "grad_norm": 0.5670669175743867, + "learning_rate": 1.8128610235317556e-05, + "loss": 0.8619, + "num_tokens": 16375548906.0, + "step": 3918 + }, + { + "epoch": 0.4657159833630422, + "grad_norm": 0.4068989601095821, + "learning_rate": 1.8127552854330554e-05, + "loss": 0.9174, + "num_tokens": 16379730354.0, + "step": 3919 + }, + { + "epoch": 0.46583481877599525, + "grad_norm": 0.5417725064100224, + "learning_rate": 1.812649520938617e-05, + "loss": 0.8889, + "num_tokens": 16383919441.0, + "step": 3920 + }, + { + "epoch": 0.4659536541889483, + "grad_norm": 0.6532260777647413, + "learning_rate": 1.812543730052358e-05, + "loss": 0.8531, + "num_tokens": 16388073290.0, + "step": 3921 + }, + { + "epoch": 0.46607248960190134, + "grad_norm": 0.45126436516265345, + "learning_rate": 1.8124379127781952e-05, + "loss": 0.9043, + "num_tokens": 16392232876.0, + "step": 3922 + }, + { + "epoch": 0.46619132501485444, + "grad_norm": 0.4644923527241784, + "learning_rate": 1.8123320691200475e-05, + "loss": 0.8726, + "num_tokens": 16396402714.0, + "step": 3923 + }, + { + "epoch": 0.4663101604278075, + "grad_norm": 0.4669223943223141, + "learning_rate": 1.812226199081835e-05, + "loss": 0.8606, + "num_tokens": 16400591935.0, + "step": 3924 + }, + { + "epoch": 0.46642899584076053, + "grad_norm": 0.5416547640908692, + "learning_rate": 1.812120302667478e-05, + "loss": 0.8762, + "num_tokens": 16404780203.0, + "step": 3925 + }, + { + "epoch": 0.46654783125371363, + "grad_norm": 0.5702262998978089, + "learning_rate": 1.8120143798808984e-05, + "loss": 0.9182, + "num_tokens": 16408946684.0, + "step": 3926 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.47412769345170663, + "learning_rate": 1.8119084307260192e-05, + "loss": 0.8954, + "num_tokens": 16413136833.0, + "step": 3927 + }, + { + "epoch": 0.4667855020796197, + "grad_norm": 0.488441687437987, + "learning_rate": 1.8118024552067636e-05, + "loss": 0.8647, + "num_tokens": 16417326665.0, + "step": 3928 + }, + { + "epoch": 0.46690433749257276, + "grad_norm": 0.5446923397632509, + "learning_rate": 1.8116964533270566e-05, + "loss": 0.8718, + "num_tokens": 16421515321.0, + "step": 3929 + }, + { + "epoch": 0.46702317290552586, + "grad_norm": 0.4833562696950391, + "learning_rate": 1.811590425090824e-05, + "loss": 0.9247, + "num_tokens": 16425703233.0, + "step": 3930 + }, + { + "epoch": 0.4671420083184789, + "grad_norm": 0.45033253838989573, + "learning_rate": 1.8114843705019914e-05, + "loss": 0.8855, + "num_tokens": 16429881377.0, + "step": 3931 + }, + { + "epoch": 0.46726084373143195, + "grad_norm": 0.43753791399770153, + "learning_rate": 1.8113782895644873e-05, + "loss": 0.9014, + "num_tokens": 16434067581.0, + "step": 3932 + }, + { + "epoch": 0.46737967914438505, + "grad_norm": 0.5613922128226873, + "learning_rate": 1.8112721822822395e-05, + "loss": 0.8916, + "num_tokens": 16438256298.0, + "step": 3933 + }, + { + "epoch": 0.4674985145573381, + "grad_norm": 0.4212130180330051, + "learning_rate": 1.8111660486591784e-05, + "loss": 0.9031, + "num_tokens": 16442444695.0, + "step": 3934 + }, + { + "epoch": 0.46761734997029114, + "grad_norm": 0.5531663918368279, + "learning_rate": 1.8110598886992337e-05, + "loss": 0.8761, + "num_tokens": 16446633964.0, + "step": 3935 + }, + { + "epoch": 0.4677361853832442, + "grad_norm": 0.5467801592679785, + "learning_rate": 1.8109537024063374e-05, + "loss": 0.8899, + "num_tokens": 16450812998.0, + "step": 3936 + }, + { + "epoch": 0.4678550207961973, + "grad_norm": 0.520611445434679, + "learning_rate": 1.8108474897844214e-05, + "loss": 0.8738, + "num_tokens": 16455000496.0, + "step": 3937 + }, + { + "epoch": 0.46797385620915033, + "grad_norm": 0.5091155470120374, + "learning_rate": 1.8107412508374198e-05, + "loss": 0.8498, + "num_tokens": 16459189862.0, + "step": 3938 + }, + { + "epoch": 0.4680926916221034, + "grad_norm": 0.5455237522225327, + "learning_rate": 1.8106349855692664e-05, + "loss": 0.8405, + "num_tokens": 16463380637.0, + "step": 3939 + }, + { + "epoch": 0.4682115270350565, + "grad_norm": 0.5472523221653542, + "learning_rate": 1.810528693983897e-05, + "loss": 0.8432, + "num_tokens": 16467566705.0, + "step": 3940 + }, + { + "epoch": 0.4683303624480095, + "grad_norm": 0.5473469803040717, + "learning_rate": 1.8104223760852473e-05, + "loss": 0.8941, + "num_tokens": 16471748204.0, + "step": 3941 + }, + { + "epoch": 0.46844919786096256, + "grad_norm": 0.4156877237001648, + "learning_rate": 1.8103160318772552e-05, + "loss": 0.8779, + "num_tokens": 16475936779.0, + "step": 3942 + }, + { + "epoch": 0.4685680332739156, + "grad_norm": 0.7845033254518652, + "learning_rate": 1.8102096613638592e-05, + "loss": 0.8807, + "num_tokens": 16480124619.0, + "step": 3943 + }, + { + "epoch": 0.4686868686868687, + "grad_norm": 0.4459219967009724, + "learning_rate": 1.8101032645489974e-05, + "loss": 0.8816, + "num_tokens": 16484313685.0, + "step": 3944 + }, + { + "epoch": 0.46880570409982175, + "grad_norm": 0.8764851754902526, + "learning_rate": 1.8099968414366114e-05, + "loss": 0.8779, + "num_tokens": 16488477174.0, + "step": 3945 + }, + { + "epoch": 0.4689245395127748, + "grad_norm": 0.6136482631305838, + "learning_rate": 1.8098903920306415e-05, + "loss": 0.8364, + "num_tokens": 16492666963.0, + "step": 3946 + }, + { + "epoch": 0.46904337492572784, + "grad_norm": 0.8197544494562009, + "learning_rate": 1.8097839163350305e-05, + "loss": 0.8776, + "num_tokens": 16496839750.0, + "step": 3947 + }, + { + "epoch": 0.46916221033868094, + "grad_norm": 0.6370823935316696, + "learning_rate": 1.809677414353721e-05, + "loss": 0.8959, + "num_tokens": 16501028373.0, + "step": 3948 + }, + { + "epoch": 0.469281045751634, + "grad_norm": 0.7850355859905598, + "learning_rate": 1.8095708860906573e-05, + "loss": 0.891, + "num_tokens": 16505216926.0, + "step": 3949 + }, + { + "epoch": 0.46939988116458703, + "grad_norm": 0.6630221943134772, + "learning_rate": 1.8094643315497847e-05, + "loss": 0.9101, + "num_tokens": 16509406742.0, + "step": 3950 + }, + { + "epoch": 0.46951871657754013, + "grad_norm": 0.6832580474393327, + "learning_rate": 1.8093577507350492e-05, + "loss": 0.8651, + "num_tokens": 16513594211.0, + "step": 3951 + }, + { + "epoch": 0.4696375519904932, + "grad_norm": 0.581597477149783, + "learning_rate": 1.8092511436503976e-05, + "loss": 0.8825, + "num_tokens": 16517782763.0, + "step": 3952 + }, + { + "epoch": 0.4697563874034462, + "grad_norm": 0.7451381680855343, + "learning_rate": 1.809144510299778e-05, + "loss": 0.869, + "num_tokens": 16521972598.0, + "step": 3953 + }, + { + "epoch": 0.46987522281639926, + "grad_norm": 0.5349505295138716, + "learning_rate": 1.80903785068714e-05, + "loss": 0.8766, + "num_tokens": 16526162319.0, + "step": 3954 + }, + { + "epoch": 0.46999405822935236, + "grad_norm": 0.6921838773738812, + "learning_rate": 1.808931164816433e-05, + "loss": 0.8767, + "num_tokens": 16530351551.0, + "step": 3955 + }, + { + "epoch": 0.4701128936423054, + "grad_norm": 0.5550528854919861, + "learning_rate": 1.8088244526916077e-05, + "loss": 0.8837, + "num_tokens": 16534539440.0, + "step": 3956 + }, + { + "epoch": 0.47023172905525845, + "grad_norm": 0.6790780655033157, + "learning_rate": 1.8087177143166165e-05, + "loss": 0.8799, + "num_tokens": 16538727734.0, + "step": 3957 + }, + { + "epoch": 0.47035056446821155, + "grad_norm": 0.6113548399890515, + "learning_rate": 1.808610949695412e-05, + "loss": 0.8849, + "num_tokens": 16542916719.0, + "step": 3958 + }, + { + "epoch": 0.4704693998811646, + "grad_norm": 0.6124370190763163, + "learning_rate": 1.8085041588319482e-05, + "loss": 0.8856, + "num_tokens": 16547102772.0, + "step": 3959 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.6594902209435535, + "learning_rate": 1.80839734173018e-05, + "loss": 0.9094, + "num_tokens": 16551292826.0, + "step": 3960 + }, + { + "epoch": 0.4707070707070707, + "grad_norm": 0.5181535658138073, + "learning_rate": 1.8082904983940628e-05, + "loss": 0.8828, + "num_tokens": 16555482124.0, + "step": 3961 + }, + { + "epoch": 0.4708259061200238, + "grad_norm": 0.5430591847762312, + "learning_rate": 1.808183628827554e-05, + "loss": 0.8898, + "num_tokens": 16559644622.0, + "step": 3962 + }, + { + "epoch": 0.47094474153297683, + "grad_norm": 0.5358541683451084, + "learning_rate": 1.8080767330346107e-05, + "loss": 0.8806, + "num_tokens": 16563834284.0, + "step": 3963 + }, + { + "epoch": 0.4710635769459299, + "grad_norm": 0.5755753610270229, + "learning_rate": 1.8079698110191922e-05, + "loss": 0.8569, + "num_tokens": 16568023613.0, + "step": 3964 + }, + { + "epoch": 0.471182412358883, + "grad_norm": 0.4949185756194508, + "learning_rate": 1.8078628627852577e-05, + "loss": 0.8715, + "num_tokens": 16572176127.0, + "step": 3965 + }, + { + "epoch": 0.471301247771836, + "grad_norm": 0.5296354438704185, + "learning_rate": 1.8077558883367682e-05, + "loss": 0.8831, + "num_tokens": 16576365525.0, + "step": 3966 + }, + { + "epoch": 0.47142008318478906, + "grad_norm": 0.5253197710944368, + "learning_rate": 1.807648887677685e-05, + "loss": 0.8848, + "num_tokens": 16580524687.0, + "step": 3967 + }, + { + "epoch": 0.4715389185977421, + "grad_norm": 0.5081932905135126, + "learning_rate": 1.807541860811971e-05, + "loss": 0.8383, + "num_tokens": 16584714752.0, + "step": 3968 + }, + { + "epoch": 0.4716577540106952, + "grad_norm": 0.5977530149337305, + "learning_rate": 1.8074348077435896e-05, + "loss": 0.8981, + "num_tokens": 16588903969.0, + "step": 3969 + }, + { + "epoch": 0.47177658942364825, + "grad_norm": 0.441889896197272, + "learning_rate": 1.8073277284765052e-05, + "loss": 0.8639, + "num_tokens": 16593083482.0, + "step": 3970 + }, + { + "epoch": 0.4718954248366013, + "grad_norm": 0.5578528313071848, + "learning_rate": 1.807220623014684e-05, + "loss": 0.9083, + "num_tokens": 16597272659.0, + "step": 3971 + }, + { + "epoch": 0.47201426024955434, + "grad_norm": 0.480048928534904, + "learning_rate": 1.8071134913620915e-05, + "loss": 0.8676, + "num_tokens": 16601462182.0, + "step": 3972 + }, + { + "epoch": 0.47213309566250744, + "grad_norm": 0.5053883498300213, + "learning_rate": 1.8070063335226957e-05, + "loss": 0.8561, + "num_tokens": 16605650831.0, + "step": 3973 + }, + { + "epoch": 0.4722519310754605, + "grad_norm": 0.5645530107612909, + "learning_rate": 1.806899149500465e-05, + "loss": 0.8851, + "num_tokens": 16609841147.0, + "step": 3974 + }, + { + "epoch": 0.47237076648841353, + "grad_norm": 0.43498136057593323, + "learning_rate": 1.8067919392993688e-05, + "loss": 0.86, + "num_tokens": 16614030878.0, + "step": 3975 + }, + { + "epoch": 0.47248960190136663, + "grad_norm": 0.5307749340168292, + "learning_rate": 1.806684702923377e-05, + "loss": 0.8628, + "num_tokens": 16618220081.0, + "step": 3976 + }, + { + "epoch": 0.4726084373143197, + "grad_norm": 0.4874180777498995, + "learning_rate": 1.806577440376462e-05, + "loss": 0.8746, + "num_tokens": 16622405402.0, + "step": 3977 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 0.4697099352178521, + "learning_rate": 1.8064701516625946e-05, + "loss": 0.8551, + "num_tokens": 16626594512.0, + "step": 3978 + }, + { + "epoch": 0.47284610814022576, + "grad_norm": 0.5169848257802622, + "learning_rate": 1.8063628367857492e-05, + "loss": 0.9268, + "num_tokens": 16630782650.0, + "step": 3979 + }, + { + "epoch": 0.47296494355317886, + "grad_norm": 0.4476647432095494, + "learning_rate": 1.8062554957498996e-05, + "loss": 0.8694, + "num_tokens": 16634949890.0, + "step": 3980 + }, + { + "epoch": 0.4730837789661319, + "grad_norm": 0.4389340080444259, + "learning_rate": 1.806148128559021e-05, + "loss": 0.8671, + "num_tokens": 16639138265.0, + "step": 3981 + }, + { + "epoch": 0.47320261437908495, + "grad_norm": 0.51519626746686, + "learning_rate": 1.80604073521709e-05, + "loss": 0.8722, + "num_tokens": 16643296015.0, + "step": 3982 + }, + { + "epoch": 0.47332144979203805, + "grad_norm": 0.5506763842381703, + "learning_rate": 1.805933315728083e-05, + "loss": 0.8539, + "num_tokens": 16647456821.0, + "step": 3983 + }, + { + "epoch": 0.4734402852049911, + "grad_norm": 0.5669098895193851, + "learning_rate": 1.805825870095979e-05, + "loss": 0.8408, + "num_tokens": 16651644530.0, + "step": 3984 + }, + { + "epoch": 0.47355912061794414, + "grad_norm": 0.4600642535085704, + "learning_rate": 1.805718398324756e-05, + "loss": 0.876, + "num_tokens": 16655819401.0, + "step": 3985 + }, + { + "epoch": 0.4736779560308972, + "grad_norm": 0.5051773263315723, + "learning_rate": 1.8056109004183946e-05, + "loss": 0.8579, + "num_tokens": 16660007891.0, + "step": 3986 + }, + { + "epoch": 0.4737967914438503, + "grad_norm": 0.42929598116999546, + "learning_rate": 1.8055033763808765e-05, + "loss": 0.8793, + "num_tokens": 16664197380.0, + "step": 3987 + }, + { + "epoch": 0.47391562685680333, + "grad_norm": 0.3994791228139786, + "learning_rate": 1.8053958262161826e-05, + "loss": 0.916, + "num_tokens": 16668373039.0, + "step": 3988 + }, + { + "epoch": 0.4740344622697564, + "grad_norm": 0.5522034944576468, + "learning_rate": 1.8052882499282963e-05, + "loss": 0.8781, + "num_tokens": 16672563162.0, + "step": 3989 + }, + { + "epoch": 0.4741532976827095, + "grad_norm": 0.5127367504613287, + "learning_rate": 1.805180647521201e-05, + "loss": 0.855, + "num_tokens": 16676752537.0, + "step": 3990 + }, + { + "epoch": 0.4742721330956625, + "grad_norm": 0.6702849217810577, + "learning_rate": 1.8050730189988828e-05, + "loss": 0.9093, + "num_tokens": 16680934512.0, + "step": 3991 + }, + { + "epoch": 0.47439096850861556, + "grad_norm": 0.4393402049814518, + "learning_rate": 1.8049653643653265e-05, + "loss": 0.8299, + "num_tokens": 16685118637.0, + "step": 3992 + }, + { + "epoch": 0.4745098039215686, + "grad_norm": 0.6634509215787642, + "learning_rate": 1.8048576836245193e-05, + "loss": 0.9015, + "num_tokens": 16689307397.0, + "step": 3993 + }, + { + "epoch": 0.4746286393345217, + "grad_norm": 0.49377821771460323, + "learning_rate": 1.8047499767804487e-05, + "loss": 0.867, + "num_tokens": 16693495999.0, + "step": 3994 + }, + { + "epoch": 0.47474747474747475, + "grad_norm": 0.6422673911329212, + "learning_rate": 1.8046422438371037e-05, + "loss": 0.8734, + "num_tokens": 16697685913.0, + "step": 3995 + }, + { + "epoch": 0.4748663101604278, + "grad_norm": 0.5245514025069846, + "learning_rate": 1.804534484798474e-05, + "loss": 0.8617, + "num_tokens": 16701875189.0, + "step": 3996 + }, + { + "epoch": 0.47498514557338084, + "grad_norm": 0.6109662596158737, + "learning_rate": 1.80442669966855e-05, + "loss": 0.8253, + "num_tokens": 16706033567.0, + "step": 3997 + }, + { + "epoch": 0.47510398098633394, + "grad_norm": 0.49349126066285304, + "learning_rate": 1.804318888451324e-05, + "loss": 0.8985, + "num_tokens": 16710222015.0, + "step": 3998 + }, + { + "epoch": 0.475222816399287, + "grad_norm": 0.6027021853538759, + "learning_rate": 1.804211051150788e-05, + "loss": 0.8646, + "num_tokens": 16714396424.0, + "step": 3999 + }, + { + "epoch": 0.47534165181224003, + "grad_norm": 0.48723772193929643, + "learning_rate": 1.8041031877709357e-05, + "loss": 0.8702, + "num_tokens": 16718552115.0, + "step": 4000 + }, + { + "epoch": 0.47546048722519313, + "grad_norm": 0.5967969884317686, + "learning_rate": 1.803995298315762e-05, + "loss": 0.881, + "num_tokens": 16722736969.0, + "step": 4001 + }, + { + "epoch": 0.4755793226381462, + "grad_norm": 0.5018841134518859, + "learning_rate": 1.8038873827892622e-05, + "loss": 0.8743, + "num_tokens": 16726897246.0, + "step": 4002 + }, + { + "epoch": 0.4756981580510992, + "grad_norm": 0.6651334224980202, + "learning_rate": 1.803779441195432e-05, + "loss": 0.8822, + "num_tokens": 16731087908.0, + "step": 4003 + }, + { + "epoch": 0.47581699346405226, + "grad_norm": 0.5326908453388961, + "learning_rate": 1.8036714735382703e-05, + "loss": 0.8325, + "num_tokens": 16735275908.0, + "step": 4004 + }, + { + "epoch": 0.47593582887700536, + "grad_norm": 0.6115082739708475, + "learning_rate": 1.8035634798217748e-05, + "loss": 0.8682, + "num_tokens": 16739465079.0, + "step": 4005 + }, + { + "epoch": 0.4760546642899584, + "grad_norm": 0.556909337898251, + "learning_rate": 1.8034554600499447e-05, + "loss": 0.8474, + "num_tokens": 16743650081.0, + "step": 4006 + }, + { + "epoch": 0.47617349970291145, + "grad_norm": 0.4763464393344381, + "learning_rate": 1.80334741422678e-05, + "loss": 0.8808, + "num_tokens": 16747840138.0, + "step": 4007 + }, + { + "epoch": 0.47629233511586455, + "grad_norm": 0.5470813312151781, + "learning_rate": 1.8032393423562828e-05, + "loss": 0.8673, + "num_tokens": 16752029136.0, + "step": 4008 + }, + { + "epoch": 0.4764111705288176, + "grad_norm": 0.5244026500790104, + "learning_rate": 1.8031312444424554e-05, + "loss": 0.8668, + "num_tokens": 16756219685.0, + "step": 4009 + }, + { + "epoch": 0.47653000594177064, + "grad_norm": 0.48279107052392073, + "learning_rate": 1.8030231204893e-05, + "loss": 0.8681, + "num_tokens": 16760381285.0, + "step": 4010 + }, + { + "epoch": 0.4766488413547237, + "grad_norm": 0.5001000929477092, + "learning_rate": 1.8029149705008216e-05, + "loss": 0.879, + "num_tokens": 16764544407.0, + "step": 4011 + }, + { + "epoch": 0.4767676767676768, + "grad_norm": 0.5051193601621976, + "learning_rate": 1.802806794481026e-05, + "loss": 0.8702, + "num_tokens": 16768733737.0, + "step": 4012 + }, + { + "epoch": 0.47688651218062983, + "grad_norm": 0.5347021642960899, + "learning_rate": 1.802698592433918e-05, + "loss": 0.8774, + "num_tokens": 16772921771.0, + "step": 4013 + }, + { + "epoch": 0.4770053475935829, + "grad_norm": 0.5443370458493418, + "learning_rate": 1.802590364363505e-05, + "loss": 0.8792, + "num_tokens": 16777112016.0, + "step": 4014 + }, + { + "epoch": 0.477124183006536, + "grad_norm": 0.5002471206971507, + "learning_rate": 1.8024821102737956e-05, + "loss": 0.8904, + "num_tokens": 16781300098.0, + "step": 4015 + }, + { + "epoch": 0.477243018419489, + "grad_norm": 0.49395584488130745, + "learning_rate": 1.802373830168798e-05, + "loss": 0.8382, + "num_tokens": 16785473316.0, + "step": 4016 + }, + { + "epoch": 0.47736185383244206, + "grad_norm": 0.4990133842307461, + "learning_rate": 1.8022655240525234e-05, + "loss": 0.8925, + "num_tokens": 16789662057.0, + "step": 4017 + }, + { + "epoch": 0.4774806892453951, + "grad_norm": 0.5283050625279494, + "learning_rate": 1.802157191928982e-05, + "loss": 0.8779, + "num_tokens": 16793819177.0, + "step": 4018 + }, + { + "epoch": 0.4775995246583482, + "grad_norm": 0.47850580535797377, + "learning_rate": 1.8020488338021852e-05, + "loss": 0.9104, + "num_tokens": 16798007468.0, + "step": 4019 + }, + { + "epoch": 0.47771836007130125, + "grad_norm": 0.5717326828510929, + "learning_rate": 1.801940449676147e-05, + "loss": 0.8848, + "num_tokens": 16802172560.0, + "step": 4020 + }, + { + "epoch": 0.4778371954842543, + "grad_norm": 0.5342442463839006, + "learning_rate": 1.8018320395548802e-05, + "loss": 0.8734, + "num_tokens": 16806362285.0, + "step": 4021 + }, + { + "epoch": 0.47795603089720734, + "grad_norm": 0.47373620399909944, + "learning_rate": 1.8017236034423998e-05, + "loss": 0.8707, + "num_tokens": 16810552365.0, + "step": 4022 + }, + { + "epoch": 0.47807486631016044, + "grad_norm": 0.48457119597018433, + "learning_rate": 1.801615141342722e-05, + "loss": 0.8383, + "num_tokens": 16814743075.0, + "step": 4023 + }, + { + "epoch": 0.4781937017231135, + "grad_norm": 0.5384619558389151, + "learning_rate": 1.8015066532598637e-05, + "loss": 0.869, + "num_tokens": 16818910398.0, + "step": 4024 + }, + { + "epoch": 0.47831253713606653, + "grad_norm": 0.5632934756743735, + "learning_rate": 1.801398139197842e-05, + "loss": 0.879, + "num_tokens": 16823070583.0, + "step": 4025 + }, + { + "epoch": 0.47843137254901963, + "grad_norm": 0.4935874643428379, + "learning_rate": 1.8012895991606755e-05, + "loss": 0.8714, + "num_tokens": 16827260818.0, + "step": 4026 + }, + { + "epoch": 0.4785502079619727, + "grad_norm": 0.5535011281851799, + "learning_rate": 1.8011810331523844e-05, + "loss": 0.876, + "num_tokens": 16831449798.0, + "step": 4027 + }, + { + "epoch": 0.4786690433749257, + "grad_norm": 0.47071223563514664, + "learning_rate": 1.8010724411769884e-05, + "loss": 0.9117, + "num_tokens": 16835621185.0, + "step": 4028 + }, + { + "epoch": 0.47878787878787876, + "grad_norm": 0.492650195330816, + "learning_rate": 1.8009638232385097e-05, + "loss": 0.8887, + "num_tokens": 16839800165.0, + "step": 4029 + }, + { + "epoch": 0.47890671420083186, + "grad_norm": 0.4234998391406418, + "learning_rate": 1.8008551793409705e-05, + "loss": 0.8555, + "num_tokens": 16843989760.0, + "step": 4030 + }, + { + "epoch": 0.4790255496137849, + "grad_norm": 0.7150736987008461, + "learning_rate": 1.800746509488395e-05, + "loss": 0.8867, + "num_tokens": 16848179617.0, + "step": 4031 + }, + { + "epoch": 0.47914438502673795, + "grad_norm": 0.4204965633015847, + "learning_rate": 1.8006378136848063e-05, + "loss": 0.8774, + "num_tokens": 16852360724.0, + "step": 4032 + }, + { + "epoch": 0.47926322043969105, + "grad_norm": 0.5873599624237869, + "learning_rate": 1.8005290919342308e-05, + "loss": 0.8516, + "num_tokens": 16856549823.0, + "step": 4033 + }, + { + "epoch": 0.4793820558526441, + "grad_norm": 0.459651932842005, + "learning_rate": 1.800420344240694e-05, + "loss": 0.874, + "num_tokens": 16860738517.0, + "step": 4034 + }, + { + "epoch": 0.47950089126559714, + "grad_norm": 0.5206252868717284, + "learning_rate": 1.8003115706082238e-05, + "loss": 0.8511, + "num_tokens": 16864928324.0, + "step": 4035 + }, + { + "epoch": 0.4796197266785502, + "grad_norm": 0.4898910775539104, + "learning_rate": 1.800202771040849e-05, + "loss": 0.8662, + "num_tokens": 16869115239.0, + "step": 4036 + }, + { + "epoch": 0.4797385620915033, + "grad_norm": 0.4944000234143019, + "learning_rate": 1.8000939455425976e-05, + "loss": 0.8613, + "num_tokens": 16873304624.0, + "step": 4037 + }, + { + "epoch": 0.47985739750445633, + "grad_norm": 0.5500465263819359, + "learning_rate": 1.7999850941175005e-05, + "loss": 0.8953, + "num_tokens": 16877492585.0, + "step": 4038 + }, + { + "epoch": 0.4799762329174094, + "grad_norm": 0.5530275555300432, + "learning_rate": 1.799876216769588e-05, + "loss": 0.8793, + "num_tokens": 16881682771.0, + "step": 4039 + }, + { + "epoch": 0.4800950683303625, + "grad_norm": 0.5066118513862764, + "learning_rate": 1.7997673135028936e-05, + "loss": 0.8487, + "num_tokens": 16885872075.0, + "step": 4040 + }, + { + "epoch": 0.4802139037433155, + "grad_norm": 0.552977967517536, + "learning_rate": 1.7996583843214496e-05, + "loss": 0.8401, + "num_tokens": 16890062108.0, + "step": 4041 + }, + { + "epoch": 0.48033273915626856, + "grad_norm": 0.40822577830134443, + "learning_rate": 1.79954942922929e-05, + "loss": 0.8569, + "num_tokens": 16894233057.0, + "step": 4042 + }, + { + "epoch": 0.4804515745692216, + "grad_norm": 0.4988718760778933, + "learning_rate": 1.7994404482304496e-05, + "loss": 0.8902, + "num_tokens": 16898398342.0, + "step": 4043 + }, + { + "epoch": 0.4805704099821747, + "grad_norm": 0.4489229193593322, + "learning_rate": 1.7993314413289646e-05, + "loss": 0.8776, + "num_tokens": 16902586728.0, + "step": 4044 + }, + { + "epoch": 0.48068924539512775, + "grad_norm": 0.5776646835925218, + "learning_rate": 1.799222408528872e-05, + "loss": 0.9053, + "num_tokens": 16906777692.0, + "step": 4045 + }, + { + "epoch": 0.4808080808080808, + "grad_norm": 0.497525111367427, + "learning_rate": 1.7991133498342092e-05, + "loss": 0.8735, + "num_tokens": 16910963042.0, + "step": 4046 + }, + { + "epoch": 0.4809269162210339, + "grad_norm": 0.4840417235871767, + "learning_rate": 1.7990042652490154e-05, + "loss": 0.8974, + "num_tokens": 16915151451.0, + "step": 4047 + }, + { + "epoch": 0.48104575163398694, + "grad_norm": 0.5155024510595354, + "learning_rate": 1.7988951547773303e-05, + "loss": 0.8389, + "num_tokens": 16919341024.0, + "step": 4048 + }, + { + "epoch": 0.48116458704694, + "grad_norm": 0.5286013592844021, + "learning_rate": 1.7987860184231948e-05, + "loss": 0.8724, + "num_tokens": 16923530877.0, + "step": 4049 + }, + { + "epoch": 0.48128342245989303, + "grad_norm": 0.614194372277073, + "learning_rate": 1.7986768561906496e-05, + "loss": 0.8798, + "num_tokens": 16927703436.0, + "step": 4050 + }, + { + "epoch": 0.48140225787284613, + "grad_norm": 0.46205101432132745, + "learning_rate": 1.798567668083739e-05, + "loss": 0.8742, + "num_tokens": 16931891966.0, + "step": 4051 + }, + { + "epoch": 0.4815210932857992, + "grad_norm": 0.6261834926828703, + "learning_rate": 1.7984584541065057e-05, + "loss": 0.9001, + "num_tokens": 16936031932.0, + "step": 4052 + }, + { + "epoch": 0.4816399286987522, + "grad_norm": 0.42621979462383974, + "learning_rate": 1.7983492142629936e-05, + "loss": 0.8824, + "num_tokens": 16940220712.0, + "step": 4053 + }, + { + "epoch": 0.48175876411170526, + "grad_norm": 0.5472494699629328, + "learning_rate": 1.7982399485572496e-05, + "loss": 0.8717, + "num_tokens": 16944382832.0, + "step": 4054 + }, + { + "epoch": 0.48187759952465836, + "grad_norm": 0.5341696333959093, + "learning_rate": 1.7981306569933193e-05, + "loss": 0.8618, + "num_tokens": 16948541547.0, + "step": 4055 + }, + { + "epoch": 0.4819964349376114, + "grad_norm": 0.518236974756074, + "learning_rate": 1.7980213395752502e-05, + "loss": 0.9076, + "num_tokens": 16952731763.0, + "step": 4056 + }, + { + "epoch": 0.48211527035056445, + "grad_norm": 0.46202881491950215, + "learning_rate": 1.797911996307091e-05, + "loss": 0.8727, + "num_tokens": 16956921704.0, + "step": 4057 + }, + { + "epoch": 0.48223410576351755, + "grad_norm": 0.5549231992170355, + "learning_rate": 1.7978026271928907e-05, + "loss": 0.8731, + "num_tokens": 16961108584.0, + "step": 4058 + }, + { + "epoch": 0.4823529411764706, + "grad_norm": 0.4135283298192654, + "learning_rate": 1.7976932322367e-05, + "loss": 0.8358, + "num_tokens": 16965298079.0, + "step": 4059 + }, + { + "epoch": 0.48247177658942364, + "grad_norm": 0.5887430535563439, + "learning_rate": 1.7975838114425697e-05, + "loss": 0.8999, + "num_tokens": 16969487307.0, + "step": 4060 + }, + { + "epoch": 0.4825906120023767, + "grad_norm": 0.5394032632895325, + "learning_rate": 1.7974743648145525e-05, + "loss": 0.8849, + "num_tokens": 16973675609.0, + "step": 4061 + }, + { + "epoch": 0.4827094474153298, + "grad_norm": 0.5033738550468575, + "learning_rate": 1.7973648923567014e-05, + "loss": 0.8611, + "num_tokens": 16977864866.0, + "step": 4062 + }, + { + "epoch": 0.48282828282828283, + "grad_norm": 0.4641264021741717, + "learning_rate": 1.79725539407307e-05, + "loss": 0.8583, + "num_tokens": 16982035649.0, + "step": 4063 + }, + { + "epoch": 0.4829471182412359, + "grad_norm": 0.5400615882769185, + "learning_rate": 1.7971458699677144e-05, + "loss": 0.9057, + "num_tokens": 16986204403.0, + "step": 4064 + }, + { + "epoch": 0.483065953654189, + "grad_norm": 0.4975662930938316, + "learning_rate": 1.79703632004469e-05, + "loss": 0.8488, + "num_tokens": 16990392740.0, + "step": 4065 + }, + { + "epoch": 0.483184789067142, + "grad_norm": 0.42534546304874077, + "learning_rate": 1.7969267443080537e-05, + "loss": 0.8953, + "num_tokens": 16994581101.0, + "step": 4066 + }, + { + "epoch": 0.48330362448009506, + "grad_norm": 0.5360861808954718, + "learning_rate": 1.796817142761864e-05, + "loss": 0.832, + "num_tokens": 16998739319.0, + "step": 4067 + }, + { + "epoch": 0.4834224598930481, + "grad_norm": 0.4190985736136148, + "learning_rate": 1.7967075154101796e-05, + "loss": 0.8751, + "num_tokens": 17002930025.0, + "step": 4068 + }, + { + "epoch": 0.4835412953060012, + "grad_norm": 0.5795382894507061, + "learning_rate": 1.7965978622570602e-05, + "loss": 0.8763, + "num_tokens": 17007119783.0, + "step": 4069 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 0.5312368362161195, + "learning_rate": 1.7964881833065665e-05, + "loss": 0.9223, + "num_tokens": 17011308253.0, + "step": 4070 + }, + { + "epoch": 0.4837789661319073, + "grad_norm": 0.5206009285164431, + "learning_rate": 1.7963784785627604e-05, + "loss": 0.8797, + "num_tokens": 17015497453.0, + "step": 4071 + }, + { + "epoch": 0.4838978015448604, + "grad_norm": 0.4839598828646171, + "learning_rate": 1.796268748029705e-05, + "loss": 0.8244, + "num_tokens": 17019651659.0, + "step": 4072 + }, + { + "epoch": 0.48401663695781344, + "grad_norm": 0.5256745865079934, + "learning_rate": 1.7961589917114638e-05, + "loss": 0.871, + "num_tokens": 17023841193.0, + "step": 4073 + }, + { + "epoch": 0.4841354723707665, + "grad_norm": 0.4941876990182334, + "learning_rate": 1.796049209612101e-05, + "loss": 0.8558, + "num_tokens": 17028030742.0, + "step": 4074 + }, + { + "epoch": 0.48425430778371953, + "grad_norm": 0.5197986707955707, + "learning_rate": 1.795939401735683e-05, + "loss": 0.8774, + "num_tokens": 17032192818.0, + "step": 4075 + }, + { + "epoch": 0.48437314319667263, + "grad_norm": 0.38833107732542205, + "learning_rate": 1.7958295680862756e-05, + "loss": 0.8745, + "num_tokens": 17036349133.0, + "step": 4076 + }, + { + "epoch": 0.4844919786096257, + "grad_norm": 0.5721666073459767, + "learning_rate": 1.7957197086679467e-05, + "loss": 0.8554, + "num_tokens": 17040537116.0, + "step": 4077 + }, + { + "epoch": 0.4846108140225787, + "grad_norm": 0.42299613353952936, + "learning_rate": 1.7956098234847647e-05, + "loss": 0.8927, + "num_tokens": 17044705176.0, + "step": 4078 + }, + { + "epoch": 0.48472964943553176, + "grad_norm": 0.5406084866327185, + "learning_rate": 1.795499912540799e-05, + "loss": 0.8909, + "num_tokens": 17048869283.0, + "step": 4079 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.5700454847932007, + "learning_rate": 1.79538997584012e-05, + "loss": 0.8862, + "num_tokens": 17053057601.0, + "step": 4080 + }, + { + "epoch": 0.4849673202614379, + "grad_norm": 0.4072604311107206, + "learning_rate": 1.7952800133867993e-05, + "loss": 0.9, + "num_tokens": 17057245979.0, + "step": 4081 + }, + { + "epoch": 0.48508615567439095, + "grad_norm": 0.5016538071322817, + "learning_rate": 1.7951700251849085e-05, + "loss": 0.8364, + "num_tokens": 17061435663.0, + "step": 4082 + }, + { + "epoch": 0.48520499108734405, + "grad_norm": 0.4817164878311006, + "learning_rate": 1.7950600112385216e-05, + "loss": 0.9212, + "num_tokens": 17065625841.0, + "step": 4083 + }, + { + "epoch": 0.4853238265002971, + "grad_norm": 0.45325525742340805, + "learning_rate": 1.7949499715517125e-05, + "loss": 0.8661, + "num_tokens": 17069792149.0, + "step": 4084 + }, + { + "epoch": 0.48544266191325014, + "grad_norm": 0.49920360441248285, + "learning_rate": 1.7948399061285558e-05, + "loss": 0.887, + "num_tokens": 17073982140.0, + "step": 4085 + }, + { + "epoch": 0.4855614973262032, + "grad_norm": 0.4923998324792292, + "learning_rate": 1.7947298149731283e-05, + "loss": 0.856, + "num_tokens": 17078140149.0, + "step": 4086 + }, + { + "epoch": 0.4856803327391563, + "grad_norm": 0.539583355912313, + "learning_rate": 1.794619698089507e-05, + "loss": 0.8987, + "num_tokens": 17082329543.0, + "step": 4087 + }, + { + "epoch": 0.48579916815210933, + "grad_norm": 0.5221527313160769, + "learning_rate": 1.7945095554817695e-05, + "loss": 0.8861, + "num_tokens": 17086518660.0, + "step": 4088 + }, + { + "epoch": 0.4859180035650624, + "grad_norm": 0.5381379659893172, + "learning_rate": 1.7943993871539952e-05, + "loss": 0.8962, + "num_tokens": 17090707731.0, + "step": 4089 + }, + { + "epoch": 0.4860368389780155, + "grad_norm": 0.4893252704537432, + "learning_rate": 1.7942891931102634e-05, + "loss": 0.8866, + "num_tokens": 17094896294.0, + "step": 4090 + }, + { + "epoch": 0.4861556743909685, + "grad_norm": 0.5683861837615972, + "learning_rate": 1.7941789733546553e-05, + "loss": 0.8651, + "num_tokens": 17099068257.0, + "step": 4091 + }, + { + "epoch": 0.48627450980392156, + "grad_norm": 0.4422914652501378, + "learning_rate": 1.794068727891253e-05, + "loss": 0.8716, + "num_tokens": 17103246477.0, + "step": 4092 + }, + { + "epoch": 0.4863933452168746, + "grad_norm": 0.48357814820956246, + "learning_rate": 1.793958456724139e-05, + "loss": 0.865, + "num_tokens": 17107435689.0, + "step": 4093 + }, + { + "epoch": 0.4865121806298277, + "grad_norm": 0.48444367654066933, + "learning_rate": 1.793848159857397e-05, + "loss": 0.8634, + "num_tokens": 17111624195.0, + "step": 4094 + }, + { + "epoch": 0.48663101604278075, + "grad_norm": 0.47451679283402276, + "learning_rate": 1.7937378372951116e-05, + "loss": 0.8538, + "num_tokens": 17115812566.0, + "step": 4095 + }, + { + "epoch": 0.4867498514557338, + "grad_norm": 0.5648674220617965, + "learning_rate": 1.7936274890413684e-05, + "loss": 0.8875, + "num_tokens": 17120000184.0, + "step": 4096 + }, + { + "epoch": 0.4868686868686869, + "grad_norm": 0.5023918990750459, + "learning_rate": 1.7935171151002544e-05, + "loss": 0.8892, + "num_tokens": 17124188059.0, + "step": 4097 + }, + { + "epoch": 0.48698752228163994, + "grad_norm": 0.5581499122406645, + "learning_rate": 1.7934067154758565e-05, + "loss": 0.8467, + "num_tokens": 17128359068.0, + "step": 4098 + }, + { + "epoch": 0.487106357694593, + "grad_norm": 0.46713676436435536, + "learning_rate": 1.7932962901722638e-05, + "loss": 0.8962, + "num_tokens": 17132547705.0, + "step": 4099 + }, + { + "epoch": 0.48722519310754603, + "grad_norm": 0.5501794323299324, + "learning_rate": 1.7931858391935652e-05, + "loss": 0.8925, + "num_tokens": 17136695487.0, + "step": 4100 + }, + { + "epoch": 0.48734402852049913, + "grad_norm": 0.5111302391688639, + "learning_rate": 1.7930753625438512e-05, + "loss": 0.8394, + "num_tokens": 17140855519.0, + "step": 4101 + }, + { + "epoch": 0.4874628639334522, + "grad_norm": 0.4796845349135678, + "learning_rate": 1.7929648602272128e-05, + "loss": 0.8837, + "num_tokens": 17145044581.0, + "step": 4102 + }, + { + "epoch": 0.4875816993464052, + "grad_norm": 0.4751236311714162, + "learning_rate": 1.7928543322477434e-05, + "loss": 0.8653, + "num_tokens": 17149232823.0, + "step": 4103 + }, + { + "epoch": 0.48770053475935826, + "grad_norm": 0.5209458747918092, + "learning_rate": 1.7927437786095352e-05, + "loss": 0.9082, + "num_tokens": 17153421044.0, + "step": 4104 + }, + { + "epoch": 0.48781937017231136, + "grad_norm": 0.5069741167737449, + "learning_rate": 1.7926331993166825e-05, + "loss": 0.8678, + "num_tokens": 17157590971.0, + "step": 4105 + }, + { + "epoch": 0.4879382055852644, + "grad_norm": 0.49025645838730747, + "learning_rate": 1.7925225943732808e-05, + "loss": 0.8877, + "num_tokens": 17161779377.0, + "step": 4106 + }, + { + "epoch": 0.48805704099821745, + "grad_norm": 0.48553040063943287, + "learning_rate": 1.792411963783426e-05, + "loss": 0.8656, + "num_tokens": 17165968947.0, + "step": 4107 + }, + { + "epoch": 0.48817587641117055, + "grad_norm": 0.43099369445519936, + "learning_rate": 1.7923013075512147e-05, + "loss": 0.8948, + "num_tokens": 17170160049.0, + "step": 4108 + }, + { + "epoch": 0.4882947118241236, + "grad_norm": 0.6044168694413867, + "learning_rate": 1.7921906256807455e-05, + "loss": 0.8662, + "num_tokens": 17174347843.0, + "step": 4109 + }, + { + "epoch": 0.48841354723707664, + "grad_norm": 0.42201025746803345, + "learning_rate": 1.7920799181761168e-05, + "loss": 0.877, + "num_tokens": 17178518422.0, + "step": 4110 + }, + { + "epoch": 0.4885323826500297, + "grad_norm": 0.5772017296923978, + "learning_rate": 1.7919691850414292e-05, + "loss": 0.9249, + "num_tokens": 17182679518.0, + "step": 4111 + }, + { + "epoch": 0.4886512180629828, + "grad_norm": 0.38994162252616915, + "learning_rate": 1.791858426280783e-05, + "loss": 0.9008, + "num_tokens": 17186869816.0, + "step": 4112 + }, + { + "epoch": 0.48877005347593583, + "grad_norm": 0.6108822950821776, + "learning_rate": 1.7917476418982798e-05, + "loss": 0.8717, + "num_tokens": 17191003477.0, + "step": 4113 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.46087212925271337, + "learning_rate": 1.7916368318980227e-05, + "loss": 0.8957, + "num_tokens": 17195193037.0, + "step": 4114 + }, + { + "epoch": 0.489007724301842, + "grad_norm": 0.5471955553242441, + "learning_rate": 1.7915259962841154e-05, + "loss": 0.8856, + "num_tokens": 17199354972.0, + "step": 4115 + }, + { + "epoch": 0.489126559714795, + "grad_norm": 0.6056904937045122, + "learning_rate": 1.791415135060662e-05, + "loss": 0.8944, + "num_tokens": 17203520007.0, + "step": 4116 + }, + { + "epoch": 0.48924539512774806, + "grad_norm": 0.4491769502986578, + "learning_rate": 1.7913042482317685e-05, + "loss": 0.8678, + "num_tokens": 17207666247.0, + "step": 4117 + }, + { + "epoch": 0.4893642305407011, + "grad_norm": 0.5547433498309015, + "learning_rate": 1.7911933358015413e-05, + "loss": 0.8866, + "num_tokens": 17211855827.0, + "step": 4118 + }, + { + "epoch": 0.4894830659536542, + "grad_norm": 0.5069665683672075, + "learning_rate": 1.7910823977740883e-05, + "loss": 0.8474, + "num_tokens": 17216044967.0, + "step": 4119 + }, + { + "epoch": 0.48960190136660725, + "grad_norm": 0.650811048729518, + "learning_rate": 1.790971434153517e-05, + "loss": 0.8892, + "num_tokens": 17220235720.0, + "step": 4120 + }, + { + "epoch": 0.4897207367795603, + "grad_norm": 0.46176958601574186, + "learning_rate": 1.7908604449439376e-05, + "loss": 0.8754, + "num_tokens": 17224423342.0, + "step": 4121 + }, + { + "epoch": 0.4898395721925134, + "grad_norm": 0.5126820415712893, + "learning_rate": 1.7907494301494602e-05, + "loss": 0.8269, + "num_tokens": 17228612703.0, + "step": 4122 + }, + { + "epoch": 0.48995840760546644, + "grad_norm": 0.45328466176922705, + "learning_rate": 1.7906383897741956e-05, + "loss": 0.8754, + "num_tokens": 17232777159.0, + "step": 4123 + }, + { + "epoch": 0.4900772430184195, + "grad_norm": 0.5816364737416593, + "learning_rate": 1.7905273238222564e-05, + "loss": 0.8738, + "num_tokens": 17236965995.0, + "step": 4124 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.4970208544152149, + "learning_rate": 1.7904162322977556e-05, + "loss": 0.8584, + "num_tokens": 17241129181.0, + "step": 4125 + }, + { + "epoch": 0.49031491384432563, + "grad_norm": 0.5954864489940552, + "learning_rate": 1.7903051152048073e-05, + "loss": 0.8899, + "num_tokens": 17245317974.0, + "step": 4126 + }, + { + "epoch": 0.4904337492572787, + "grad_norm": 0.5039963562812405, + "learning_rate": 1.7901939725475268e-05, + "loss": 0.8532, + "num_tokens": 17249506168.0, + "step": 4127 + }, + { + "epoch": 0.4905525846702317, + "grad_norm": 0.4865327901268863, + "learning_rate": 1.79008280433003e-05, + "loss": 0.8555, + "num_tokens": 17253695947.0, + "step": 4128 + }, + { + "epoch": 0.49067142008318476, + "grad_norm": 0.4594380388057427, + "learning_rate": 1.7899716105564335e-05, + "loss": 0.861, + "num_tokens": 17257884632.0, + "step": 4129 + }, + { + "epoch": 0.49079025549613786, + "grad_norm": 0.5786782039996621, + "learning_rate": 1.7898603912308555e-05, + "loss": 0.8667, + "num_tokens": 17262072231.0, + "step": 4130 + }, + { + "epoch": 0.4909090909090909, + "grad_norm": 0.5467963414285814, + "learning_rate": 1.7897491463574147e-05, + "loss": 0.8847, + "num_tokens": 17266236409.0, + "step": 4131 + }, + { + "epoch": 0.49102792632204395, + "grad_norm": 0.5283724441203558, + "learning_rate": 1.789637875940231e-05, + "loss": 0.8814, + "num_tokens": 17270417166.0, + "step": 4132 + }, + { + "epoch": 0.49114676173499705, + "grad_norm": 0.5062001659582689, + "learning_rate": 1.789526579983425e-05, + "loss": 0.8976, + "num_tokens": 17274604881.0, + "step": 4133 + }, + { + "epoch": 0.4912655971479501, + "grad_norm": 0.46981258686119226, + "learning_rate": 1.7894152584911183e-05, + "loss": 0.8734, + "num_tokens": 17278795711.0, + "step": 4134 + }, + { + "epoch": 0.49138443256090314, + "grad_norm": 0.6821084618564475, + "learning_rate": 1.789303911467434e-05, + "loss": 0.8953, + "num_tokens": 17282985402.0, + "step": 4135 + }, + { + "epoch": 0.4915032679738562, + "grad_norm": 0.470236429102819, + "learning_rate": 1.7891925389164946e-05, + "loss": 0.7975, + "num_tokens": 17287110593.0, + "step": 4136 + }, + { + "epoch": 0.4916221033868093, + "grad_norm": 0.6403222206905189, + "learning_rate": 1.789081140842426e-05, + "loss": 0.9055, + "num_tokens": 17291294540.0, + "step": 4137 + }, + { + "epoch": 0.49174093879976233, + "grad_norm": 0.5079725533462685, + "learning_rate": 1.7889697172493524e-05, + "loss": 0.8674, + "num_tokens": 17295468567.0, + "step": 4138 + }, + { + "epoch": 0.4918597742127154, + "grad_norm": 0.5598915681477685, + "learning_rate": 1.788858268141401e-05, + "loss": 0.8824, + "num_tokens": 17299655944.0, + "step": 4139 + }, + { + "epoch": 0.4919786096256685, + "grad_norm": 0.5572343354256052, + "learning_rate": 1.7887467935226984e-05, + "loss": 0.8866, + "num_tokens": 17303825582.0, + "step": 4140 + }, + { + "epoch": 0.4920974450386215, + "grad_norm": 0.5004008147006686, + "learning_rate": 1.7886352933973738e-05, + "loss": 0.8677, + "num_tokens": 17308016520.0, + "step": 4141 + }, + { + "epoch": 0.49221628045157456, + "grad_norm": 0.5118229761715025, + "learning_rate": 1.7885237677695554e-05, + "loss": 0.8778, + "num_tokens": 17312206118.0, + "step": 4142 + }, + { + "epoch": 0.4923351158645276, + "grad_norm": 0.5722270365781301, + "learning_rate": 1.7884122166433745e-05, + "loss": 0.8813, + "num_tokens": 17316341171.0, + "step": 4143 + }, + { + "epoch": 0.4924539512774807, + "grad_norm": 0.4505683459893076, + "learning_rate": 1.7883006400229615e-05, + "loss": 0.8755, + "num_tokens": 17320531705.0, + "step": 4144 + }, + { + "epoch": 0.49257278669043375, + "grad_norm": 0.6162408020323791, + "learning_rate": 1.7881890379124485e-05, + "loss": 0.8931, + "num_tokens": 17324690368.0, + "step": 4145 + }, + { + "epoch": 0.4926916221033868, + "grad_norm": 0.48565614699765297, + "learning_rate": 1.7880774103159687e-05, + "loss": 0.8658, + "num_tokens": 17328823858.0, + "step": 4146 + }, + { + "epoch": 0.4928104575163399, + "grad_norm": 0.474171664219316, + "learning_rate": 1.7879657572376558e-05, + "loss": 0.8676, + "num_tokens": 17332986839.0, + "step": 4147 + }, + { + "epoch": 0.49292929292929294, + "grad_norm": 0.5805714933292092, + "learning_rate": 1.7878540786816448e-05, + "loss": 0.8467, + "num_tokens": 17337144792.0, + "step": 4148 + }, + { + "epoch": 0.493048128342246, + "grad_norm": 0.4742823848165484, + "learning_rate": 1.787742374652072e-05, + "loss": 0.8717, + "num_tokens": 17341332762.0, + "step": 4149 + }, + { + "epoch": 0.49316696375519903, + "grad_norm": 0.6445739774772548, + "learning_rate": 1.7876306451530733e-05, + "loss": 0.8589, + "num_tokens": 17345521746.0, + "step": 4150 + }, + { + "epoch": 0.49328579916815213, + "grad_norm": 0.38540511205964206, + "learning_rate": 1.787518890188787e-05, + "loss": 0.8503, + "num_tokens": 17349710146.0, + "step": 4151 + }, + { + "epoch": 0.4934046345811052, + "grad_norm": 0.5017609619286315, + "learning_rate": 1.7874071097633516e-05, + "loss": 0.8589, + "num_tokens": 17353885104.0, + "step": 4152 + }, + { + "epoch": 0.4935234699940582, + "grad_norm": 0.5729973674866404, + "learning_rate": 1.787295303880907e-05, + "loss": 0.8615, + "num_tokens": 17358069546.0, + "step": 4153 + }, + { + "epoch": 0.4936423054070113, + "grad_norm": 0.5438271705300539, + "learning_rate": 1.7871834725455933e-05, + "loss": 0.8734, + "num_tokens": 17362242321.0, + "step": 4154 + }, + { + "epoch": 0.49376114081996436, + "grad_norm": 0.47034294807655985, + "learning_rate": 1.7870716157615524e-05, + "loss": 0.8972, + "num_tokens": 17366417826.0, + "step": 4155 + }, + { + "epoch": 0.4938799762329174, + "grad_norm": 0.4596742230451391, + "learning_rate": 1.7869597335329262e-05, + "loss": 0.8805, + "num_tokens": 17370590228.0, + "step": 4156 + }, + { + "epoch": 0.49399881164587045, + "grad_norm": 0.5317228915721871, + "learning_rate": 1.7868478258638583e-05, + "loss": 0.8573, + "num_tokens": 17374777909.0, + "step": 4157 + }, + { + "epoch": 0.49411764705882355, + "grad_norm": 0.4704497110477096, + "learning_rate": 1.7867358927584933e-05, + "loss": 0.8976, + "num_tokens": 17378940569.0, + "step": 4158 + }, + { + "epoch": 0.4942364824717766, + "grad_norm": 0.5485827679185469, + "learning_rate": 1.7866239342209766e-05, + "loss": 0.8502, + "num_tokens": 17383070913.0, + "step": 4159 + }, + { + "epoch": 0.49435531788472964, + "grad_norm": 0.44855405249874625, + "learning_rate": 1.786511950255454e-05, + "loss": 0.8688, + "num_tokens": 17387259182.0, + "step": 4160 + }, + { + "epoch": 0.4944741532976827, + "grad_norm": 0.486951144691362, + "learning_rate": 1.7863999408660723e-05, + "loss": 0.8716, + "num_tokens": 17391445833.0, + "step": 4161 + }, + { + "epoch": 0.4945929887106358, + "grad_norm": 0.4749409402582863, + "learning_rate": 1.7862879060569805e-05, + "loss": 0.8619, + "num_tokens": 17395608206.0, + "step": 4162 + }, + { + "epoch": 0.49471182412358883, + "grad_norm": 0.5834402222300787, + "learning_rate": 1.7861758458323267e-05, + "loss": 0.8579, + "num_tokens": 17399798678.0, + "step": 4163 + }, + { + "epoch": 0.4948306595365419, + "grad_norm": 0.4439983887705912, + "learning_rate": 1.7860637601962617e-05, + "loss": 0.898, + "num_tokens": 17403981480.0, + "step": 4164 + }, + { + "epoch": 0.494949494949495, + "grad_norm": 0.4985417059438987, + "learning_rate": 1.7859516491529358e-05, + "loss": 0.8802, + "num_tokens": 17408169158.0, + "step": 4165 + }, + { + "epoch": 0.495068330362448, + "grad_norm": 0.5711897523346199, + "learning_rate": 1.7858395127065013e-05, + "loss": 0.9033, + "num_tokens": 17412358720.0, + "step": 4166 + }, + { + "epoch": 0.49518716577540106, + "grad_norm": 0.4350511091162261, + "learning_rate": 1.7857273508611103e-05, + "loss": 0.8878, + "num_tokens": 17416524786.0, + "step": 4167 + }, + { + "epoch": 0.4953060011883541, + "grad_norm": 0.5162745103019213, + "learning_rate": 1.7856151636209174e-05, + "loss": 0.8848, + "num_tokens": 17420663098.0, + "step": 4168 + }, + { + "epoch": 0.4954248366013072, + "grad_norm": 0.5259070391984731, + "learning_rate": 1.785502950990077e-05, + "loss": 0.8284, + "num_tokens": 17424852431.0, + "step": 4169 + }, + { + "epoch": 0.49554367201426025, + "grad_norm": 0.3869812237952217, + "learning_rate": 1.785390712972744e-05, + "loss": 0.8401, + "num_tokens": 17429032229.0, + "step": 4170 + }, + { + "epoch": 0.4956625074272133, + "grad_norm": 0.5289773350863344, + "learning_rate": 1.7852784495730758e-05, + "loss": 0.8418, + "num_tokens": 17433221509.0, + "step": 4171 + }, + { + "epoch": 0.4957813428401664, + "grad_norm": 0.4567529423199504, + "learning_rate": 1.78516616079523e-05, + "loss": 0.8724, + "num_tokens": 17437411166.0, + "step": 4172 + }, + { + "epoch": 0.49590017825311944, + "grad_norm": 0.44995082456057695, + "learning_rate": 1.7850538466433643e-05, + "loss": 0.8761, + "num_tokens": 17441582351.0, + "step": 4173 + }, + { + "epoch": 0.4960190136660725, + "grad_norm": 0.4372752764843958, + "learning_rate": 1.7849415071216385e-05, + "loss": 0.8613, + "num_tokens": 17445770532.0, + "step": 4174 + }, + { + "epoch": 0.49613784907902553, + "grad_norm": 0.4747612781714717, + "learning_rate": 1.784829142234213e-05, + "loss": 0.8808, + "num_tokens": 17449960274.0, + "step": 4175 + }, + { + "epoch": 0.49625668449197863, + "grad_norm": 0.38528473117420764, + "learning_rate": 1.7847167519852486e-05, + "loss": 0.8759, + "num_tokens": 17454121897.0, + "step": 4176 + }, + { + "epoch": 0.4963755199049317, + "grad_norm": 0.5210700028520412, + "learning_rate": 1.784604336378908e-05, + "loss": 0.8542, + "num_tokens": 17458280435.0, + "step": 4177 + }, + { + "epoch": 0.4964943553178847, + "grad_norm": 0.513718364952417, + "learning_rate": 1.7844918954193544e-05, + "loss": 0.8759, + "num_tokens": 17462470029.0, + "step": 4178 + }, + { + "epoch": 0.4966131907308378, + "grad_norm": 0.46398279634577905, + "learning_rate": 1.784379429110751e-05, + "loss": 0.8605, + "num_tokens": 17466659267.0, + "step": 4179 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 0.5283072660951837, + "learning_rate": 1.7842669374572633e-05, + "loss": 0.8501, + "num_tokens": 17470848757.0, + "step": 4180 + }, + { + "epoch": 0.4968508615567439, + "grad_norm": 0.5282437206861282, + "learning_rate": 1.7841544204630582e-05, + "loss": 0.8855, + "num_tokens": 17475036862.0, + "step": 4181 + }, + { + "epoch": 0.49696969696969695, + "grad_norm": 0.5075520587268801, + "learning_rate": 1.784041878132301e-05, + "loss": 0.849, + "num_tokens": 17479200918.0, + "step": 4182 + }, + { + "epoch": 0.49708853238265005, + "grad_norm": 0.5931534124958656, + "learning_rate": 1.7839293104691607e-05, + "loss": 0.8481, + "num_tokens": 17483390914.0, + "step": 4183 + }, + { + "epoch": 0.4972073677956031, + "grad_norm": 0.4568926082161377, + "learning_rate": 1.7838167174778053e-05, + "loss": 0.8746, + "num_tokens": 17487565918.0, + "step": 4184 + }, + { + "epoch": 0.49732620320855614, + "grad_norm": 0.5443276311801921, + "learning_rate": 1.783704099162405e-05, + "loss": 0.8744, + "num_tokens": 17491744188.0, + "step": 4185 + }, + { + "epoch": 0.4974450386215092, + "grad_norm": 0.4908376836792944, + "learning_rate": 1.78359145552713e-05, + "loss": 0.8781, + "num_tokens": 17495931916.0, + "step": 4186 + }, + { + "epoch": 0.4975638740344623, + "grad_norm": 0.5207251024371558, + "learning_rate": 1.7834787865761522e-05, + "loss": 0.8608, + "num_tokens": 17500120556.0, + "step": 4187 + }, + { + "epoch": 0.49768270944741533, + "grad_norm": 0.5541127296004072, + "learning_rate": 1.783366092313644e-05, + "loss": 0.8598, + "num_tokens": 17504249151.0, + "step": 4188 + }, + { + "epoch": 0.4978015448603684, + "grad_norm": 0.436362755541731, + "learning_rate": 1.7832533727437793e-05, + "loss": 0.8731, + "num_tokens": 17508431937.0, + "step": 4189 + }, + { + "epoch": 0.4979203802733215, + "grad_norm": 0.5310331642948309, + "learning_rate": 1.783140627870732e-05, + "loss": 0.8646, + "num_tokens": 17512620242.0, + "step": 4190 + }, + { + "epoch": 0.4980392156862745, + "grad_norm": 0.4557138328764536, + "learning_rate": 1.7830278576986773e-05, + "loss": 0.8688, + "num_tokens": 17516808847.0, + "step": 4191 + }, + { + "epoch": 0.49815805109922756, + "grad_norm": 0.5510027379177526, + "learning_rate": 1.7829150622317914e-05, + "loss": 0.8588, + "num_tokens": 17520998447.0, + "step": 4192 + }, + { + "epoch": 0.4982768865121806, + "grad_norm": 0.5088008759031962, + "learning_rate": 1.7828022414742523e-05, + "loss": 0.8698, + "num_tokens": 17525178158.0, + "step": 4193 + }, + { + "epoch": 0.4983957219251337, + "grad_norm": 0.47148201716986843, + "learning_rate": 1.782689395430237e-05, + "loss": 0.8474, + "num_tokens": 17529367765.0, + "step": 4194 + }, + { + "epoch": 0.49851455733808675, + "grad_norm": 0.5029115698368178, + "learning_rate": 1.7825765241039257e-05, + "loss": 0.8657, + "num_tokens": 17533555638.0, + "step": 4195 + }, + { + "epoch": 0.4986333927510398, + "grad_norm": 0.5068704292284605, + "learning_rate": 1.7824636274994976e-05, + "loss": 0.8533, + "num_tokens": 17537738704.0, + "step": 4196 + }, + { + "epoch": 0.4987522281639929, + "grad_norm": 0.5277349218450185, + "learning_rate": 1.7823507056211338e-05, + "loss": 0.8666, + "num_tokens": 17541928092.0, + "step": 4197 + }, + { + "epoch": 0.49887106357694594, + "grad_norm": 0.4417137229778438, + "learning_rate": 1.7822377584730165e-05, + "loss": 0.8891, + "num_tokens": 17546098256.0, + "step": 4198 + }, + { + "epoch": 0.498989898989899, + "grad_norm": 0.5376126331475473, + "learning_rate": 1.782124786059328e-05, + "loss": 0.8967, + "num_tokens": 17550287616.0, + "step": 4199 + }, + { + "epoch": 0.49910873440285203, + "grad_norm": 0.4524742984621469, + "learning_rate": 1.7820117883842528e-05, + "loss": 0.8861, + "num_tokens": 17554477426.0, + "step": 4200 + }, + { + "epoch": 0.49922756981580513, + "grad_norm": 0.5136176264224795, + "learning_rate": 1.7818987654519747e-05, + "loss": 0.8627, + "num_tokens": 17558667001.0, + "step": 4201 + }, + { + "epoch": 0.4993464052287582, + "grad_norm": 0.5669723005845396, + "learning_rate": 1.7817857172666802e-05, + "loss": 0.8893, + "num_tokens": 17562837528.0, + "step": 4202 + }, + { + "epoch": 0.4994652406417112, + "grad_norm": 0.4001788615221839, + "learning_rate": 1.781672643832555e-05, + "loss": 0.8948, + "num_tokens": 17567021893.0, + "step": 4203 + }, + { + "epoch": 0.4995840760546643, + "grad_norm": 0.629077467823576, + "learning_rate": 1.7815595451537874e-05, + "loss": 0.865, + "num_tokens": 17571192990.0, + "step": 4204 + }, + { + "epoch": 0.49970291146761736, + "grad_norm": 0.45424850257835153, + "learning_rate": 1.7814464212345653e-05, + "loss": 0.8606, + "num_tokens": 17575384406.0, + "step": 4205 + }, + { + "epoch": 0.4998217468805704, + "grad_norm": 0.5013629817055685, + "learning_rate": 1.7813332720790783e-05, + "loss": 0.8839, + "num_tokens": 17579574345.0, + "step": 4206 + }, + { + "epoch": 0.49994058229352345, + "grad_norm": 0.5023040747022679, + "learning_rate": 1.7812200976915162e-05, + "loss": 0.8295, + "num_tokens": 17583762946.0, + "step": 4207 + }, + { + "epoch": 0.5000594177064765, + "grad_norm": 0.5218977926451134, + "learning_rate": 1.781106898076071e-05, + "loss": 0.9198, + "num_tokens": 17587944383.0, + "step": 4208 + }, + { + "epoch": 0.5001782531194295, + "grad_norm": 0.435643518695323, + "learning_rate": 1.780993673236934e-05, + "loss": 0.9212, + "num_tokens": 17592133964.0, + "step": 4209 + }, + { + "epoch": 0.5002970885323826, + "grad_norm": 0.49640933282857785, + "learning_rate": 1.780880423178299e-05, + "loss": 0.8399, + "num_tokens": 17596321420.0, + "step": 4210 + }, + { + "epoch": 0.5004159239453357, + "grad_norm": 0.4331685101665654, + "learning_rate": 1.7807671479043603e-05, + "loss": 0.8874, + "num_tokens": 17600510092.0, + "step": 4211 + }, + { + "epoch": 0.5005347593582887, + "grad_norm": 0.48429676500661306, + "learning_rate": 1.780653847419312e-05, + "loss": 0.9013, + "num_tokens": 17604698291.0, + "step": 4212 + }, + { + "epoch": 0.5006535947712418, + "grad_norm": 0.5207503664095832, + "learning_rate": 1.7805405217273503e-05, + "loss": 0.8382, + "num_tokens": 17608888387.0, + "step": 4213 + }, + { + "epoch": 0.5007724301841949, + "grad_norm": 0.4732596750983197, + "learning_rate": 1.780427170832672e-05, + "loss": 0.8947, + "num_tokens": 17613058794.0, + "step": 4214 + }, + { + "epoch": 0.5008912655971479, + "grad_norm": 0.4640068117100981, + "learning_rate": 1.7803137947394754e-05, + "loss": 0.8797, + "num_tokens": 17617247163.0, + "step": 4215 + }, + { + "epoch": 0.501010101010101, + "grad_norm": 0.4603177269702851, + "learning_rate": 1.7802003934519587e-05, + "loss": 0.8812, + "num_tokens": 17621420260.0, + "step": 4216 + }, + { + "epoch": 0.5011289364230541, + "grad_norm": 0.46678239804653465, + "learning_rate": 1.780086966974321e-05, + "loss": 0.8961, + "num_tokens": 17625606919.0, + "step": 4217 + }, + { + "epoch": 0.5012477718360071, + "grad_norm": 0.5060716544061137, + "learning_rate": 1.7799735153107643e-05, + "loss": 0.8998, + "num_tokens": 17629795601.0, + "step": 4218 + }, + { + "epoch": 0.5013666072489602, + "grad_norm": 0.49718618872616605, + "learning_rate": 1.7798600384654893e-05, + "loss": 0.8778, + "num_tokens": 17633959943.0, + "step": 4219 + }, + { + "epoch": 0.5014854426619133, + "grad_norm": 0.4786857722892321, + "learning_rate": 1.779746536442698e-05, + "loss": 0.8962, + "num_tokens": 17638149597.0, + "step": 4220 + }, + { + "epoch": 0.5016042780748663, + "grad_norm": 0.5445506203480583, + "learning_rate": 1.779633009246594e-05, + "loss": 0.9024, + "num_tokens": 17642338842.0, + "step": 4221 + }, + { + "epoch": 0.5017231134878194, + "grad_norm": 0.5024695707521528, + "learning_rate": 1.7795194568813824e-05, + "loss": 0.8779, + "num_tokens": 17646496937.0, + "step": 4222 + }, + { + "epoch": 0.5018419489007724, + "grad_norm": 0.47967865685986383, + "learning_rate": 1.779405879351267e-05, + "loss": 0.8728, + "num_tokens": 17650685868.0, + "step": 4223 + }, + { + "epoch": 0.5019607843137255, + "grad_norm": 0.42205936610501293, + "learning_rate": 1.7792922766604557e-05, + "loss": 0.8764, + "num_tokens": 17654856170.0, + "step": 4224 + }, + { + "epoch": 0.5020796197266786, + "grad_norm": 0.4837943391196337, + "learning_rate": 1.7791786488131538e-05, + "loss": 0.8923, + "num_tokens": 17659014995.0, + "step": 4225 + }, + { + "epoch": 0.5021984551396316, + "grad_norm": 0.4605904027838433, + "learning_rate": 1.7790649958135707e-05, + "loss": 0.9058, + "num_tokens": 17663201700.0, + "step": 4226 + }, + { + "epoch": 0.5023172905525847, + "grad_norm": 0.5327417213983517, + "learning_rate": 1.778951317665915e-05, + "loss": 0.8405, + "num_tokens": 17667391259.0, + "step": 4227 + }, + { + "epoch": 0.5024361259655378, + "grad_norm": 0.5265604240298286, + "learning_rate": 1.778837614374396e-05, + "loss": 0.8641, + "num_tokens": 17671525424.0, + "step": 4228 + }, + { + "epoch": 0.5025549613784908, + "grad_norm": 0.394858087712515, + "learning_rate": 1.778723885943225e-05, + "loss": 0.8414, + "num_tokens": 17675715472.0, + "step": 4229 + }, + { + "epoch": 0.5026737967914439, + "grad_norm": 0.550436331034935, + "learning_rate": 1.7786101323766135e-05, + "loss": 0.8709, + "num_tokens": 17679880847.0, + "step": 4230 + }, + { + "epoch": 0.502792632204397, + "grad_norm": 0.4477099872215294, + "learning_rate": 1.7784963536787746e-05, + "loss": 0.8391, + "num_tokens": 17684063125.0, + "step": 4231 + }, + { + "epoch": 0.50291146761735, + "grad_norm": 0.5313985484370515, + "learning_rate": 1.7783825498539218e-05, + "loss": 0.8628, + "num_tokens": 17688251107.0, + "step": 4232 + }, + { + "epoch": 0.503030303030303, + "grad_norm": 0.4178974032277717, + "learning_rate": 1.7782687209062696e-05, + "loss": 0.8881, + "num_tokens": 17692440701.0, + "step": 4233 + }, + { + "epoch": 0.503149138443256, + "grad_norm": 0.45374841966073, + "learning_rate": 1.778154866840033e-05, + "loss": 0.875, + "num_tokens": 17696607891.0, + "step": 4234 + }, + { + "epoch": 0.5032679738562091, + "grad_norm": 0.4605159809104839, + "learning_rate": 1.7780409876594287e-05, + "loss": 0.9037, + "num_tokens": 17700795503.0, + "step": 4235 + }, + { + "epoch": 0.5033868092691622, + "grad_norm": 0.5142339593979776, + "learning_rate": 1.7779270833686746e-05, + "loss": 0.8871, + "num_tokens": 17704983529.0, + "step": 4236 + }, + { + "epoch": 0.5035056446821152, + "grad_norm": 0.4553935105008381, + "learning_rate": 1.777813153971988e-05, + "loss": 0.8419, + "num_tokens": 17709173895.0, + "step": 4237 + }, + { + "epoch": 0.5036244800950683, + "grad_norm": 0.4527693566250573, + "learning_rate": 1.7776991994735888e-05, + "loss": 0.8676, + "num_tokens": 17713362576.0, + "step": 4238 + }, + { + "epoch": 0.5037433155080214, + "grad_norm": 0.45378102247933144, + "learning_rate": 1.7775852198776968e-05, + "loss": 0.8814, + "num_tokens": 17717543798.0, + "step": 4239 + }, + { + "epoch": 0.5038621509209744, + "grad_norm": 0.4705514807081525, + "learning_rate": 1.7774712151885334e-05, + "loss": 0.8849, + "num_tokens": 17721732360.0, + "step": 4240 + }, + { + "epoch": 0.5039809863339275, + "grad_norm": 0.43355393016610827, + "learning_rate": 1.77735718541032e-05, + "loss": 0.8492, + "num_tokens": 17725921938.0, + "step": 4241 + }, + { + "epoch": 0.5040998217468806, + "grad_norm": 0.5327989015256062, + "learning_rate": 1.77724313054728e-05, + "loss": 0.8584, + "num_tokens": 17730104164.0, + "step": 4242 + }, + { + "epoch": 0.5042186571598336, + "grad_norm": 0.5501313771988112, + "learning_rate": 1.7771290506036368e-05, + "loss": 0.8717, + "num_tokens": 17734293310.0, + "step": 4243 + }, + { + "epoch": 0.5043374925727867, + "grad_norm": 0.4697999161229568, + "learning_rate": 1.777014945583616e-05, + "loss": 0.8685, + "num_tokens": 17738471045.0, + "step": 4244 + }, + { + "epoch": 0.5044563279857398, + "grad_norm": 0.4978084205159927, + "learning_rate": 1.7769008154914422e-05, + "loss": 0.8718, + "num_tokens": 17742622154.0, + "step": 4245 + }, + { + "epoch": 0.5045751633986928, + "grad_norm": 0.5987687108333122, + "learning_rate": 1.7767866603313427e-05, + "loss": 0.89, + "num_tokens": 17746782095.0, + "step": 4246 + }, + { + "epoch": 0.5046939988116459, + "grad_norm": 0.46289295714545975, + "learning_rate": 1.776672480107545e-05, + "loss": 0.8436, + "num_tokens": 17750939497.0, + "step": 4247 + }, + { + "epoch": 0.5048128342245989, + "grad_norm": 0.48880148765107057, + "learning_rate": 1.7765582748242772e-05, + "loss": 0.9105, + "num_tokens": 17755096729.0, + "step": 4248 + }, + { + "epoch": 0.504931669637552, + "grad_norm": 0.5301696586681836, + "learning_rate": 1.7764440444857697e-05, + "loss": 0.9394, + "num_tokens": 17759263625.0, + "step": 4249 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.5307666714467159, + "learning_rate": 1.7763297890962514e-05, + "loss": 0.8738, + "num_tokens": 17763439034.0, + "step": 4250 + }, + { + "epoch": 0.5051693404634581, + "grad_norm": 0.5236608831119699, + "learning_rate": 1.776215508659954e-05, + "loss": 0.8612, + "num_tokens": 17767628369.0, + "step": 4251 + }, + { + "epoch": 0.5052881758764112, + "grad_norm": 0.5531960042971384, + "learning_rate": 1.776101203181111e-05, + "loss": 0.864, + "num_tokens": 17771793284.0, + "step": 4252 + }, + { + "epoch": 0.5054070112893643, + "grad_norm": 0.48161889451086504, + "learning_rate": 1.7759868726639538e-05, + "loss": 0.892, + "num_tokens": 17775983516.0, + "step": 4253 + }, + { + "epoch": 0.5055258467023173, + "grad_norm": 0.43645049883511544, + "learning_rate": 1.7758725171127173e-05, + "loss": 0.8503, + "num_tokens": 17780172933.0, + "step": 4254 + }, + { + "epoch": 0.5056446821152704, + "grad_norm": 0.5035644187282515, + "learning_rate": 1.7757581365316364e-05, + "loss": 0.8928, + "num_tokens": 17784361261.0, + "step": 4255 + }, + { + "epoch": 0.5057635175282235, + "grad_norm": 0.42447656957139007, + "learning_rate": 1.775643730924947e-05, + "loss": 0.8598, + "num_tokens": 17788551526.0, + "step": 4256 + }, + { + "epoch": 0.5058823529411764, + "grad_norm": 0.5092945625843933, + "learning_rate": 1.7755293002968854e-05, + "loss": 0.8671, + "num_tokens": 17792739502.0, + "step": 4257 + }, + { + "epoch": 0.5060011883541295, + "grad_norm": 0.4827362555301213, + "learning_rate": 1.7754148446516902e-05, + "loss": 0.8576, + "num_tokens": 17796929554.0, + "step": 4258 + }, + { + "epoch": 0.5061200237670825, + "grad_norm": 0.5058365137642246, + "learning_rate": 1.7753003639935993e-05, + "loss": 0.8809, + "num_tokens": 17801117948.0, + "step": 4259 + }, + { + "epoch": 0.5062388591800356, + "grad_norm": 0.5550969572571502, + "learning_rate": 1.7751858583268528e-05, + "loss": 0.8719, + "num_tokens": 17805307720.0, + "step": 4260 + }, + { + "epoch": 0.5063576945929887, + "grad_norm": 0.49278114232855746, + "learning_rate": 1.7750713276556916e-05, + "loss": 0.8653, + "num_tokens": 17809495826.0, + "step": 4261 + }, + { + "epoch": 0.5064765300059417, + "grad_norm": 0.40434510847505095, + "learning_rate": 1.774956771984356e-05, + "loss": 0.8393, + "num_tokens": 17813685378.0, + "step": 4262 + }, + { + "epoch": 0.5065953654188948, + "grad_norm": 0.5708425773929815, + "learning_rate": 1.774842191317089e-05, + "loss": 0.8841, + "num_tokens": 17817874041.0, + "step": 4263 + }, + { + "epoch": 0.5067142008318479, + "grad_norm": 0.5012634381498714, + "learning_rate": 1.7747275856581347e-05, + "loss": 0.9176, + "num_tokens": 17822063797.0, + "step": 4264 + }, + { + "epoch": 0.5068330362448009, + "grad_norm": 0.4019739207319479, + "learning_rate": 1.774612955011736e-05, + "loss": 0.8613, + "num_tokens": 17826253824.0, + "step": 4265 + }, + { + "epoch": 0.506951871657754, + "grad_norm": 0.5335736397502087, + "learning_rate": 1.774498299382139e-05, + "loss": 0.8669, + "num_tokens": 17830442623.0, + "step": 4266 + }, + { + "epoch": 0.5070707070707071, + "grad_norm": 0.48538865269472126, + "learning_rate": 1.774383618773589e-05, + "loss": 0.8785, + "num_tokens": 17834572229.0, + "step": 4267 + }, + { + "epoch": 0.5071895424836601, + "grad_norm": 0.5193353607486307, + "learning_rate": 1.774268913190334e-05, + "loss": 0.8759, + "num_tokens": 17838761151.0, + "step": 4268 + }, + { + "epoch": 0.5073083778966132, + "grad_norm": 0.4804396483481145, + "learning_rate": 1.7741541826366212e-05, + "loss": 0.864, + "num_tokens": 17842943917.0, + "step": 4269 + }, + { + "epoch": 0.5074272133095663, + "grad_norm": 0.4369945320739141, + "learning_rate": 1.7740394271166994e-05, + "loss": 0.8741, + "num_tokens": 17847124995.0, + "step": 4270 + }, + { + "epoch": 0.5075460487225193, + "grad_norm": 0.4702787229927909, + "learning_rate": 1.773924646634819e-05, + "loss": 0.8672, + "num_tokens": 17851286684.0, + "step": 4271 + }, + { + "epoch": 0.5076648841354724, + "grad_norm": 0.5069190181750964, + "learning_rate": 1.77380984119523e-05, + "loss": 0.8738, + "num_tokens": 17855459219.0, + "step": 4272 + }, + { + "epoch": 0.5077837195484254, + "grad_norm": 0.5256266806621204, + "learning_rate": 1.7736950108021844e-05, + "loss": 0.8329, + "num_tokens": 17859647284.0, + "step": 4273 + }, + { + "epoch": 0.5079025549613785, + "grad_norm": 0.45824009439024393, + "learning_rate": 1.773580155459935e-05, + "loss": 0.8604, + "num_tokens": 17863837123.0, + "step": 4274 + }, + { + "epoch": 0.5080213903743316, + "grad_norm": 0.4364566319270536, + "learning_rate": 1.7734652751727345e-05, + "loss": 0.8922, + "num_tokens": 17868000156.0, + "step": 4275 + }, + { + "epoch": 0.5081402257872846, + "grad_norm": 0.5157478936551498, + "learning_rate": 1.7733503699448383e-05, + "loss": 0.8627, + "num_tokens": 17872150799.0, + "step": 4276 + }, + { + "epoch": 0.5082590612002377, + "grad_norm": 0.4666440605969502, + "learning_rate": 1.7732354397805013e-05, + "loss": 0.8419, + "num_tokens": 17876340370.0, + "step": 4277 + }, + { + "epoch": 0.5083778966131908, + "grad_norm": 0.42638241621697315, + "learning_rate": 1.7731204846839795e-05, + "loss": 0.8681, + "num_tokens": 17880529552.0, + "step": 4278 + }, + { + "epoch": 0.5084967320261438, + "grad_norm": 0.5103662709316228, + "learning_rate": 1.7730055046595305e-05, + "loss": 0.8604, + "num_tokens": 17884718504.0, + "step": 4279 + }, + { + "epoch": 0.5086155674390969, + "grad_norm": 0.4626544080129309, + "learning_rate": 1.7728904997114117e-05, + "loss": 0.8542, + "num_tokens": 17888877684.0, + "step": 4280 + }, + { + "epoch": 0.50873440285205, + "grad_norm": 0.4795422213315932, + "learning_rate": 1.772775469843883e-05, + "loss": 0.8863, + "num_tokens": 17893067131.0, + "step": 4281 + }, + { + "epoch": 0.508853238265003, + "grad_norm": 0.5207190415252243, + "learning_rate": 1.7726604150612038e-05, + "loss": 0.8311, + "num_tokens": 17897245765.0, + "step": 4282 + }, + { + "epoch": 0.508972073677956, + "grad_norm": 0.555181626893168, + "learning_rate": 1.7725453353676354e-05, + "loss": 0.8743, + "num_tokens": 17901415962.0, + "step": 4283 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 0.5190307727636814, + "learning_rate": 1.772430230767439e-05, + "loss": 0.8677, + "num_tokens": 17905604335.0, + "step": 4284 + }, + { + "epoch": 0.5092097445038621, + "grad_norm": 0.4703659085614267, + "learning_rate": 1.7723151012648773e-05, + "loss": 0.8799, + "num_tokens": 17909762540.0, + "step": 4285 + }, + { + "epoch": 0.5093285799168152, + "grad_norm": 0.4476678655107253, + "learning_rate": 1.772199946864215e-05, + "loss": 0.8669, + "num_tokens": 17913943750.0, + "step": 4286 + }, + { + "epoch": 0.5094474153297682, + "grad_norm": 0.47770003304293795, + "learning_rate": 1.7720847675697152e-05, + "loss": 0.8731, + "num_tokens": 17918133681.0, + "step": 4287 + }, + { + "epoch": 0.5095662507427213, + "grad_norm": 0.4884407161627992, + "learning_rate": 1.7719695633856444e-05, + "loss": 0.882, + "num_tokens": 17922282578.0, + "step": 4288 + }, + { + "epoch": 0.5096850861556744, + "grad_norm": 0.4569197650038022, + "learning_rate": 1.7718543343162687e-05, + "loss": 0.8858, + "num_tokens": 17926453616.0, + "step": 4289 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.5742439007328676, + "learning_rate": 1.7717390803658555e-05, + "loss": 0.8701, + "num_tokens": 17930642528.0, + "step": 4290 + }, + { + "epoch": 0.5099227569815805, + "grad_norm": 0.45629420308472, + "learning_rate": 1.7716238015386725e-05, + "loss": 0.8723, + "num_tokens": 17934829852.0, + "step": 4291 + }, + { + "epoch": 0.5100415923945336, + "grad_norm": 0.5341252217977578, + "learning_rate": 1.77150849783899e-05, + "loss": 0.8669, + "num_tokens": 17939018225.0, + "step": 4292 + }, + { + "epoch": 0.5101604278074866, + "grad_norm": 0.5676121446584893, + "learning_rate": 1.771393169271077e-05, + "loss": 0.8574, + "num_tokens": 17943207526.0, + "step": 4293 + }, + { + "epoch": 0.5102792632204397, + "grad_norm": 0.4041362328056324, + "learning_rate": 1.771277815839205e-05, + "loss": 0.8514, + "num_tokens": 17947378819.0, + "step": 4294 + }, + { + "epoch": 0.5103980986333928, + "grad_norm": 0.43610607967337267, + "learning_rate": 1.771162437547646e-05, + "loss": 0.8513, + "num_tokens": 17951518569.0, + "step": 4295 + }, + { + "epoch": 0.5105169340463458, + "grad_norm": 0.5255121329327231, + "learning_rate": 1.771047034400672e-05, + "loss": 0.8519, + "num_tokens": 17955664594.0, + "step": 4296 + }, + { + "epoch": 0.5106357694592989, + "grad_norm": 0.5152894258001088, + "learning_rate": 1.770931606402558e-05, + "loss": 0.8808, + "num_tokens": 17959846534.0, + "step": 4297 + }, + { + "epoch": 0.5107546048722519, + "grad_norm": 0.5113915888976834, + "learning_rate": 1.770816153557578e-05, + "loss": 0.8738, + "num_tokens": 17964024547.0, + "step": 4298 + }, + { + "epoch": 0.510873440285205, + "grad_norm": 0.43631984745371577, + "learning_rate": 1.7707006758700077e-05, + "loss": 0.8906, + "num_tokens": 17968204060.0, + "step": 4299 + }, + { + "epoch": 0.5109922756981581, + "grad_norm": 0.4927096418805668, + "learning_rate": 1.770585173344124e-05, + "loss": 0.8632, + "num_tokens": 17972393592.0, + "step": 4300 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.4819138959243854, + "learning_rate": 1.770469645984204e-05, + "loss": 0.8648, + "num_tokens": 17976583200.0, + "step": 4301 + }, + { + "epoch": 0.5112299465240642, + "grad_norm": 0.4604469914835174, + "learning_rate": 1.7703540937945258e-05, + "loss": 0.858, + "num_tokens": 17980772071.0, + "step": 4302 + }, + { + "epoch": 0.5113487819370173, + "grad_norm": 0.3992757060325507, + "learning_rate": 1.7702385167793694e-05, + "loss": 0.867, + "num_tokens": 17984960044.0, + "step": 4303 + }, + { + "epoch": 0.5114676173499703, + "grad_norm": 0.4538087517826556, + "learning_rate": 1.7701229149430143e-05, + "loss": 0.8491, + "num_tokens": 17989123355.0, + "step": 4304 + }, + { + "epoch": 0.5115864527629234, + "grad_norm": 0.5452543118144495, + "learning_rate": 1.770007288289742e-05, + "loss": 0.8333, + "num_tokens": 17993312762.0, + "step": 4305 + }, + { + "epoch": 0.5117052881758765, + "grad_norm": 0.41649421196165815, + "learning_rate": 1.7698916368238343e-05, + "loss": 0.8626, + "num_tokens": 17997499692.0, + "step": 4306 + }, + { + "epoch": 0.5118241235888294, + "grad_norm": 0.3983479040745025, + "learning_rate": 1.7697759605495742e-05, + "loss": 0.8794, + "num_tokens": 18001686222.0, + "step": 4307 + }, + { + "epoch": 0.5119429590017825, + "grad_norm": 0.48826072494292794, + "learning_rate": 1.769660259471246e-05, + "loss": 0.8832, + "num_tokens": 18005874591.0, + "step": 4308 + }, + { + "epoch": 0.5120617944147355, + "grad_norm": 0.4410918832658596, + "learning_rate": 1.7695445335931342e-05, + "loss": 0.8873, + "num_tokens": 18010045035.0, + "step": 4309 + }, + { + "epoch": 0.5121806298276886, + "grad_norm": 0.4831829421072685, + "learning_rate": 1.7694287829195245e-05, + "loss": 0.8811, + "num_tokens": 18014235249.0, + "step": 4310 + }, + { + "epoch": 0.5122994652406417, + "grad_norm": 0.6357088513929717, + "learning_rate": 1.7693130074547038e-05, + "loss": 0.8463, + "num_tokens": 18018425172.0, + "step": 4311 + }, + { + "epoch": 0.5124183006535947, + "grad_norm": 0.4170818653412423, + "learning_rate": 1.7691972072029587e-05, + "loss": 0.8713, + "num_tokens": 18022614257.0, + "step": 4312 + }, + { + "epoch": 0.5125371360665478, + "grad_norm": 0.5272277350338133, + "learning_rate": 1.7690813821685794e-05, + "loss": 0.8696, + "num_tokens": 18026793760.0, + "step": 4313 + }, + { + "epoch": 0.5126559714795009, + "grad_norm": 0.5174608332106302, + "learning_rate": 1.7689655323558537e-05, + "loss": 0.9176, + "num_tokens": 18030980983.0, + "step": 4314 + }, + { + "epoch": 0.5127748068924539, + "grad_norm": 0.5637198306547657, + "learning_rate": 1.7688496577690725e-05, + "loss": 0.8891, + "num_tokens": 18035148224.0, + "step": 4315 + }, + { + "epoch": 0.512893642305407, + "grad_norm": 0.42407952023828804, + "learning_rate": 1.768733758412527e-05, + "loss": 0.881, + "num_tokens": 18039316946.0, + "step": 4316 + }, + { + "epoch": 0.5130124777183601, + "grad_norm": 0.474057651395649, + "learning_rate": 1.7686178342905097e-05, + "loss": 0.86, + "num_tokens": 18043504131.0, + "step": 4317 + }, + { + "epoch": 0.5131313131313131, + "grad_norm": 0.4564799753738343, + "learning_rate": 1.7685018854073136e-05, + "loss": 0.8674, + "num_tokens": 18047666507.0, + "step": 4318 + }, + { + "epoch": 0.5132501485442662, + "grad_norm": 0.4345801432853324, + "learning_rate": 1.7683859117672317e-05, + "loss": 0.8841, + "num_tokens": 18051856382.0, + "step": 4319 + }, + { + "epoch": 0.5133689839572193, + "grad_norm": 0.5018186165462059, + "learning_rate": 1.76826991337456e-05, + "loss": 0.8627, + "num_tokens": 18056046002.0, + "step": 4320 + }, + { + "epoch": 0.5134878193701723, + "grad_norm": 0.4131064137391866, + "learning_rate": 1.7681538902335936e-05, + "loss": 0.8592, + "num_tokens": 18060235286.0, + "step": 4321 + }, + { + "epoch": 0.5136066547831254, + "grad_norm": 0.5779029493642258, + "learning_rate": 1.7680378423486303e-05, + "loss": 0.8823, + "num_tokens": 18064425095.0, + "step": 4322 + }, + { + "epoch": 0.5137254901960784, + "grad_norm": 0.504467449818223, + "learning_rate": 1.7679217697239665e-05, + "loss": 0.8882, + "num_tokens": 18068614863.0, + "step": 4323 + }, + { + "epoch": 0.5138443256090315, + "grad_norm": 0.4876847853632972, + "learning_rate": 1.7678056723639013e-05, + "loss": 0.8869, + "num_tokens": 18072801689.0, + "step": 4324 + }, + { + "epoch": 0.5139631610219846, + "grad_norm": 0.4629551309062822, + "learning_rate": 1.767689550272734e-05, + "loss": 0.8759, + "num_tokens": 18076990833.0, + "step": 4325 + }, + { + "epoch": 0.5140819964349376, + "grad_norm": 0.4377718447156749, + "learning_rate": 1.7675734034547652e-05, + "loss": 0.8695, + "num_tokens": 18081153447.0, + "step": 4326 + }, + { + "epoch": 0.5142008318478907, + "grad_norm": 0.40410322533228, + "learning_rate": 1.7674572319142965e-05, + "loss": 0.9076, + "num_tokens": 18085341483.0, + "step": 4327 + }, + { + "epoch": 0.5143196672608438, + "grad_norm": 0.48422332692876163, + "learning_rate": 1.7673410356556293e-05, + "loss": 0.8908, + "num_tokens": 18089529161.0, + "step": 4328 + }, + { + "epoch": 0.5144385026737968, + "grad_norm": 0.49134303985702116, + "learning_rate": 1.7672248146830672e-05, + "loss": 0.858, + "num_tokens": 18093696057.0, + "step": 4329 + }, + { + "epoch": 0.5145573380867499, + "grad_norm": 0.43897597860210885, + "learning_rate": 1.767108569000915e-05, + "loss": 0.8662, + "num_tokens": 18097862342.0, + "step": 4330 + }, + { + "epoch": 0.514676173499703, + "grad_norm": 0.5650322458289303, + "learning_rate": 1.7669922986134763e-05, + "loss": 0.9023, + "num_tokens": 18102051263.0, + "step": 4331 + }, + { + "epoch": 0.514795008912656, + "grad_norm": 0.43430419860611413, + "learning_rate": 1.7668760035250577e-05, + "loss": 0.8928, + "num_tokens": 18106230489.0, + "step": 4332 + }, + { + "epoch": 0.514913844325609, + "grad_norm": 0.593499808019085, + "learning_rate": 1.7667596837399658e-05, + "loss": 0.9098, + "num_tokens": 18110418992.0, + "step": 4333 + }, + { + "epoch": 0.515032679738562, + "grad_norm": 0.40255456665412453, + "learning_rate": 1.766643339262509e-05, + "loss": 0.8773, + "num_tokens": 18114609887.0, + "step": 4334 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 0.5884105688856648, + "learning_rate": 1.766526970096995e-05, + "loss": 0.8747, + "num_tokens": 18118798959.0, + "step": 4335 + }, + { + "epoch": 0.5152703505644682, + "grad_norm": 0.46682910132207056, + "learning_rate": 1.7664105762477337e-05, + "loss": 0.871, + "num_tokens": 18122915669.0, + "step": 4336 + }, + { + "epoch": 0.5153891859774212, + "grad_norm": 0.47661087674625185, + "learning_rate": 1.7662941577190357e-05, + "loss": 0.8735, + "num_tokens": 18127102094.0, + "step": 4337 + }, + { + "epoch": 0.5155080213903743, + "grad_norm": 0.46504772267541433, + "learning_rate": 1.766177714515212e-05, + "loss": 0.8536, + "num_tokens": 18131290905.0, + "step": 4338 + }, + { + "epoch": 0.5156268568033274, + "grad_norm": 0.466467618365408, + "learning_rate": 1.7660612466405754e-05, + "loss": 0.8396, + "num_tokens": 18135449122.0, + "step": 4339 + }, + { + "epoch": 0.5157456922162804, + "grad_norm": 0.4755833696513891, + "learning_rate": 1.7659447540994388e-05, + "loss": 0.8697, + "num_tokens": 18139620932.0, + "step": 4340 + }, + { + "epoch": 0.5158645276292335, + "grad_norm": 0.5743790906478616, + "learning_rate": 1.7658282368961163e-05, + "loss": 0.9046, + "num_tokens": 18143809576.0, + "step": 4341 + }, + { + "epoch": 0.5159833630421866, + "grad_norm": 0.47330996433448647, + "learning_rate": 1.765711695034923e-05, + "loss": 0.8901, + "num_tokens": 18147998997.0, + "step": 4342 + }, + { + "epoch": 0.5161021984551396, + "grad_norm": 0.5280676260914828, + "learning_rate": 1.7655951285201747e-05, + "loss": 0.8741, + "num_tokens": 18152188493.0, + "step": 4343 + }, + { + "epoch": 0.5162210338680927, + "grad_norm": 0.5053796791607922, + "learning_rate": 1.7654785373561884e-05, + "loss": 0.8841, + "num_tokens": 18156377703.0, + "step": 4344 + }, + { + "epoch": 0.5163398692810458, + "grad_norm": 0.46621831955958876, + "learning_rate": 1.765361921547282e-05, + "loss": 0.8911, + "num_tokens": 18160565748.0, + "step": 4345 + }, + { + "epoch": 0.5164587046939988, + "grad_norm": 0.4009412356842149, + "learning_rate": 1.7652452810977736e-05, + "loss": 0.8703, + "num_tokens": 18164716101.0, + "step": 4346 + }, + { + "epoch": 0.5165775401069519, + "grad_norm": 0.5217436894224841, + "learning_rate": 1.7651286160119835e-05, + "loss": 0.9271, + "num_tokens": 18168905033.0, + "step": 4347 + }, + { + "epoch": 0.5166963755199049, + "grad_norm": 0.5194614520302433, + "learning_rate": 1.7650119262942318e-05, + "loss": 0.8752, + "num_tokens": 18173080010.0, + "step": 4348 + }, + { + "epoch": 0.516815210932858, + "grad_norm": 0.524722794191444, + "learning_rate": 1.7648952119488402e-05, + "loss": 0.8859, + "num_tokens": 18177269616.0, + "step": 4349 + }, + { + "epoch": 0.5169340463458111, + "grad_norm": 0.47978887657497404, + "learning_rate": 1.7647784729801304e-05, + "loss": 0.8495, + "num_tokens": 18181452023.0, + "step": 4350 + }, + { + "epoch": 0.5170528817587641, + "grad_norm": 0.6495657075729312, + "learning_rate": 1.7646617093924263e-05, + "loss": 0.8676, + "num_tokens": 18185620283.0, + "step": 4351 + }, + { + "epoch": 0.5171717171717172, + "grad_norm": 0.373483800987821, + "learning_rate": 1.764544921190052e-05, + "loss": 0.922, + "num_tokens": 18189810179.0, + "step": 4352 + }, + { + "epoch": 0.5172905525846703, + "grad_norm": 0.4582095768149489, + "learning_rate": 1.764428108377332e-05, + "loss": 0.8667, + "num_tokens": 18194000485.0, + "step": 4353 + }, + { + "epoch": 0.5174093879976233, + "grad_norm": 0.5260513636548751, + "learning_rate": 1.764311270958593e-05, + "loss": 0.8686, + "num_tokens": 18198177085.0, + "step": 4354 + }, + { + "epoch": 0.5175282234105764, + "grad_norm": 0.44823441345258574, + "learning_rate": 1.7641944089381615e-05, + "loss": 0.8876, + "num_tokens": 18202365926.0, + "step": 4355 + }, + { + "epoch": 0.5176470588235295, + "grad_norm": 0.4652735085269206, + "learning_rate": 1.7640775223203655e-05, + "loss": 0.8518, + "num_tokens": 18206525803.0, + "step": 4356 + }, + { + "epoch": 0.5177658942364824, + "grad_norm": 0.4588044585749858, + "learning_rate": 1.7639606111095336e-05, + "loss": 0.8627, + "num_tokens": 18210713638.0, + "step": 4357 + }, + { + "epoch": 0.5178847296494355, + "grad_norm": 0.4757671263018179, + "learning_rate": 1.763843675309995e-05, + "loss": 0.9017, + "num_tokens": 18214890704.0, + "step": 4358 + }, + { + "epoch": 0.5180035650623885, + "grad_norm": 0.582662085449545, + "learning_rate": 1.7637267149260806e-05, + "loss": 0.8666, + "num_tokens": 18219035971.0, + "step": 4359 + }, + { + "epoch": 0.5181224004753416, + "grad_norm": 0.4695529608881566, + "learning_rate": 1.763609729962122e-05, + "loss": 0.8941, + "num_tokens": 18223224075.0, + "step": 4360 + }, + { + "epoch": 0.5182412358882947, + "grad_norm": 0.49511558473437245, + "learning_rate": 1.7634927204224516e-05, + "loss": 0.8661, + "num_tokens": 18227412803.0, + "step": 4361 + }, + { + "epoch": 0.5183600713012477, + "grad_norm": 0.4765438808107385, + "learning_rate": 1.763375686311402e-05, + "loss": 0.8672, + "num_tokens": 18231581841.0, + "step": 4362 + }, + { + "epoch": 0.5184789067142008, + "grad_norm": 0.4828608682731938, + "learning_rate": 1.763258627633308e-05, + "loss": 0.8323, + "num_tokens": 18235739721.0, + "step": 4363 + }, + { + "epoch": 0.5185977421271539, + "grad_norm": 0.5765179861181704, + "learning_rate": 1.763141544392504e-05, + "loss": 0.875, + "num_tokens": 18239928713.0, + "step": 4364 + }, + { + "epoch": 0.5187165775401069, + "grad_norm": 1.0647909182205508, + "learning_rate": 1.7630244365933276e-05, + "loss": 0.9042, + "num_tokens": 18244078856.0, + "step": 4365 + }, + { + "epoch": 0.51883541295306, + "grad_norm": 0.676572079940037, + "learning_rate": 1.7629073042401138e-05, + "loss": 0.9124, + "num_tokens": 18248256723.0, + "step": 4366 + }, + { + "epoch": 0.5189542483660131, + "grad_norm": 0.8624402185188793, + "learning_rate": 1.7627901473372013e-05, + "loss": 0.874, + "num_tokens": 18252445524.0, + "step": 4367 + }, + { + "epoch": 0.5190730837789661, + "grad_norm": 0.5565025070634574, + "learning_rate": 1.7626729658889285e-05, + "loss": 0.8332, + "num_tokens": 18256636109.0, + "step": 4368 + }, + { + "epoch": 0.5191919191919192, + "grad_norm": 1.0778402377928407, + "learning_rate": 1.7625557598996356e-05, + "loss": 0.8547, + "num_tokens": 18260826127.0, + "step": 4369 + }, + { + "epoch": 0.5193107546048723, + "grad_norm": 0.7966382519998041, + "learning_rate": 1.762438529373663e-05, + "loss": 0.8757, + "num_tokens": 18265016460.0, + "step": 4370 + }, + { + "epoch": 0.5194295900178253, + "grad_norm": 0.9213969165177615, + "learning_rate": 1.762321274315351e-05, + "loss": 0.8368, + "num_tokens": 18269185016.0, + "step": 4371 + }, + { + "epoch": 0.5195484254307784, + "grad_norm": 0.8297704769597505, + "learning_rate": 1.7622039947290435e-05, + "loss": 0.8664, + "num_tokens": 18273374648.0, + "step": 4372 + }, + { + "epoch": 0.5196672608437314, + "grad_norm": 0.804226792592651, + "learning_rate": 1.762086690619083e-05, + "loss": 0.879, + "num_tokens": 18277539936.0, + "step": 4373 + }, + { + "epoch": 0.5197860962566845, + "grad_norm": 0.7156892726068789, + "learning_rate": 1.7619693619898136e-05, + "loss": 0.8575, + "num_tokens": 18281703161.0, + "step": 4374 + }, + { + "epoch": 0.5199049316696376, + "grad_norm": 0.8047639537776947, + "learning_rate": 1.7618520088455805e-05, + "loss": 0.8586, + "num_tokens": 18285873588.0, + "step": 4375 + }, + { + "epoch": 0.5200237670825906, + "grad_norm": 0.6820569713941408, + "learning_rate": 1.76173463119073e-05, + "loss": 0.8582, + "num_tokens": 18290033627.0, + "step": 4376 + }, + { + "epoch": 0.5201426024955437, + "grad_norm": 0.7314344096120857, + "learning_rate": 1.7616172290296086e-05, + "loss": 0.8852, + "num_tokens": 18294201404.0, + "step": 4377 + }, + { + "epoch": 0.5202614379084968, + "grad_norm": 0.5668277456484548, + "learning_rate": 1.7614998023665644e-05, + "loss": 0.8465, + "num_tokens": 18298371092.0, + "step": 4378 + }, + { + "epoch": 0.5203802733214498, + "grad_norm": 0.8217937195092111, + "learning_rate": 1.7613823512059457e-05, + "loss": 0.8674, + "num_tokens": 18302556913.0, + "step": 4379 + }, + { + "epoch": 0.5204991087344029, + "grad_norm": 0.6354049172257533, + "learning_rate": 1.7612648755521025e-05, + "loss": 0.8329, + "num_tokens": 18306745836.0, + "step": 4380 + }, + { + "epoch": 0.520617944147356, + "grad_norm": 0.8692769123230957, + "learning_rate": 1.761147375409385e-05, + "loss": 0.8646, + "num_tokens": 18310934061.0, + "step": 4381 + }, + { + "epoch": 0.520736779560309, + "grad_norm": 0.7462503442094799, + "learning_rate": 1.7610298507821454e-05, + "loss": 0.8705, + "num_tokens": 18315088210.0, + "step": 4382 + }, + { + "epoch": 0.520855614973262, + "grad_norm": 0.7082390593742619, + "learning_rate": 1.7609123016747346e-05, + "loss": 0.8841, + "num_tokens": 18319277406.0, + "step": 4383 + }, + { + "epoch": 0.520974450386215, + "grad_norm": 0.6718580358350749, + "learning_rate": 1.760794728091507e-05, + "loss": 0.8743, + "num_tokens": 18323466241.0, + "step": 4384 + }, + { + "epoch": 0.5210932857991681, + "grad_norm": 0.6292538666037543, + "learning_rate": 1.7606771300368166e-05, + "loss": 0.8607, + "num_tokens": 18327632215.0, + "step": 4385 + }, + { + "epoch": 0.5212121212121212, + "grad_norm": 0.5566009406716761, + "learning_rate": 1.7605595075150182e-05, + "loss": 0.8543, + "num_tokens": 18331808423.0, + "step": 4386 + }, + { + "epoch": 0.5213309566250742, + "grad_norm": 0.6383960501724926, + "learning_rate": 1.7604418605304675e-05, + "loss": 0.8582, + "num_tokens": 18335986951.0, + "step": 4387 + }, + { + "epoch": 0.5214497920380273, + "grad_norm": 0.4906563734639506, + "learning_rate": 1.760324189087522e-05, + "loss": 0.8669, + "num_tokens": 18340176158.0, + "step": 4388 + }, + { + "epoch": 0.5215686274509804, + "grad_norm": 0.7421078059695336, + "learning_rate": 1.760206493190539e-05, + "loss": 0.8758, + "num_tokens": 18344364356.0, + "step": 4389 + }, + { + "epoch": 0.5216874628639334, + "grad_norm": 0.5058094953223907, + "learning_rate": 1.7600887728438777e-05, + "loss": 0.863, + "num_tokens": 18348554958.0, + "step": 4390 + }, + { + "epoch": 0.5218062982768865, + "grad_norm": 0.8629215519481955, + "learning_rate": 1.759971028051897e-05, + "loss": 0.8873, + "num_tokens": 18352744222.0, + "step": 4391 + }, + { + "epoch": 0.5219251336898396, + "grad_norm": 0.7389002059131675, + "learning_rate": 1.7598532588189577e-05, + "loss": 0.8789, + "num_tokens": 18356931619.0, + "step": 4392 + }, + { + "epoch": 0.5220439691027926, + "grad_norm": 0.700112136040788, + "learning_rate": 1.7597354651494215e-05, + "loss": 0.8953, + "num_tokens": 18361099884.0, + "step": 4393 + }, + { + "epoch": 0.5221628045157457, + "grad_norm": 0.7083100847114334, + "learning_rate": 1.7596176470476502e-05, + "loss": 0.8415, + "num_tokens": 18365271019.0, + "step": 4394 + }, + { + "epoch": 0.5222816399286988, + "grad_norm": 0.5647432462683845, + "learning_rate": 1.7594998045180075e-05, + "loss": 0.8956, + "num_tokens": 18369459672.0, + "step": 4395 + }, + { + "epoch": 0.5224004753416518, + "grad_norm": 0.6344589226345065, + "learning_rate": 1.7593819375648568e-05, + "loss": 0.8508, + "num_tokens": 18373627038.0, + "step": 4396 + }, + { + "epoch": 0.5225193107546049, + "grad_norm": 0.5171207206238232, + "learning_rate": 1.7592640461925638e-05, + "loss": 0.8905, + "num_tokens": 18377800958.0, + "step": 4397 + }, + { + "epoch": 0.5226381461675579, + "grad_norm": 0.5798810664092399, + "learning_rate": 1.7591461304054937e-05, + "loss": 0.8781, + "num_tokens": 18381982620.0, + "step": 4398 + }, + { + "epoch": 0.522756981580511, + "grad_norm": 0.5075733843977007, + "learning_rate": 1.7590281902080143e-05, + "loss": 0.8623, + "num_tokens": 18386149333.0, + "step": 4399 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.5331635436879618, + "learning_rate": 1.7589102256044927e-05, + "loss": 0.868, + "num_tokens": 18390340205.0, + "step": 4400 + }, + { + "epoch": 0.5229946524064171, + "grad_norm": 0.49726576067484807, + "learning_rate": 1.7587922365992974e-05, + "loss": 0.8389, + "num_tokens": 18394529013.0, + "step": 4401 + }, + { + "epoch": 0.5231134878193702, + "grad_norm": 0.5901717165947348, + "learning_rate": 1.7586742231967983e-05, + "loss": 0.8822, + "num_tokens": 18398716725.0, + "step": 4402 + }, + { + "epoch": 0.5232323232323233, + "grad_norm": 0.45363387395015314, + "learning_rate": 1.7585561854013654e-05, + "loss": 0.8324, + "num_tokens": 18402905196.0, + "step": 4403 + }, + { + "epoch": 0.5233511586452763, + "grad_norm": 0.6457477536060249, + "learning_rate": 1.7584381232173705e-05, + "loss": 0.8612, + "num_tokens": 18407094500.0, + "step": 4404 + }, + { + "epoch": 0.5234699940582294, + "grad_norm": 0.5053750393685, + "learning_rate": 1.758320036649186e-05, + "loss": 0.8617, + "num_tokens": 18411285824.0, + "step": 4405 + }, + { + "epoch": 0.5235888294711825, + "grad_norm": 0.6122323235351294, + "learning_rate": 1.7582019257011843e-05, + "loss": 0.84, + "num_tokens": 18415475700.0, + "step": 4406 + }, + { + "epoch": 0.5237076648841354, + "grad_norm": 0.5223757005544829, + "learning_rate": 1.7580837903777403e-05, + "loss": 0.8737, + "num_tokens": 18419652409.0, + "step": 4407 + }, + { + "epoch": 0.5238265002970885, + "grad_norm": 0.5113709309434573, + "learning_rate": 1.7579656306832278e-05, + "loss": 0.8956, + "num_tokens": 18423841268.0, + "step": 4408 + }, + { + "epoch": 0.5239453357100415, + "grad_norm": 0.4891393540599449, + "learning_rate": 1.757847446622024e-05, + "loss": 0.8723, + "num_tokens": 18428017731.0, + "step": 4409 + }, + { + "epoch": 0.5240641711229946, + "grad_norm": 0.5441304064410991, + "learning_rate": 1.7577292381985046e-05, + "loss": 0.8977, + "num_tokens": 18432181405.0, + "step": 4410 + }, + { + "epoch": 0.5241830065359477, + "grad_norm": 0.4669072998693083, + "learning_rate": 1.7576110054170478e-05, + "loss": 0.9295, + "num_tokens": 18436347192.0, + "step": 4411 + }, + { + "epoch": 0.5243018419489007, + "grad_norm": 0.5253254568669312, + "learning_rate": 1.757492748282032e-05, + "loss": 0.851, + "num_tokens": 18440522426.0, + "step": 4412 + }, + { + "epoch": 0.5244206773618538, + "grad_norm": 0.4259943286602721, + "learning_rate": 1.757374466797837e-05, + "loss": 0.8353, + "num_tokens": 18444702070.0, + "step": 4413 + }, + { + "epoch": 0.5245395127748069, + "grad_norm": 0.5550630855097567, + "learning_rate": 1.7572561609688425e-05, + "loss": 0.8763, + "num_tokens": 18448889338.0, + "step": 4414 + }, + { + "epoch": 0.5246583481877599, + "grad_norm": 0.5007091400044709, + "learning_rate": 1.75713783079943e-05, + "loss": 0.8233, + "num_tokens": 18453060103.0, + "step": 4415 + }, + { + "epoch": 0.524777183600713, + "grad_norm": 0.5237655851198779, + "learning_rate": 1.7570194762939825e-05, + "loss": 0.8592, + "num_tokens": 18457230197.0, + "step": 4416 + }, + { + "epoch": 0.5248960190136661, + "grad_norm": 0.48179331969692857, + "learning_rate": 1.756901097456882e-05, + "loss": 0.8515, + "num_tokens": 18461389042.0, + "step": 4417 + }, + { + "epoch": 0.5250148544266191, + "grad_norm": 0.5592752386925092, + "learning_rate": 1.7567826942925126e-05, + "loss": 0.8391, + "num_tokens": 18465568603.0, + "step": 4418 + }, + { + "epoch": 0.5251336898395722, + "grad_norm": 0.46266196285740513, + "learning_rate": 1.7566642668052592e-05, + "loss": 0.8546, + "num_tokens": 18469758596.0, + "step": 4419 + }, + { + "epoch": 0.5252525252525253, + "grad_norm": 0.4606553387081845, + "learning_rate": 1.7565458149995083e-05, + "loss": 0.8628, + "num_tokens": 18473916992.0, + "step": 4420 + }, + { + "epoch": 0.5253713606654783, + "grad_norm": 0.4892243361554254, + "learning_rate": 1.756427338879646e-05, + "loss": 0.8375, + "num_tokens": 18478108844.0, + "step": 4421 + }, + { + "epoch": 0.5254901960784314, + "grad_norm": 0.4764258158690294, + "learning_rate": 1.7563088384500596e-05, + "loss": 0.8801, + "num_tokens": 18482299685.0, + "step": 4422 + }, + { + "epoch": 0.5256090314913844, + "grad_norm": 0.4840048947425616, + "learning_rate": 1.756190313715138e-05, + "loss": 0.8716, + "num_tokens": 18486472006.0, + "step": 4423 + }, + { + "epoch": 0.5257278669043375, + "grad_norm": 0.4873038099699775, + "learning_rate": 1.7560717646792704e-05, + "loss": 0.8905, + "num_tokens": 18490662643.0, + "step": 4424 + }, + { + "epoch": 0.5258467023172906, + "grad_norm": 0.4848904628748749, + "learning_rate": 1.7559531913468474e-05, + "loss": 0.9081, + "num_tokens": 18494850558.0, + "step": 4425 + }, + { + "epoch": 0.5259655377302436, + "grad_norm": 0.5581347118087961, + "learning_rate": 1.7558345937222597e-05, + "loss": 0.8438, + "num_tokens": 18499038834.0, + "step": 4426 + }, + { + "epoch": 0.5260843731431967, + "grad_norm": 0.49808002455510797, + "learning_rate": 1.7557159718098994e-05, + "loss": 0.883, + "num_tokens": 18503227339.0, + "step": 4427 + }, + { + "epoch": 0.5262032085561498, + "grad_norm": 0.3794119200674595, + "learning_rate": 1.75559732561416e-05, + "loss": 0.8581, + "num_tokens": 18507415460.0, + "step": 4428 + }, + { + "epoch": 0.5263220439691028, + "grad_norm": 0.5809134561960585, + "learning_rate": 1.7554786551394347e-05, + "loss": 0.9245, + "num_tokens": 18511596013.0, + "step": 4429 + }, + { + "epoch": 0.5264408793820559, + "grad_norm": 0.44684492014622806, + "learning_rate": 1.7553599603901187e-05, + "loss": 0.8839, + "num_tokens": 18515755769.0, + "step": 4430 + }, + { + "epoch": 0.526559714795009, + "grad_norm": 0.4644142758261422, + "learning_rate": 1.755241241370607e-05, + "loss": 0.883, + "num_tokens": 18519943628.0, + "step": 4431 + }, + { + "epoch": 0.526678550207962, + "grad_norm": 0.4462846488421064, + "learning_rate": 1.7551224980852977e-05, + "loss": 0.8165, + "num_tokens": 18524114413.0, + "step": 4432 + }, + { + "epoch": 0.526797385620915, + "grad_norm": 0.5222118190741646, + "learning_rate": 1.7550037305385868e-05, + "loss": 0.8856, + "num_tokens": 18528295764.0, + "step": 4433 + }, + { + "epoch": 0.526916221033868, + "grad_norm": 0.5038491823312515, + "learning_rate": 1.754884938734873e-05, + "loss": 0.897, + "num_tokens": 18532448377.0, + "step": 4434 + }, + { + "epoch": 0.5270350564468211, + "grad_norm": 0.48374441651054056, + "learning_rate": 1.7547661226785558e-05, + "loss": 0.8688, + "num_tokens": 18536619655.0, + "step": 4435 + }, + { + "epoch": 0.5271538918597742, + "grad_norm": 0.45049486911146125, + "learning_rate": 1.7546472823740353e-05, + "loss": 0.8545, + "num_tokens": 18540802060.0, + "step": 4436 + }, + { + "epoch": 0.5272727272727272, + "grad_norm": 0.45413476759600513, + "learning_rate": 1.7545284178257127e-05, + "loss": 0.8335, + "num_tokens": 18544979062.0, + "step": 4437 + }, + { + "epoch": 0.5273915626856803, + "grad_norm": 0.42513988252696033, + "learning_rate": 1.7544095290379896e-05, + "loss": 0.8688, + "num_tokens": 18549095030.0, + "step": 4438 + }, + { + "epoch": 0.5275103980986334, + "grad_norm": 0.4520277503383116, + "learning_rate": 1.7542906160152688e-05, + "loss": 0.8826, + "num_tokens": 18553254643.0, + "step": 4439 + }, + { + "epoch": 0.5276292335115864, + "grad_norm": 0.4524782241598767, + "learning_rate": 1.7541716787619548e-05, + "loss": 0.857, + "num_tokens": 18557419377.0, + "step": 4440 + }, + { + "epoch": 0.5277480689245395, + "grad_norm": 0.46450948609870374, + "learning_rate": 1.7540527172824513e-05, + "loss": 0.8495, + "num_tokens": 18561609380.0, + "step": 4441 + }, + { + "epoch": 0.5278669043374926, + "grad_norm": 0.504877708586687, + "learning_rate": 1.753933731581164e-05, + "loss": 0.8493, + "num_tokens": 18565796014.0, + "step": 4442 + }, + { + "epoch": 0.5279857397504456, + "grad_norm": 0.4328021647334306, + "learning_rate": 1.7538147216625003e-05, + "loss": 0.8623, + "num_tokens": 18569953563.0, + "step": 4443 + }, + { + "epoch": 0.5281045751633987, + "grad_norm": 0.47996429790306455, + "learning_rate": 1.7536956875308667e-05, + "loss": 0.811, + "num_tokens": 18574143043.0, + "step": 4444 + }, + { + "epoch": 0.5282234105763518, + "grad_norm": 0.507044377691552, + "learning_rate": 1.7535766291906713e-05, + "loss": 0.868, + "num_tokens": 18578332346.0, + "step": 4445 + }, + { + "epoch": 0.5283422459893048, + "grad_norm": 0.5518242251317217, + "learning_rate": 1.753457546646324e-05, + "loss": 0.8636, + "num_tokens": 18582521713.0, + "step": 4446 + }, + { + "epoch": 0.5284610814022579, + "grad_norm": 0.4774101488234927, + "learning_rate": 1.753338439902234e-05, + "loss": 0.8598, + "num_tokens": 18586679359.0, + "step": 4447 + }, + { + "epoch": 0.5285799168152109, + "grad_norm": 0.4515291280915783, + "learning_rate": 1.7532193089628126e-05, + "loss": 0.8902, + "num_tokens": 18590849866.0, + "step": 4448 + }, + { + "epoch": 0.528698752228164, + "grad_norm": 0.5325722609110717, + "learning_rate": 1.7531001538324713e-05, + "loss": 0.8728, + "num_tokens": 18595039888.0, + "step": 4449 + }, + { + "epoch": 0.5288175876411171, + "grad_norm": 0.40695558798105286, + "learning_rate": 1.7529809745156236e-05, + "loss": 0.8771, + "num_tokens": 18599226491.0, + "step": 4450 + }, + { + "epoch": 0.5289364230540701, + "grad_norm": 0.506360017541241, + "learning_rate": 1.7528617710166826e-05, + "loss": 0.922, + "num_tokens": 18603415915.0, + "step": 4451 + }, + { + "epoch": 0.5290552584670232, + "grad_norm": 0.6248566854102192, + "learning_rate": 1.7527425433400626e-05, + "loss": 0.8628, + "num_tokens": 18607605504.0, + "step": 4452 + }, + { + "epoch": 0.5291740938799763, + "grad_norm": 0.4230012548138344, + "learning_rate": 1.7526232914901797e-05, + "loss": 0.8644, + "num_tokens": 18611794162.0, + "step": 4453 + }, + { + "epoch": 0.5292929292929293, + "grad_norm": 0.5757699678526157, + "learning_rate": 1.7525040154714493e-05, + "loss": 0.8925, + "num_tokens": 18615972202.0, + "step": 4454 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.48884905135060286, + "learning_rate": 1.752384715288289e-05, + "loss": 0.8447, + "num_tokens": 18620161315.0, + "step": 4455 + }, + { + "epoch": 0.5295306001188355, + "grad_norm": 0.5662984710395157, + "learning_rate": 1.752265390945117e-05, + "loss": 0.8594, + "num_tokens": 18624334461.0, + "step": 4456 + }, + { + "epoch": 0.5296494355317884, + "grad_norm": 0.4988240051373355, + "learning_rate": 1.7521460424463523e-05, + "loss": 0.847, + "num_tokens": 18628502945.0, + "step": 4457 + }, + { + "epoch": 0.5297682709447415, + "grad_norm": 0.4559045251711661, + "learning_rate": 1.7520266697964147e-05, + "loss": 0.9055, + "num_tokens": 18632674688.0, + "step": 4458 + }, + { + "epoch": 0.5298871063576946, + "grad_norm": 0.5205112490904611, + "learning_rate": 1.751907272999725e-05, + "loss": 0.8704, + "num_tokens": 18636864116.0, + "step": 4459 + }, + { + "epoch": 0.5300059417706476, + "grad_norm": 0.45807363394509715, + "learning_rate": 1.751787852060705e-05, + "loss": 0.8528, + "num_tokens": 18641054601.0, + "step": 4460 + }, + { + "epoch": 0.5301247771836007, + "grad_norm": 0.5413017914704727, + "learning_rate": 1.7516684069837766e-05, + "loss": 0.8901, + "num_tokens": 18645212321.0, + "step": 4461 + }, + { + "epoch": 0.5302436125965537, + "grad_norm": 0.5038748803881716, + "learning_rate": 1.751548937773364e-05, + "loss": 0.8216, + "num_tokens": 18649401495.0, + "step": 4462 + }, + { + "epoch": 0.5303624480095068, + "grad_norm": 0.4780411434176341, + "learning_rate": 1.751429444433891e-05, + "loss": 0.9014, + "num_tokens": 18653586204.0, + "step": 4463 + }, + { + "epoch": 0.5304812834224599, + "grad_norm": 0.45433700728822496, + "learning_rate": 1.751309926969784e-05, + "loss": 0.857, + "num_tokens": 18657738587.0, + "step": 4464 + }, + { + "epoch": 0.5306001188354129, + "grad_norm": 0.5732085761418126, + "learning_rate": 1.7511903853854675e-05, + "loss": 0.887, + "num_tokens": 18661902861.0, + "step": 4465 + }, + { + "epoch": 0.530718954248366, + "grad_norm": 0.4821015092148918, + "learning_rate": 1.7510708196853693e-05, + "loss": 0.894, + "num_tokens": 18666068907.0, + "step": 4466 + }, + { + "epoch": 0.5308377896613191, + "grad_norm": 0.49303623143958447, + "learning_rate": 1.7509512298739178e-05, + "loss": 0.8857, + "num_tokens": 18670215209.0, + "step": 4467 + }, + { + "epoch": 0.5309566250742721, + "grad_norm": 0.44535079187277044, + "learning_rate": 1.750831615955541e-05, + "loss": 0.8588, + "num_tokens": 18674399422.0, + "step": 4468 + }, + { + "epoch": 0.5310754604872252, + "grad_norm": 0.5428354498358278, + "learning_rate": 1.750711977934669e-05, + "loss": 0.8922, + "num_tokens": 18678583338.0, + "step": 4469 + }, + { + "epoch": 0.5311942959001783, + "grad_norm": 0.43538846176606055, + "learning_rate": 1.7505923158157325e-05, + "loss": 0.8806, + "num_tokens": 18682745719.0, + "step": 4470 + }, + { + "epoch": 0.5313131313131313, + "grad_norm": 0.4958245912295841, + "learning_rate": 1.7504726296031625e-05, + "loss": 0.9163, + "num_tokens": 18686926653.0, + "step": 4471 + }, + { + "epoch": 0.5314319667260844, + "grad_norm": 0.4683708961293329, + "learning_rate": 1.7503529193013918e-05, + "loss": 0.8988, + "num_tokens": 18691095449.0, + "step": 4472 + }, + { + "epoch": 0.5315508021390374, + "grad_norm": 0.5235985454756582, + "learning_rate": 1.750233184914854e-05, + "loss": 0.8963, + "num_tokens": 18695284789.0, + "step": 4473 + }, + { + "epoch": 0.5316696375519905, + "grad_norm": 0.49171207332585365, + "learning_rate": 1.7501134264479828e-05, + "loss": 0.911, + "num_tokens": 18699470091.0, + "step": 4474 + }, + { + "epoch": 0.5317884729649436, + "grad_norm": 0.5084214115706457, + "learning_rate": 1.749993643905213e-05, + "loss": 0.8526, + "num_tokens": 18703659434.0, + "step": 4475 + }, + { + "epoch": 0.5319073083778966, + "grad_norm": 0.4585678400506868, + "learning_rate": 1.7498738372909812e-05, + "loss": 0.85, + "num_tokens": 18707850437.0, + "step": 4476 + }, + { + "epoch": 0.5320261437908497, + "grad_norm": 0.5026262423338694, + "learning_rate": 1.7497540066097236e-05, + "loss": 0.8869, + "num_tokens": 18712008386.0, + "step": 4477 + }, + { + "epoch": 0.5321449792038028, + "grad_norm": 0.4944475074915402, + "learning_rate": 1.7496341518658785e-05, + "loss": 0.874, + "num_tokens": 18716186817.0, + "step": 4478 + }, + { + "epoch": 0.5322638146167558, + "grad_norm": 0.4521044217866577, + "learning_rate": 1.7495142730638843e-05, + "loss": 0.843, + "num_tokens": 18720354761.0, + "step": 4479 + }, + { + "epoch": 0.5323826500297089, + "grad_norm": 0.5401713876780855, + "learning_rate": 1.7493943702081805e-05, + "loss": 0.8744, + "num_tokens": 18724484857.0, + "step": 4480 + }, + { + "epoch": 0.532501485442662, + "grad_norm": 0.4886832434023453, + "learning_rate": 1.7492744433032072e-05, + "loss": 0.8773, + "num_tokens": 18728641059.0, + "step": 4481 + }, + { + "epoch": 0.532620320855615, + "grad_norm": 0.41218178887066464, + "learning_rate": 1.7491544923534063e-05, + "loss": 0.8332, + "num_tokens": 18732830085.0, + "step": 4482 + }, + { + "epoch": 0.532739156268568, + "grad_norm": 0.4202498144480108, + "learning_rate": 1.7490345173632197e-05, + "loss": 0.8916, + "num_tokens": 18737004621.0, + "step": 4483 + }, + { + "epoch": 0.5328579916815211, + "grad_norm": 0.43481958423818157, + "learning_rate": 1.7489145183370905e-05, + "loss": 0.8385, + "num_tokens": 18741193355.0, + "step": 4484 + }, + { + "epoch": 0.5329768270944741, + "grad_norm": 0.49140672986039513, + "learning_rate": 1.7487944952794626e-05, + "loss": 0.8612, + "num_tokens": 18745382311.0, + "step": 4485 + }, + { + "epoch": 0.5330956625074272, + "grad_norm": 0.5153334353308672, + "learning_rate": 1.748674448194781e-05, + "loss": 0.858, + "num_tokens": 18749515096.0, + "step": 4486 + }, + { + "epoch": 0.5332144979203802, + "grad_norm": 0.4629811646491701, + "learning_rate": 1.7485543770874912e-05, + "loss": 0.8994, + "num_tokens": 18753687147.0, + "step": 4487 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.434140016883735, + "learning_rate": 1.7484342819620403e-05, + "loss": 0.8585, + "num_tokens": 18757876848.0, + "step": 4488 + }, + { + "epoch": 0.5334521687462864, + "grad_norm": 0.657668352446056, + "learning_rate": 1.7483141628228752e-05, + "loss": 0.867, + "num_tokens": 18762057539.0, + "step": 4489 + }, + { + "epoch": 0.5335710041592394, + "grad_norm": 0.4344906329663881, + "learning_rate": 1.7481940196744448e-05, + "loss": 0.831, + "num_tokens": 18766226577.0, + "step": 4490 + }, + { + "epoch": 0.5336898395721925, + "grad_norm": 0.4400223788373486, + "learning_rate": 1.748073852521198e-05, + "loss": 0.8368, + "num_tokens": 18770404972.0, + "step": 4491 + }, + { + "epoch": 0.5338086749851456, + "grad_norm": 0.5247320650996209, + "learning_rate": 1.7479536613675858e-05, + "loss": 0.9007, + "num_tokens": 18774594598.0, + "step": 4492 + }, + { + "epoch": 0.5339275103980986, + "grad_norm": 0.5483678531648711, + "learning_rate": 1.747833446218058e-05, + "loss": 0.8795, + "num_tokens": 18778752354.0, + "step": 4493 + }, + { + "epoch": 0.5340463458110517, + "grad_norm": 0.41528862904821545, + "learning_rate": 1.747713207077068e-05, + "loss": 0.8434, + "num_tokens": 18782941324.0, + "step": 4494 + }, + { + "epoch": 0.5341651812240048, + "grad_norm": 0.5825088140032324, + "learning_rate": 1.7475929439490674e-05, + "loss": 0.8796, + "num_tokens": 18787129994.0, + "step": 4495 + }, + { + "epoch": 0.5342840166369578, + "grad_norm": 0.4665987296587682, + "learning_rate": 1.7474726568385106e-05, + "loss": 0.8026, + "num_tokens": 18791291513.0, + "step": 4496 + }, + { + "epoch": 0.5344028520499109, + "grad_norm": 0.5530851348833598, + "learning_rate": 1.747352345749853e-05, + "loss": 0.9328, + "num_tokens": 18795481878.0, + "step": 4497 + }, + { + "epoch": 0.5345216874628639, + "grad_norm": 0.4689549938683091, + "learning_rate": 1.7472320106875483e-05, + "loss": 0.8774, + "num_tokens": 18799671073.0, + "step": 4498 + }, + { + "epoch": 0.534640522875817, + "grad_norm": 0.5152783879555696, + "learning_rate": 1.7471116516560546e-05, + "loss": 0.8863, + "num_tokens": 18803852060.0, + "step": 4499 + }, + { + "epoch": 0.5347593582887701, + "grad_norm": 0.4343684052718077, + "learning_rate": 1.746991268659828e-05, + "loss": 0.8715, + "num_tokens": 18808042100.0, + "step": 4500 + }, + { + "epoch": 0.5348781937017231, + "grad_norm": 0.6058082663580605, + "learning_rate": 1.7468708617033273e-05, + "loss": 0.8511, + "num_tokens": 18812232786.0, + "step": 4501 + }, + { + "epoch": 0.5349970291146762, + "grad_norm": 0.5094637239512927, + "learning_rate": 1.7467504307910116e-05, + "loss": 0.9009, + "num_tokens": 18816417840.0, + "step": 4502 + }, + { + "epoch": 0.5351158645276293, + "grad_norm": 0.4364145512516372, + "learning_rate": 1.7466299759273405e-05, + "loss": 0.9038, + "num_tokens": 18820608139.0, + "step": 4503 + }, + { + "epoch": 0.5352346999405823, + "grad_norm": 0.46751696530873604, + "learning_rate": 1.746509497116775e-05, + "loss": 0.8511, + "num_tokens": 18824777716.0, + "step": 4504 + }, + { + "epoch": 0.5353535353535354, + "grad_norm": 0.5565680955257926, + "learning_rate": 1.7463889943637774e-05, + "loss": 0.8949, + "num_tokens": 18828967623.0, + "step": 4505 + }, + { + "epoch": 0.5354723707664885, + "grad_norm": 0.4838212827788449, + "learning_rate": 1.7462684676728097e-05, + "loss": 0.866, + "num_tokens": 18833124679.0, + "step": 4506 + }, + { + "epoch": 0.5355912061794414, + "grad_norm": 0.5395878940502478, + "learning_rate": 1.7461479170483356e-05, + "loss": 0.855, + "num_tokens": 18837313776.0, + "step": 4507 + }, + { + "epoch": 0.5357100415923945, + "grad_norm": 0.6102419060487967, + "learning_rate": 1.7460273424948192e-05, + "loss": 0.8668, + "num_tokens": 18841503319.0, + "step": 4508 + }, + { + "epoch": 0.5358288770053476, + "grad_norm": 0.4259334618479881, + "learning_rate": 1.745906744016726e-05, + "loss": 0.8613, + "num_tokens": 18845668691.0, + "step": 4509 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 0.5937009706863774, + "learning_rate": 1.7457861216185225e-05, + "loss": 0.8951, + "num_tokens": 18849828366.0, + "step": 4510 + }, + { + "epoch": 0.5360665478312537, + "grad_norm": 0.4465018336586834, + "learning_rate": 1.7456654753046753e-05, + "loss": 0.8382, + "num_tokens": 18854019350.0, + "step": 4511 + }, + { + "epoch": 0.5361853832442067, + "grad_norm": 0.5740055615541008, + "learning_rate": 1.7455448050796525e-05, + "loss": 0.8774, + "num_tokens": 18858181772.0, + "step": 4512 + }, + { + "epoch": 0.5363042186571598, + "grad_norm": 0.43806528011588686, + "learning_rate": 1.745424110947923e-05, + "loss": 0.8614, + "num_tokens": 18862371682.0, + "step": 4513 + }, + { + "epoch": 0.5364230540701129, + "grad_norm": 0.4435476835834161, + "learning_rate": 1.7453033929139565e-05, + "loss": 0.893, + "num_tokens": 18866560709.0, + "step": 4514 + }, + { + "epoch": 0.5365418894830659, + "grad_norm": 0.5059403806406741, + "learning_rate": 1.745182650982223e-05, + "loss": 0.8878, + "num_tokens": 18870750503.0, + "step": 4515 + }, + { + "epoch": 0.536660724896019, + "grad_norm": 0.44918982328956075, + "learning_rate": 1.745061885157195e-05, + "loss": 0.881, + "num_tokens": 18874940457.0, + "step": 4516 + }, + { + "epoch": 0.5367795603089721, + "grad_norm": 0.560616104636336, + "learning_rate": 1.744941095443345e-05, + "loss": 0.8577, + "num_tokens": 18879128693.0, + "step": 4517 + }, + { + "epoch": 0.5368983957219251, + "grad_norm": 0.4552856536846012, + "learning_rate": 1.744820281845145e-05, + "loss": 0.8449, + "num_tokens": 18883316465.0, + "step": 4518 + }, + { + "epoch": 0.5370172311348782, + "grad_norm": 0.5202239064574453, + "learning_rate": 1.7446994443670695e-05, + "loss": 0.8851, + "num_tokens": 18887505416.0, + "step": 4519 + }, + { + "epoch": 0.5371360665478313, + "grad_norm": 0.45088880388991115, + "learning_rate": 1.7445785830135938e-05, + "loss": 0.8361, + "num_tokens": 18891660475.0, + "step": 4520 + }, + { + "epoch": 0.5372549019607843, + "grad_norm": 0.5031946393390749, + "learning_rate": 1.744457697789194e-05, + "loss": 0.8498, + "num_tokens": 18895850251.0, + "step": 4521 + }, + { + "epoch": 0.5373737373737374, + "grad_norm": 0.4389329993119233, + "learning_rate": 1.744336788698347e-05, + "loss": 0.9104, + "num_tokens": 18900025550.0, + "step": 4522 + }, + { + "epoch": 0.5374925727866904, + "grad_norm": 0.422190938579056, + "learning_rate": 1.74421585574553e-05, + "loss": 0.8852, + "num_tokens": 18904213258.0, + "step": 4523 + }, + { + "epoch": 0.5376114081996435, + "grad_norm": 0.45037805301021416, + "learning_rate": 1.7440948989352215e-05, + "loss": 0.882, + "num_tokens": 18908371235.0, + "step": 4524 + }, + { + "epoch": 0.5377302436125966, + "grad_norm": 0.4610929780269172, + "learning_rate": 1.743973918271901e-05, + "loss": 0.8901, + "num_tokens": 18912557551.0, + "step": 4525 + }, + { + "epoch": 0.5378490790255496, + "grad_norm": 0.4530772874764572, + "learning_rate": 1.7438529137600494e-05, + "loss": 0.8811, + "num_tokens": 18916740624.0, + "step": 4526 + }, + { + "epoch": 0.5379679144385027, + "grad_norm": 0.5214278130113305, + "learning_rate": 1.7437318854041473e-05, + "loss": 0.8549, + "num_tokens": 18920930149.0, + "step": 4527 + }, + { + "epoch": 0.5380867498514558, + "grad_norm": 0.4419802294976652, + "learning_rate": 1.743610833208677e-05, + "loss": 0.8676, + "num_tokens": 18925118026.0, + "step": 4528 + }, + { + "epoch": 0.5382055852644088, + "grad_norm": 0.5142322713823068, + "learning_rate": 1.7434897571781212e-05, + "loss": 0.8799, + "num_tokens": 18929305315.0, + "step": 4529 + }, + { + "epoch": 0.5383244206773619, + "grad_norm": 0.5291368359555676, + "learning_rate": 1.7433686573169642e-05, + "loss": 0.8806, + "num_tokens": 18933487509.0, + "step": 4530 + }, + { + "epoch": 0.538443256090315, + "grad_norm": 0.43594071844266197, + "learning_rate": 1.743247533629691e-05, + "loss": 0.871, + "num_tokens": 18937676014.0, + "step": 4531 + }, + { + "epoch": 0.538562091503268, + "grad_norm": 0.48902133275841125, + "learning_rate": 1.7431263861207863e-05, + "loss": 0.8347, + "num_tokens": 18941848039.0, + "step": 4532 + }, + { + "epoch": 0.538680926916221, + "grad_norm": 0.5100794712059366, + "learning_rate": 1.743005214794737e-05, + "loss": 0.8446, + "num_tokens": 18946037323.0, + "step": 4533 + }, + { + "epoch": 0.5387997623291741, + "grad_norm": 0.46494881127464727, + "learning_rate": 1.742884019656031e-05, + "loss": 0.8807, + "num_tokens": 18950226468.0, + "step": 4534 + }, + { + "epoch": 0.5389185977421271, + "grad_norm": 0.50044326240646, + "learning_rate": 1.7427628007091552e-05, + "loss": 0.8584, + "num_tokens": 18954383235.0, + "step": 4535 + }, + { + "epoch": 0.5390374331550802, + "grad_norm": 0.5326579616001694, + "learning_rate": 1.7426415579586005e-05, + "loss": 0.862, + "num_tokens": 18958571498.0, + "step": 4536 + }, + { + "epoch": 0.5391562685680332, + "grad_norm": 0.4632082210752738, + "learning_rate": 1.742520291408856e-05, + "loss": 0.8603, + "num_tokens": 18962761322.0, + "step": 4537 + }, + { + "epoch": 0.5392751039809863, + "grad_norm": 0.4510392448428986, + "learning_rate": 1.742399001064412e-05, + "loss": 0.8549, + "num_tokens": 18966942150.0, + "step": 4538 + }, + { + "epoch": 0.5393939393939394, + "grad_norm": 0.5287709427689, + "learning_rate": 1.742277686929762e-05, + "loss": 0.9089, + "num_tokens": 18971130899.0, + "step": 4539 + }, + { + "epoch": 0.5395127748068924, + "grad_norm": 0.4475240753494148, + "learning_rate": 1.742156349009397e-05, + "loss": 0.8766, + "num_tokens": 18975317238.0, + "step": 4540 + }, + { + "epoch": 0.5396316102198455, + "grad_norm": 0.5181815290689458, + "learning_rate": 1.7420349873078113e-05, + "loss": 0.8796, + "num_tokens": 18979504858.0, + "step": 4541 + }, + { + "epoch": 0.5397504456327986, + "grad_norm": 0.4965766830848925, + "learning_rate": 1.7419136018294993e-05, + "loss": 0.906, + "num_tokens": 18983679435.0, + "step": 4542 + }, + { + "epoch": 0.5398692810457516, + "grad_norm": 0.4644447944004763, + "learning_rate": 1.7417921925789565e-05, + "loss": 0.8364, + "num_tokens": 18987851972.0, + "step": 4543 + }, + { + "epoch": 0.5399881164587047, + "grad_norm": 0.5569727818474242, + "learning_rate": 1.7416707595606783e-05, + "loss": 0.8834, + "num_tokens": 18992028035.0, + "step": 4544 + }, + { + "epoch": 0.5401069518716578, + "grad_norm": 0.5677540338841137, + "learning_rate": 1.741549302779163e-05, + "loss": 0.8626, + "num_tokens": 18996192872.0, + "step": 4545 + }, + { + "epoch": 0.5402257872846108, + "grad_norm": 0.46911278082240837, + "learning_rate": 1.7414278222389073e-05, + "loss": 0.8723, + "num_tokens": 19000382206.0, + "step": 4546 + }, + { + "epoch": 0.5403446226975639, + "grad_norm": 0.44993207129958396, + "learning_rate": 1.7413063179444106e-05, + "loss": 0.8585, + "num_tokens": 19004550876.0, + "step": 4547 + }, + { + "epoch": 0.5404634581105169, + "grad_norm": 0.5107804865194178, + "learning_rate": 1.7411847899001732e-05, + "loss": 0.8508, + "num_tokens": 19008737496.0, + "step": 4548 + }, + { + "epoch": 0.54058229352347, + "grad_norm": 0.4690895788700119, + "learning_rate": 1.741063238110695e-05, + "loss": 0.8614, + "num_tokens": 19012896850.0, + "step": 4549 + }, + { + "epoch": 0.5407011289364231, + "grad_norm": 0.46899364371408975, + "learning_rate": 1.7409416625804774e-05, + "loss": 0.8364, + "num_tokens": 19017086129.0, + "step": 4550 + }, + { + "epoch": 0.5408199643493761, + "grad_norm": 0.4649895830387586, + "learning_rate": 1.740820063314023e-05, + "loss": 0.8709, + "num_tokens": 19021275243.0, + "step": 4551 + }, + { + "epoch": 0.5409387997623292, + "grad_norm": 0.5405671612470548, + "learning_rate": 1.7406984403158353e-05, + "loss": 0.8839, + "num_tokens": 19025433374.0, + "step": 4552 + }, + { + "epoch": 0.5410576351752823, + "grad_norm": 0.4736304526914953, + "learning_rate": 1.7405767935904174e-05, + "loss": 0.8628, + "num_tokens": 19029622674.0, + "step": 4553 + }, + { + "epoch": 0.5411764705882353, + "grad_norm": 0.41535921635650974, + "learning_rate": 1.7404551231422756e-05, + "loss": 0.8706, + "num_tokens": 19033813975.0, + "step": 4554 + }, + { + "epoch": 0.5412953060011884, + "grad_norm": 0.48122329519660195, + "learning_rate": 1.740333428975915e-05, + "loss": 0.8861, + "num_tokens": 19038002136.0, + "step": 4555 + }, + { + "epoch": 0.5414141414141415, + "grad_norm": 0.45951855833846095, + "learning_rate": 1.7402117110958423e-05, + "loss": 0.8737, + "num_tokens": 19042192012.0, + "step": 4556 + }, + { + "epoch": 0.5415329768270944, + "grad_norm": 0.3957487686632552, + "learning_rate": 1.7400899695065653e-05, + "loss": 0.8516, + "num_tokens": 19046380656.0, + "step": 4557 + }, + { + "epoch": 0.5416518122400475, + "grad_norm": 0.5418842018068711, + "learning_rate": 1.739968204212593e-05, + "loss": 0.8532, + "num_tokens": 19050558408.0, + "step": 4558 + }, + { + "epoch": 0.5417706476530006, + "grad_norm": 0.48652517881592355, + "learning_rate": 1.739846415218434e-05, + "loss": 0.8852, + "num_tokens": 19054742215.0, + "step": 4559 + }, + { + "epoch": 0.5418894830659536, + "grad_norm": 0.4878343437452266, + "learning_rate": 1.739724602528599e-05, + "loss": 0.8406, + "num_tokens": 19058905991.0, + "step": 4560 + }, + { + "epoch": 0.5420083184789067, + "grad_norm": 0.4874918580162164, + "learning_rate": 1.739602766147599e-05, + "loss": 0.8915, + "num_tokens": 19063064228.0, + "step": 4561 + }, + { + "epoch": 0.5421271538918597, + "grad_norm": 0.48924702718310825, + "learning_rate": 1.739480906079946e-05, + "loss": 0.8922, + "num_tokens": 19067252370.0, + "step": 4562 + }, + { + "epoch": 0.5422459893048128, + "grad_norm": 0.4162924714951684, + "learning_rate": 1.7393590223301526e-05, + "loss": 0.8472, + "num_tokens": 19071440614.0, + "step": 4563 + }, + { + "epoch": 0.5423648247177659, + "grad_norm": 0.5135384855042597, + "learning_rate": 1.7392371149027333e-05, + "loss": 0.859, + "num_tokens": 19075628971.0, + "step": 4564 + }, + { + "epoch": 0.5424836601307189, + "grad_norm": 0.48355384660943496, + "learning_rate": 1.739115183802202e-05, + "loss": 0.8583, + "num_tokens": 19079778931.0, + "step": 4565 + }, + { + "epoch": 0.542602495543672, + "grad_norm": 0.445794625158777, + "learning_rate": 1.738993229033075e-05, + "loss": 0.8338, + "num_tokens": 19083968682.0, + "step": 4566 + }, + { + "epoch": 0.5427213309566251, + "grad_norm": 0.49252430808886194, + "learning_rate": 1.7388712505998676e-05, + "loss": 0.875, + "num_tokens": 19088134389.0, + "step": 4567 + }, + { + "epoch": 0.5428401663695781, + "grad_norm": 0.5271074698579961, + "learning_rate": 1.7387492485070982e-05, + "loss": 0.8671, + "num_tokens": 19092322706.0, + "step": 4568 + }, + { + "epoch": 0.5429590017825312, + "grad_norm": 0.45204591382027703, + "learning_rate": 1.7386272227592843e-05, + "loss": 0.9115, + "num_tokens": 19096496907.0, + "step": 4569 + }, + { + "epoch": 0.5430778371954843, + "grad_norm": 0.4867699542659822, + "learning_rate": 1.7385051733609452e-05, + "loss": 0.8816, + "num_tokens": 19100667992.0, + "step": 4570 + }, + { + "epoch": 0.5431966726084373, + "grad_norm": 0.45273158523428353, + "learning_rate": 1.738383100316601e-05, + "loss": 0.8931, + "num_tokens": 19104857966.0, + "step": 4571 + }, + { + "epoch": 0.5433155080213904, + "grad_norm": 0.4212901539600239, + "learning_rate": 1.7382610036307714e-05, + "loss": 0.8703, + "num_tokens": 19109047759.0, + "step": 4572 + }, + { + "epoch": 0.5434343434343434, + "grad_norm": 0.4281945727388302, + "learning_rate": 1.7381388833079794e-05, + "loss": 0.8859, + "num_tokens": 19113207012.0, + "step": 4573 + }, + { + "epoch": 0.5435531788472965, + "grad_norm": 0.5420966027420848, + "learning_rate": 1.738016739352747e-05, + "loss": 0.8955, + "num_tokens": 19117396344.0, + "step": 4574 + }, + { + "epoch": 0.5436720142602496, + "grad_norm": 0.49859216839908244, + "learning_rate": 1.737894571769597e-05, + "loss": 0.906, + "num_tokens": 19121586165.0, + "step": 4575 + }, + { + "epoch": 0.5437908496732026, + "grad_norm": 0.5164521628294573, + "learning_rate": 1.7377723805630544e-05, + "loss": 0.8416, + "num_tokens": 19125750758.0, + "step": 4576 + }, + { + "epoch": 0.5439096850861557, + "grad_norm": 0.46468845929614316, + "learning_rate": 1.7376501657376444e-05, + "loss": 0.8898, + "num_tokens": 19129938805.0, + "step": 4577 + }, + { + "epoch": 0.5440285204991088, + "grad_norm": 0.4897806234816006, + "learning_rate": 1.7375279272978928e-05, + "loss": 0.8493, + "num_tokens": 19134128665.0, + "step": 4578 + }, + { + "epoch": 0.5441473559120618, + "grad_norm": 0.4471231913349778, + "learning_rate": 1.7374056652483265e-05, + "loss": 0.8724, + "num_tokens": 19138316043.0, + "step": 4579 + }, + { + "epoch": 0.5442661913250149, + "grad_norm": 0.5184925587483844, + "learning_rate": 1.737283379593473e-05, + "loss": 0.87, + "num_tokens": 19142490306.0, + "step": 4580 + }, + { + "epoch": 0.544385026737968, + "grad_norm": 0.45838424368816394, + "learning_rate": 1.7371610703378615e-05, + "loss": 0.8951, + "num_tokens": 19146654481.0, + "step": 4581 + }, + { + "epoch": 0.544503862150921, + "grad_norm": 0.4813863938831143, + "learning_rate": 1.7370387374860214e-05, + "loss": 0.8918, + "num_tokens": 19150839554.0, + "step": 4582 + }, + { + "epoch": 0.544622697563874, + "grad_norm": 0.4962638360953826, + "learning_rate": 1.736916381042483e-05, + "loss": 0.8503, + "num_tokens": 19155028708.0, + "step": 4583 + }, + { + "epoch": 0.5447415329768271, + "grad_norm": 0.4068042563250248, + "learning_rate": 1.7367940010117774e-05, + "loss": 0.8596, + "num_tokens": 19159217292.0, + "step": 4584 + }, + { + "epoch": 0.5448603683897801, + "grad_norm": 0.5049696010948017, + "learning_rate": 1.7366715973984367e-05, + "loss": 0.9071, + "num_tokens": 19163385567.0, + "step": 4585 + }, + { + "epoch": 0.5449792038027332, + "grad_norm": 0.46885541196560154, + "learning_rate": 1.7365491702069947e-05, + "loss": 0.8701, + "num_tokens": 19167575320.0, + "step": 4586 + }, + { + "epoch": 0.5450980392156862, + "grad_norm": 0.4875568588046537, + "learning_rate": 1.736426719441984e-05, + "loss": 0.8436, + "num_tokens": 19171762563.0, + "step": 4587 + }, + { + "epoch": 0.5452168746286393, + "grad_norm": 0.5110953382555569, + "learning_rate": 1.7363042451079405e-05, + "loss": 0.854, + "num_tokens": 19175952437.0, + "step": 4588 + }, + { + "epoch": 0.5453357100415924, + "grad_norm": 0.5242376169370607, + "learning_rate": 1.7361817472093993e-05, + "loss": 0.9055, + "num_tokens": 19180142694.0, + "step": 4589 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.4608591587294279, + "learning_rate": 1.7360592257508972e-05, + "loss": 0.8653, + "num_tokens": 19184333087.0, + "step": 4590 + }, + { + "epoch": 0.5455733808674985, + "grad_norm": 0.457252586506212, + "learning_rate": 1.735936680736971e-05, + "loss": 0.8911, + "num_tokens": 19188521767.0, + "step": 4591 + }, + { + "epoch": 0.5456922162804516, + "grad_norm": 0.5022178964025865, + "learning_rate": 1.7358141121721597e-05, + "loss": 0.8813, + "num_tokens": 19192706044.0, + "step": 4592 + }, + { + "epoch": 0.5458110516934046, + "grad_norm": 0.4856368468824444, + "learning_rate": 1.7356915200610017e-05, + "loss": 0.8584, + "num_tokens": 19196896209.0, + "step": 4593 + }, + { + "epoch": 0.5459298871063577, + "grad_norm": 0.4744459629675697, + "learning_rate": 1.7355689044080378e-05, + "loss": 0.884, + "num_tokens": 19201086478.0, + "step": 4594 + }, + { + "epoch": 0.5460487225193108, + "grad_norm": 0.426404689966519, + "learning_rate": 1.7354462652178084e-05, + "loss": 0.863, + "num_tokens": 19205255310.0, + "step": 4595 + }, + { + "epoch": 0.5461675579322638, + "grad_norm": 0.48606627549812476, + "learning_rate": 1.7353236024948553e-05, + "loss": 0.8601, + "num_tokens": 19209445531.0, + "step": 4596 + }, + { + "epoch": 0.5462863933452169, + "grad_norm": 0.43578230734556717, + "learning_rate": 1.735200916243721e-05, + "loss": 0.8821, + "num_tokens": 19213591257.0, + "step": 4597 + }, + { + "epoch": 0.5464052287581699, + "grad_norm": 0.46941214333018194, + "learning_rate": 1.7350782064689493e-05, + "loss": 0.8795, + "num_tokens": 19217780858.0, + "step": 4598 + }, + { + "epoch": 0.546524064171123, + "grad_norm": 0.4910471386256383, + "learning_rate": 1.734955473175084e-05, + "loss": 0.8553, + "num_tokens": 19221946295.0, + "step": 4599 + }, + { + "epoch": 0.5466428995840761, + "grad_norm": 0.5716065997043888, + "learning_rate": 1.7348327163666706e-05, + "loss": 0.8632, + "num_tokens": 19226135300.0, + "step": 4600 + }, + { + "epoch": 0.5467617349970291, + "grad_norm": 0.51554607287201, + "learning_rate": 1.7347099360482558e-05, + "loss": 0.8868, + "num_tokens": 19230324009.0, + "step": 4601 + }, + { + "epoch": 0.5468805704099822, + "grad_norm": 0.48418281782190437, + "learning_rate": 1.734587132224386e-05, + "loss": 0.8357, + "num_tokens": 19234482909.0, + "step": 4602 + }, + { + "epoch": 0.5469994058229353, + "grad_norm": 0.543228737216241, + "learning_rate": 1.734464304899609e-05, + "loss": 0.9146, + "num_tokens": 19238671288.0, + "step": 4603 + }, + { + "epoch": 0.5471182412358883, + "grad_norm": 0.423392476736495, + "learning_rate": 1.734341454078473e-05, + "loss": 0.8762, + "num_tokens": 19242853026.0, + "step": 4604 + }, + { + "epoch": 0.5472370766488414, + "grad_norm": 0.49339161895970424, + "learning_rate": 1.734218579765529e-05, + "loss": 0.8721, + "num_tokens": 19247041964.0, + "step": 4605 + }, + { + "epoch": 0.5473559120617945, + "grad_norm": 0.43821868910562645, + "learning_rate": 1.7340956819653266e-05, + "loss": 0.8833, + "num_tokens": 19251232053.0, + "step": 4606 + }, + { + "epoch": 0.5474747474747474, + "grad_norm": 0.4326234170162899, + "learning_rate": 1.733972760682417e-05, + "loss": 0.8899, + "num_tokens": 19255421133.0, + "step": 4607 + }, + { + "epoch": 0.5475935828877005, + "grad_norm": 0.46801537422914935, + "learning_rate": 1.7338498159213525e-05, + "loss": 0.8934, + "num_tokens": 19259611561.0, + "step": 4608 + }, + { + "epoch": 0.5477124183006536, + "grad_norm": 0.46943707512928795, + "learning_rate": 1.7337268476866864e-05, + "loss": 0.8705, + "num_tokens": 19263776460.0, + "step": 4609 + }, + { + "epoch": 0.5478312537136066, + "grad_norm": 0.492358351998099, + "learning_rate": 1.733603855982972e-05, + "loss": 0.8829, + "num_tokens": 19267966891.0, + "step": 4610 + }, + { + "epoch": 0.5479500891265597, + "grad_norm": 0.3801030770809148, + "learning_rate": 1.733480840814765e-05, + "loss": 0.8483, + "num_tokens": 19272155121.0, + "step": 4611 + }, + { + "epoch": 0.5480689245395127, + "grad_norm": 0.5553351283086151, + "learning_rate": 1.7333578021866206e-05, + "loss": 0.8513, + "num_tokens": 19276344936.0, + "step": 4612 + }, + { + "epoch": 0.5481877599524658, + "grad_norm": 0.41794088033579796, + "learning_rate": 1.7332347401030953e-05, + "loss": 0.8656, + "num_tokens": 19280536114.0, + "step": 4613 + }, + { + "epoch": 0.5483065953654189, + "grad_norm": 0.5414332421862791, + "learning_rate": 1.7331116545687462e-05, + "loss": 0.9092, + "num_tokens": 19284725362.0, + "step": 4614 + }, + { + "epoch": 0.5484254307783719, + "grad_norm": 0.46660600731355406, + "learning_rate": 1.7329885455881322e-05, + "loss": 0.8585, + "num_tokens": 19288911932.0, + "step": 4615 + }, + { + "epoch": 0.548544266191325, + "grad_norm": 0.4948948206306555, + "learning_rate": 1.7328654131658122e-05, + "loss": 0.8892, + "num_tokens": 19293099769.0, + "step": 4616 + }, + { + "epoch": 0.5486631016042781, + "grad_norm": 0.4191376844629435, + "learning_rate": 1.7327422573063464e-05, + "loss": 0.9069, + "num_tokens": 19297256254.0, + "step": 4617 + }, + { + "epoch": 0.5487819370172311, + "grad_norm": 0.42373541124194286, + "learning_rate": 1.732619078014295e-05, + "loss": 0.8455, + "num_tokens": 19301426870.0, + "step": 4618 + }, + { + "epoch": 0.5489007724301842, + "grad_norm": 0.3789837877875115, + "learning_rate": 1.7324958752942206e-05, + "loss": 0.8139, + "num_tokens": 19305610095.0, + "step": 4619 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.42702369023949244, + "learning_rate": 1.7323726491506852e-05, + "loss": 0.8808, + "num_tokens": 19309791403.0, + "step": 4620 + }, + { + "epoch": 0.5491384432560903, + "grad_norm": 0.5343089694371854, + "learning_rate": 1.7322493995882528e-05, + "loss": 0.8453, + "num_tokens": 19313953856.0, + "step": 4621 + }, + { + "epoch": 0.5492572786690434, + "grad_norm": 0.5333906509094757, + "learning_rate": 1.732126126611487e-05, + "loss": 0.8572, + "num_tokens": 19318117222.0, + "step": 4622 + }, + { + "epoch": 0.5493761140819964, + "grad_norm": 0.4748286714827387, + "learning_rate": 1.7320028302249535e-05, + "loss": 0.8492, + "num_tokens": 19322305467.0, + "step": 4623 + }, + { + "epoch": 0.5494949494949495, + "grad_norm": 0.5144874525329358, + "learning_rate": 1.7318795104332186e-05, + "loss": 0.8652, + "num_tokens": 19326495178.0, + "step": 4624 + }, + { + "epoch": 0.5496137849079026, + "grad_norm": 0.4364415546885114, + "learning_rate": 1.731756167240849e-05, + "loss": 0.8462, + "num_tokens": 19330657707.0, + "step": 4625 + }, + { + "epoch": 0.5497326203208556, + "grad_norm": 0.4940030414829801, + "learning_rate": 1.731632800652412e-05, + "loss": 0.8695, + "num_tokens": 19334847647.0, + "step": 4626 + }, + { + "epoch": 0.5498514557338087, + "grad_norm": 0.4470374751636113, + "learning_rate": 1.7315094106724776e-05, + "loss": 0.8482, + "num_tokens": 19339032568.0, + "step": 4627 + }, + { + "epoch": 0.5499702911467618, + "grad_norm": 0.4177800404892448, + "learning_rate": 1.731385997305614e-05, + "loss": 0.8824, + "num_tokens": 19343203854.0, + "step": 4628 + }, + { + "epoch": 0.5500891265597148, + "grad_norm": 0.4136963683834326, + "learning_rate": 1.731262560556392e-05, + "loss": 0.86, + "num_tokens": 19347361719.0, + "step": 4629 + }, + { + "epoch": 0.5502079619726679, + "grad_norm": 0.49597898734960333, + "learning_rate": 1.7311391004293837e-05, + "loss": 0.8493, + "num_tokens": 19351539342.0, + "step": 4630 + }, + { + "epoch": 0.550326797385621, + "grad_norm": 0.4289039600795631, + "learning_rate": 1.7310156169291602e-05, + "loss": 0.8648, + "num_tokens": 19355729762.0, + "step": 4631 + }, + { + "epoch": 0.550445632798574, + "grad_norm": 0.48455125785387587, + "learning_rate": 1.730892110060295e-05, + "loss": 0.8751, + "num_tokens": 19359918134.0, + "step": 4632 + }, + { + "epoch": 0.550564468211527, + "grad_norm": 0.4592848830629736, + "learning_rate": 1.7307685798273617e-05, + "loss": 0.8689, + "num_tokens": 19364080273.0, + "step": 4633 + }, + { + "epoch": 0.5506833036244801, + "grad_norm": 0.48464305239964733, + "learning_rate": 1.7306450262349354e-05, + "loss": 0.89, + "num_tokens": 19368269599.0, + "step": 4634 + }, + { + "epoch": 0.5508021390374331, + "grad_norm": 0.45830827241873445, + "learning_rate": 1.7305214492875912e-05, + "loss": 0.8689, + "num_tokens": 19372458934.0, + "step": 4635 + }, + { + "epoch": 0.5509209744503862, + "grad_norm": 0.4962429998103111, + "learning_rate": 1.730397848989906e-05, + "loss": 0.8403, + "num_tokens": 19376646956.0, + "step": 4636 + }, + { + "epoch": 0.5510398098633392, + "grad_norm": 0.45946651348425477, + "learning_rate": 1.730274225346457e-05, + "loss": 0.8498, + "num_tokens": 19380836231.0, + "step": 4637 + }, + { + "epoch": 0.5511586452762923, + "grad_norm": 0.4594133658122297, + "learning_rate": 1.7301505783618227e-05, + "loss": 0.868, + "num_tokens": 19385007589.0, + "step": 4638 + }, + { + "epoch": 0.5512774806892454, + "grad_norm": 0.4830637506837563, + "learning_rate": 1.7300269080405815e-05, + "loss": 0.8613, + "num_tokens": 19389197280.0, + "step": 4639 + }, + { + "epoch": 0.5513963161021984, + "grad_norm": 0.4616225387076105, + "learning_rate": 1.7299032143873143e-05, + "loss": 0.8512, + "num_tokens": 19393387742.0, + "step": 4640 + }, + { + "epoch": 0.5515151515151515, + "grad_norm": 0.447804371411744, + "learning_rate": 1.7297794974066008e-05, + "loss": 0.8445, + "num_tokens": 19397527143.0, + "step": 4641 + }, + { + "epoch": 0.5516339869281046, + "grad_norm": 0.5374101077301494, + "learning_rate": 1.729655757103023e-05, + "loss": 0.8342, + "num_tokens": 19401714885.0, + "step": 4642 + }, + { + "epoch": 0.5517528223410576, + "grad_norm": 0.40741586336323354, + "learning_rate": 1.729531993481164e-05, + "loss": 0.8775, + "num_tokens": 19405903800.0, + "step": 4643 + }, + { + "epoch": 0.5518716577540107, + "grad_norm": 0.43511757323347094, + "learning_rate": 1.7294082065456072e-05, + "loss": 0.8748, + "num_tokens": 19410093344.0, + "step": 4644 + }, + { + "epoch": 0.5519904931669638, + "grad_norm": 0.3727035075142714, + "learning_rate": 1.7292843963009355e-05, + "loss": 0.8842, + "num_tokens": 19414239393.0, + "step": 4645 + }, + { + "epoch": 0.5521093285799168, + "grad_norm": 0.49966727341659156, + "learning_rate": 1.7291605627517355e-05, + "loss": 0.8653, + "num_tokens": 19418427986.0, + "step": 4646 + }, + { + "epoch": 0.5522281639928699, + "grad_norm": 0.508708571500936, + "learning_rate": 1.7290367059025927e-05, + "loss": 0.8876, + "num_tokens": 19422618559.0, + "step": 4647 + }, + { + "epoch": 0.5523469994058229, + "grad_norm": 0.4846101214010212, + "learning_rate": 1.7289128257580937e-05, + "loss": 0.842, + "num_tokens": 19426806867.0, + "step": 4648 + }, + { + "epoch": 0.552465834818776, + "grad_norm": 0.49193449977507364, + "learning_rate": 1.7287889223228263e-05, + "loss": 0.8725, + "num_tokens": 19430962881.0, + "step": 4649 + }, + { + "epoch": 0.5525846702317291, + "grad_norm": 0.38135419553532673, + "learning_rate": 1.728664995601379e-05, + "loss": 0.8453, + "num_tokens": 19435149956.0, + "step": 4650 + }, + { + "epoch": 0.5527035056446821, + "grad_norm": 0.5575270299739633, + "learning_rate": 1.7285410455983415e-05, + "loss": 0.8826, + "num_tokens": 19439282626.0, + "step": 4651 + }, + { + "epoch": 0.5528223410576352, + "grad_norm": 0.4674807790192834, + "learning_rate": 1.728417072318304e-05, + "loss": 0.869, + "num_tokens": 19443471972.0, + "step": 4652 + }, + { + "epoch": 0.5529411764705883, + "grad_norm": 0.42876336050624825, + "learning_rate": 1.7282930757658576e-05, + "loss": 0.8723, + "num_tokens": 19447644342.0, + "step": 4653 + }, + { + "epoch": 0.5530600118835413, + "grad_norm": 0.5489504226659052, + "learning_rate": 1.7281690559455945e-05, + "loss": 0.8702, + "num_tokens": 19451812187.0, + "step": 4654 + }, + { + "epoch": 0.5531788472964944, + "grad_norm": 0.44823605521889737, + "learning_rate": 1.728045012862107e-05, + "loss": 0.8889, + "num_tokens": 19456002973.0, + "step": 4655 + }, + { + "epoch": 0.5532976827094475, + "grad_norm": 0.5146907664573119, + "learning_rate": 1.727920946519989e-05, + "loss": 0.877, + "num_tokens": 19460191560.0, + "step": 4656 + }, + { + "epoch": 0.5534165181224004, + "grad_norm": 0.5294665859717115, + "learning_rate": 1.7277968569238357e-05, + "loss": 0.8539, + "num_tokens": 19464380518.0, + "step": 4657 + }, + { + "epoch": 0.5535353535353535, + "grad_norm": 0.5458520149419218, + "learning_rate": 1.727672744078242e-05, + "loss": 0.8519, + "num_tokens": 19468569856.0, + "step": 4658 + }, + { + "epoch": 0.5536541889483066, + "grad_norm": 0.3910364169275629, + "learning_rate": 1.7275486079878042e-05, + "loss": 0.8528, + "num_tokens": 19472730353.0, + "step": 4659 + }, + { + "epoch": 0.5537730243612596, + "grad_norm": 0.43342525760228134, + "learning_rate": 1.72742444865712e-05, + "loss": 0.9011, + "num_tokens": 19476898599.0, + "step": 4660 + }, + { + "epoch": 0.5538918597742127, + "grad_norm": 0.5252876665169144, + "learning_rate": 1.7273002660907873e-05, + "loss": 0.9091, + "num_tokens": 19481014161.0, + "step": 4661 + }, + { + "epoch": 0.5540106951871657, + "grad_norm": 0.364953971402854, + "learning_rate": 1.7271760602934044e-05, + "loss": 0.8493, + "num_tokens": 19485203274.0, + "step": 4662 + }, + { + "epoch": 0.5541295306001188, + "grad_norm": 0.5135997644400595, + "learning_rate": 1.7270518312695713e-05, + "loss": 0.86, + "num_tokens": 19489366393.0, + "step": 4663 + }, + { + "epoch": 0.5542483660130719, + "grad_norm": 0.6189155021130499, + "learning_rate": 1.726927579023889e-05, + "loss": 0.8618, + "num_tokens": 19493555005.0, + "step": 4664 + }, + { + "epoch": 0.5543672014260249, + "grad_norm": 0.4013828221711954, + "learning_rate": 1.726803303560959e-05, + "loss": 0.865, + "num_tokens": 19497727927.0, + "step": 4665 + }, + { + "epoch": 0.554486036838978, + "grad_norm": 0.47778133961430236, + "learning_rate": 1.726679004885383e-05, + "loss": 0.836, + "num_tokens": 19501898159.0, + "step": 4666 + }, + { + "epoch": 0.5546048722519311, + "grad_norm": 0.48242264777257954, + "learning_rate": 1.7265546830017646e-05, + "loss": 0.8519, + "num_tokens": 19506087942.0, + "step": 4667 + }, + { + "epoch": 0.5547237076648841, + "grad_norm": 0.5190003775002564, + "learning_rate": 1.7264303379147077e-05, + "loss": 0.8492, + "num_tokens": 19510279001.0, + "step": 4668 + }, + { + "epoch": 0.5548425430778372, + "grad_norm": 0.5012787584748596, + "learning_rate": 1.726305969628818e-05, + "loss": 0.8655, + "num_tokens": 19514468005.0, + "step": 4669 + }, + { + "epoch": 0.5549613784907903, + "grad_norm": 0.3820329983671644, + "learning_rate": 1.7261815781487002e-05, + "loss": 0.8583, + "num_tokens": 19518633310.0, + "step": 4670 + }, + { + "epoch": 0.5550802139037433, + "grad_norm": 0.4177012884219991, + "learning_rate": 1.7260571634789614e-05, + "loss": 0.8486, + "num_tokens": 19522823065.0, + "step": 4671 + }, + { + "epoch": 0.5551990493166964, + "grad_norm": 0.5238975059271278, + "learning_rate": 1.7259327256242096e-05, + "loss": 0.858, + "num_tokens": 19527011769.0, + "step": 4672 + }, + { + "epoch": 0.5553178847296494, + "grad_norm": 0.4967707359150572, + "learning_rate": 1.725808264589052e-05, + "loss": 0.8474, + "num_tokens": 19531171772.0, + "step": 4673 + }, + { + "epoch": 0.5554367201426025, + "grad_norm": 0.49681890075277013, + "learning_rate": 1.725683780378099e-05, + "loss": 0.848, + "num_tokens": 19535342862.0, + "step": 4674 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.4281392408866829, + "learning_rate": 1.7255592729959597e-05, + "loss": 0.8664, + "num_tokens": 19539531283.0, + "step": 4675 + }, + { + "epoch": 0.5556743909685086, + "grad_norm": 0.5149713599712771, + "learning_rate": 1.7254347424472456e-05, + "loss": 0.8141, + "num_tokens": 19543691014.0, + "step": 4676 + }, + { + "epoch": 0.5557932263814617, + "grad_norm": 0.5149185813564108, + "learning_rate": 1.7253101887365683e-05, + "loss": 0.8434, + "num_tokens": 19547869525.0, + "step": 4677 + }, + { + "epoch": 0.5559120617944148, + "grad_norm": 0.453998430361357, + "learning_rate": 1.7251856118685406e-05, + "loss": 0.8973, + "num_tokens": 19552030712.0, + "step": 4678 + }, + { + "epoch": 0.5560308972073678, + "grad_norm": 0.49904557984704867, + "learning_rate": 1.725061011847776e-05, + "loss": 0.8477, + "num_tokens": 19556197221.0, + "step": 4679 + }, + { + "epoch": 0.5561497326203209, + "grad_norm": 0.44199769407010864, + "learning_rate": 1.7249363886788882e-05, + "loss": 0.8793, + "num_tokens": 19560387247.0, + "step": 4680 + }, + { + "epoch": 0.556268568033274, + "grad_norm": 0.502406122800526, + "learning_rate": 1.7248117423664933e-05, + "loss": 0.8775, + "num_tokens": 19564532385.0, + "step": 4681 + }, + { + "epoch": 0.556387403446227, + "grad_norm": 0.4434009860322637, + "learning_rate": 1.724687072915207e-05, + "loss": 0.8383, + "num_tokens": 19568720565.0, + "step": 4682 + }, + { + "epoch": 0.55650623885918, + "grad_norm": 0.5061260264712847, + "learning_rate": 1.7245623803296465e-05, + "loss": 0.8613, + "num_tokens": 19572895902.0, + "step": 4683 + }, + { + "epoch": 0.5566250742721331, + "grad_norm": 0.5040115473580816, + "learning_rate": 1.724437664614429e-05, + "loss": 0.8644, + "num_tokens": 19577086442.0, + "step": 4684 + }, + { + "epoch": 0.5567439096850861, + "grad_norm": 0.5578053785182874, + "learning_rate": 1.7243129257741738e-05, + "loss": 0.8512, + "num_tokens": 19581270716.0, + "step": 4685 + }, + { + "epoch": 0.5568627450980392, + "grad_norm": 0.4442082838522541, + "learning_rate": 1.7241881638134998e-05, + "loss": 0.8828, + "num_tokens": 19585460299.0, + "step": 4686 + }, + { + "epoch": 0.5569815805109922, + "grad_norm": 0.5002944008481317, + "learning_rate": 1.724063378737028e-05, + "loss": 0.8856, + "num_tokens": 19589649202.0, + "step": 4687 + }, + { + "epoch": 0.5571004159239453, + "grad_norm": 0.42168481349763315, + "learning_rate": 1.723938570549379e-05, + "loss": 0.8747, + "num_tokens": 19593839763.0, + "step": 4688 + }, + { + "epoch": 0.5572192513368984, + "grad_norm": 0.41915077754893976, + "learning_rate": 1.7238137392551756e-05, + "loss": 0.8892, + "num_tokens": 19598018690.0, + "step": 4689 + }, + { + "epoch": 0.5573380867498514, + "grad_norm": 0.4590321975817334, + "learning_rate": 1.72368888485904e-05, + "loss": 0.8568, + "num_tokens": 19602181668.0, + "step": 4690 + }, + { + "epoch": 0.5574569221628045, + "grad_norm": 0.4050249015567612, + "learning_rate": 1.723564007365596e-05, + "loss": 0.8577, + "num_tokens": 19606353172.0, + "step": 4691 + }, + { + "epoch": 0.5575757575757576, + "grad_norm": 0.4305078541831397, + "learning_rate": 1.723439106779469e-05, + "loss": 0.8547, + "num_tokens": 19610541414.0, + "step": 4692 + }, + { + "epoch": 0.5576945929887106, + "grad_norm": 0.4210377663397118, + "learning_rate": 1.7233141831052832e-05, + "loss": 0.8637, + "num_tokens": 19614730213.0, + "step": 4693 + }, + { + "epoch": 0.5578134284016637, + "grad_norm": 0.4335369214394817, + "learning_rate": 1.7231892363476662e-05, + "loss": 0.8248, + "num_tokens": 19618884368.0, + "step": 4694 + }, + { + "epoch": 0.5579322638146168, + "grad_norm": 0.4897080784367228, + "learning_rate": 1.7230642665112448e-05, + "loss": 0.8177, + "num_tokens": 19623075085.0, + "step": 4695 + }, + { + "epoch": 0.5580510992275698, + "grad_norm": 0.4748973671405157, + "learning_rate": 1.722939273600647e-05, + "loss": 0.9122, + "num_tokens": 19627258419.0, + "step": 4696 + }, + { + "epoch": 0.5581699346405229, + "grad_norm": 0.3758996201470104, + "learning_rate": 1.7228142576205018e-05, + "loss": 0.8519, + "num_tokens": 19631429688.0, + "step": 4697 + }, + { + "epoch": 0.558288770053476, + "grad_norm": 0.5362591642534705, + "learning_rate": 1.7226892185754384e-05, + "loss": 0.8518, + "num_tokens": 19635610497.0, + "step": 4698 + }, + { + "epoch": 0.558407605466429, + "grad_norm": 0.4842562143381104, + "learning_rate": 1.7225641564700883e-05, + "loss": 0.8667, + "num_tokens": 19639799725.0, + "step": 4699 + }, + { + "epoch": 0.5585264408793821, + "grad_norm": 0.5011592522232262, + "learning_rate": 1.722439071309082e-05, + "loss": 0.8442, + "num_tokens": 19643976107.0, + "step": 4700 + }, + { + "epoch": 0.5586452762923351, + "grad_norm": 0.4817169615033385, + "learning_rate": 1.722313963097053e-05, + "loss": 0.8631, + "num_tokens": 19648164309.0, + "step": 4701 + }, + { + "epoch": 0.5587641117052882, + "grad_norm": 0.4520113724507499, + "learning_rate": 1.7221888318386335e-05, + "loss": 0.8697, + "num_tokens": 19652333789.0, + "step": 4702 + }, + { + "epoch": 0.5588829471182413, + "grad_norm": 0.4287302176520182, + "learning_rate": 1.722063677538458e-05, + "loss": 0.8683, + "num_tokens": 19656523232.0, + "step": 4703 + }, + { + "epoch": 0.5590017825311943, + "grad_norm": 0.4772058356903854, + "learning_rate": 1.7219385002011614e-05, + "loss": 0.8711, + "num_tokens": 19660713807.0, + "step": 4704 + }, + { + "epoch": 0.5591206179441474, + "grad_norm": 0.4829658779340776, + "learning_rate": 1.7218132998313792e-05, + "loss": 0.8798, + "num_tokens": 19664902948.0, + "step": 4705 + }, + { + "epoch": 0.5592394533571005, + "grad_norm": 0.37815150735044234, + "learning_rate": 1.721688076433748e-05, + "loss": 0.8823, + "num_tokens": 19669090847.0, + "step": 4706 + }, + { + "epoch": 0.5593582887700534, + "grad_norm": 0.4930147865931186, + "learning_rate": 1.7215628300129054e-05, + "loss": 0.8603, + "num_tokens": 19673249946.0, + "step": 4707 + }, + { + "epoch": 0.5594771241830065, + "grad_norm": 0.4646718290866098, + "learning_rate": 1.7214375605734898e-05, + "loss": 0.8475, + "num_tokens": 19677421073.0, + "step": 4708 + }, + { + "epoch": 0.5595959595959596, + "grad_norm": 0.4138341317230721, + "learning_rate": 1.72131226812014e-05, + "loss": 0.8484, + "num_tokens": 19681594407.0, + "step": 4709 + }, + { + "epoch": 0.5597147950089126, + "grad_norm": 0.5407882190059119, + "learning_rate": 1.7211869526574964e-05, + "loss": 0.8622, + "num_tokens": 19685753968.0, + "step": 4710 + }, + { + "epoch": 0.5598336304218657, + "grad_norm": 0.49423481794866997, + "learning_rate": 1.7210616141901998e-05, + "loss": 0.838, + "num_tokens": 19689943768.0, + "step": 4711 + }, + { + "epoch": 0.5599524658348187, + "grad_norm": 0.47501735279799134, + "learning_rate": 1.720936252722891e-05, + "loss": 0.8642, + "num_tokens": 19694121141.0, + "step": 4712 + }, + { + "epoch": 0.5600713012477718, + "grad_norm": 0.40700783138972607, + "learning_rate": 1.7208108682602142e-05, + "loss": 0.8496, + "num_tokens": 19698311286.0, + "step": 4713 + }, + { + "epoch": 0.5601901366607249, + "grad_norm": 0.42576919008328284, + "learning_rate": 1.7206854608068114e-05, + "loss": 0.8593, + "num_tokens": 19702499983.0, + "step": 4714 + }, + { + "epoch": 0.5603089720736779, + "grad_norm": 0.44659790232431895, + "learning_rate": 1.7205600303673275e-05, + "loss": 0.8404, + "num_tokens": 19706659903.0, + "step": 4715 + }, + { + "epoch": 0.560427807486631, + "grad_norm": 0.5642287622094884, + "learning_rate": 1.720434576946408e-05, + "loss": 0.9071, + "num_tokens": 19710847867.0, + "step": 4716 + }, + { + "epoch": 0.5605466428995841, + "grad_norm": 0.4636718381279223, + "learning_rate": 1.7203091005486978e-05, + "loss": 0.8469, + "num_tokens": 19715037543.0, + "step": 4717 + }, + { + "epoch": 0.5606654783125371, + "grad_norm": 0.563172839759402, + "learning_rate": 1.7201836011788442e-05, + "loss": 0.8537, + "num_tokens": 19719227181.0, + "step": 4718 + }, + { + "epoch": 0.5607843137254902, + "grad_norm": 0.4128532204682457, + "learning_rate": 1.7200580788414955e-05, + "loss": 0.8333, + "num_tokens": 19723391609.0, + "step": 4719 + }, + { + "epoch": 0.5609031491384433, + "grad_norm": 0.4165460012128474, + "learning_rate": 1.7199325335412994e-05, + "loss": 0.8381, + "num_tokens": 19727578590.0, + "step": 4720 + }, + { + "epoch": 0.5610219845513963, + "grad_norm": 0.45825288307594486, + "learning_rate": 1.7198069652829054e-05, + "loss": 0.8521, + "num_tokens": 19731768217.0, + "step": 4721 + }, + { + "epoch": 0.5611408199643494, + "grad_norm": 0.5501474850001262, + "learning_rate": 1.719681374070964e-05, + "loss": 0.857, + "num_tokens": 19735957397.0, + "step": 4722 + }, + { + "epoch": 0.5612596553773025, + "grad_norm": 0.46279537417446315, + "learning_rate": 1.719555759910126e-05, + "loss": 0.8517, + "num_tokens": 19740124771.0, + "step": 4723 + }, + { + "epoch": 0.5613784907902555, + "grad_norm": 0.44068759729918905, + "learning_rate": 1.7194301228050436e-05, + "loss": 0.8479, + "num_tokens": 19744314288.0, + "step": 4724 + }, + { + "epoch": 0.5614973262032086, + "grad_norm": 0.3984817126650916, + "learning_rate": 1.7193044627603695e-05, + "loss": 0.8608, + "num_tokens": 19748502640.0, + "step": 4725 + }, + { + "epoch": 0.5616161616161616, + "grad_norm": 0.5665908702849985, + "learning_rate": 1.7191787797807568e-05, + "loss": 0.8546, + "num_tokens": 19752692339.0, + "step": 4726 + }, + { + "epoch": 0.5617349970291147, + "grad_norm": 0.4885307830432511, + "learning_rate": 1.7190530738708607e-05, + "loss": 0.8679, + "num_tokens": 19756819004.0, + "step": 4727 + }, + { + "epoch": 0.5618538324420678, + "grad_norm": 0.44407772849285104, + "learning_rate": 1.7189273450353362e-05, + "loss": 0.8465, + "num_tokens": 19760993786.0, + "step": 4728 + }, + { + "epoch": 0.5619726678550208, + "grad_norm": 0.4287444007586254, + "learning_rate": 1.71880159327884e-05, + "loss": 0.8681, + "num_tokens": 19765182655.0, + "step": 4729 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 0.4871613868340157, + "learning_rate": 1.718675818606028e-05, + "loss": 0.8452, + "num_tokens": 19769372225.0, + "step": 4730 + }, + { + "epoch": 0.562210338680927, + "grad_norm": 0.3923995878068182, + "learning_rate": 1.7185500210215583e-05, + "loss": 0.8607, + "num_tokens": 19773539284.0, + "step": 4731 + }, + { + "epoch": 0.56232917409388, + "grad_norm": 0.5954044587751598, + "learning_rate": 1.7184242005300907e-05, + "loss": 0.8628, + "num_tokens": 19777729054.0, + "step": 4732 + }, + { + "epoch": 0.562448009506833, + "grad_norm": 0.40553191123237475, + "learning_rate": 1.7182983571362834e-05, + "loss": 0.8347, + "num_tokens": 19781919589.0, + "step": 4733 + }, + { + "epoch": 0.5625668449197861, + "grad_norm": 0.5412932105913937, + "learning_rate": 1.7181724908447977e-05, + "loss": 0.8223, + "num_tokens": 19786108718.0, + "step": 4734 + }, + { + "epoch": 0.5626856803327391, + "grad_norm": 0.5477774558766124, + "learning_rate": 1.7180466016602946e-05, + "loss": 0.8723, + "num_tokens": 19790296640.0, + "step": 4735 + }, + { + "epoch": 0.5628045157456922, + "grad_norm": 0.4220880389306951, + "learning_rate": 1.7179206895874364e-05, + "loss": 0.8629, + "num_tokens": 19794479357.0, + "step": 4736 + }, + { + "epoch": 0.5629233511586452, + "grad_norm": 0.5447287948434022, + "learning_rate": 1.7177947546308853e-05, + "loss": 0.8634, + "num_tokens": 19798668925.0, + "step": 4737 + }, + { + "epoch": 0.5630421865715983, + "grad_norm": 0.5393778341971589, + "learning_rate": 1.7176687967953062e-05, + "loss": 0.8639, + "num_tokens": 19802858166.0, + "step": 4738 + }, + { + "epoch": 0.5631610219845514, + "grad_norm": 0.5314764243528841, + "learning_rate": 1.7175428160853625e-05, + "loss": 0.841, + "num_tokens": 19807037501.0, + "step": 4739 + }, + { + "epoch": 0.5632798573975044, + "grad_norm": 0.41179056276666387, + "learning_rate": 1.7174168125057206e-05, + "loss": 0.8489, + "num_tokens": 19811227861.0, + "step": 4740 + }, + { + "epoch": 0.5633986928104575, + "grad_norm": 0.4946833579263455, + "learning_rate": 1.7172907860610467e-05, + "loss": 0.8551, + "num_tokens": 19815400093.0, + "step": 4741 + }, + { + "epoch": 0.5635175282234106, + "grad_norm": 0.48691309976843977, + "learning_rate": 1.717164736756008e-05, + "loss": 0.8615, + "num_tokens": 19819545517.0, + "step": 4742 + }, + { + "epoch": 0.5636363636363636, + "grad_norm": 0.46525646521457675, + "learning_rate": 1.7170386645952722e-05, + "loss": 0.8502, + "num_tokens": 19823735630.0, + "step": 4743 + }, + { + "epoch": 0.5637551990493167, + "grad_norm": 0.3753163640381191, + "learning_rate": 1.7169125695835086e-05, + "loss": 0.8447, + "num_tokens": 19827912783.0, + "step": 4744 + }, + { + "epoch": 0.5638740344622698, + "grad_norm": 0.4763494354446806, + "learning_rate": 1.7167864517253866e-05, + "loss": 0.8266, + "num_tokens": 19832101290.0, + "step": 4745 + }, + { + "epoch": 0.5639928698752228, + "grad_norm": 0.4566510326492189, + "learning_rate": 1.716660311025577e-05, + "loss": 0.8825, + "num_tokens": 19836289201.0, + "step": 4746 + }, + { + "epoch": 0.5641117052881759, + "grad_norm": 0.4860345755444132, + "learning_rate": 1.716534147488751e-05, + "loss": 0.8618, + "num_tokens": 19840477673.0, + "step": 4747 + }, + { + "epoch": 0.564230540701129, + "grad_norm": 0.5145223010868889, + "learning_rate": 1.7164079611195806e-05, + "loss": 0.8759, + "num_tokens": 19844665364.0, + "step": 4748 + }, + { + "epoch": 0.564349376114082, + "grad_norm": 0.4877566406115308, + "learning_rate": 1.71628175192274e-05, + "loss": 0.8379, + "num_tokens": 19848812681.0, + "step": 4749 + }, + { + "epoch": 0.5644682115270351, + "grad_norm": 0.4500488004775183, + "learning_rate": 1.7161555199029018e-05, + "loss": 0.8762, + "num_tokens": 19853002537.0, + "step": 4750 + }, + { + "epoch": 0.5645870469399881, + "grad_norm": 0.39812567365593016, + "learning_rate": 1.716029265064742e-05, + "loss": 0.8721, + "num_tokens": 19857190878.0, + "step": 4751 + }, + { + "epoch": 0.5647058823529412, + "grad_norm": 0.5163324158448094, + "learning_rate": 1.7159029874129354e-05, + "loss": 0.8851, + "num_tokens": 19861357440.0, + "step": 4752 + }, + { + "epoch": 0.5648247177658943, + "grad_norm": 0.4247010440531958, + "learning_rate": 1.7157766869521585e-05, + "loss": 0.8923, + "num_tokens": 19865546709.0, + "step": 4753 + }, + { + "epoch": 0.5649435531788473, + "grad_norm": 0.4495039430417641, + "learning_rate": 1.7156503636870892e-05, + "loss": 0.9268, + "num_tokens": 19869734599.0, + "step": 4754 + }, + { + "epoch": 0.5650623885918004, + "grad_norm": 0.43230542294459784, + "learning_rate": 1.7155240176224053e-05, + "loss": 0.8767, + "num_tokens": 19873924251.0, + "step": 4755 + }, + { + "epoch": 0.5651812240047535, + "grad_norm": 0.4665659429407901, + "learning_rate": 1.715397648762786e-05, + "loss": 0.8774, + "num_tokens": 19878106971.0, + "step": 4756 + }, + { + "epoch": 0.5653000594177064, + "grad_norm": 0.42888166807093026, + "learning_rate": 1.7152712571129107e-05, + "loss": 0.8517, + "num_tokens": 19882296149.0, + "step": 4757 + }, + { + "epoch": 0.5654188948306595, + "grad_norm": 0.5141927667634859, + "learning_rate": 1.7151448426774607e-05, + "loss": 0.8838, + "num_tokens": 19886486162.0, + "step": 4758 + }, + { + "epoch": 0.5655377302436126, + "grad_norm": 0.4155998346971357, + "learning_rate": 1.7150184054611175e-05, + "loss": 0.8316, + "num_tokens": 19890675198.0, + "step": 4759 + }, + { + "epoch": 0.5656565656565656, + "grad_norm": 0.4440100268718133, + "learning_rate": 1.7148919454685632e-05, + "loss": 0.8815, + "num_tokens": 19894839082.0, + "step": 4760 + }, + { + "epoch": 0.5657754010695187, + "grad_norm": 0.5074926703145038, + "learning_rate": 1.714765462704481e-05, + "loss": 0.887, + "num_tokens": 19899029436.0, + "step": 4761 + }, + { + "epoch": 0.5658942364824717, + "grad_norm": 0.548817967695246, + "learning_rate": 1.7146389571735553e-05, + "loss": 0.8711, + "num_tokens": 19903220737.0, + "step": 4762 + }, + { + "epoch": 0.5660130718954248, + "grad_norm": 0.44788898164818775, + "learning_rate": 1.7145124288804708e-05, + "loss": 0.8909, + "num_tokens": 19907409504.0, + "step": 4763 + }, + { + "epoch": 0.5661319073083779, + "grad_norm": 0.48941086785846244, + "learning_rate": 1.7143858778299133e-05, + "loss": 0.8891, + "num_tokens": 19911584465.0, + "step": 4764 + }, + { + "epoch": 0.5662507427213309, + "grad_norm": 0.43795034382056064, + "learning_rate": 1.7142593040265695e-05, + "loss": 0.8483, + "num_tokens": 19915746940.0, + "step": 4765 + }, + { + "epoch": 0.566369578134284, + "grad_norm": 0.4719493149130243, + "learning_rate": 1.714132707475127e-05, + "loss": 0.8837, + "num_tokens": 19919935775.0, + "step": 4766 + }, + { + "epoch": 0.5664884135472371, + "grad_norm": 0.46302359942572807, + "learning_rate": 1.7140060881802738e-05, + "loss": 0.8825, + "num_tokens": 19924109265.0, + "step": 4767 + }, + { + "epoch": 0.5666072489601901, + "grad_norm": 0.5138931854950628, + "learning_rate": 1.713879446146699e-05, + "loss": 0.8405, + "num_tokens": 19928299441.0, + "step": 4768 + }, + { + "epoch": 0.5667260843731432, + "grad_norm": 0.5176785776795925, + "learning_rate": 1.7137527813790926e-05, + "loss": 0.836, + "num_tokens": 19932453278.0, + "step": 4769 + }, + { + "epoch": 0.5668449197860963, + "grad_norm": 0.3863364664361464, + "learning_rate": 1.7136260938821457e-05, + "loss": 0.8598, + "num_tokens": 19936643241.0, + "step": 4770 + }, + { + "epoch": 0.5669637551990493, + "grad_norm": 0.5329451498822243, + "learning_rate": 1.7134993836605503e-05, + "loss": 0.8613, + "num_tokens": 19940801452.0, + "step": 4771 + }, + { + "epoch": 0.5670825906120024, + "grad_norm": 0.40150921159270925, + "learning_rate": 1.713372650718998e-05, + "loss": 0.8622, + "num_tokens": 19944969417.0, + "step": 4772 + }, + { + "epoch": 0.5672014260249555, + "grad_norm": 0.44570914229023373, + "learning_rate": 1.7132458950621825e-05, + "loss": 0.8851, + "num_tokens": 19949157969.0, + "step": 4773 + }, + { + "epoch": 0.5673202614379085, + "grad_norm": 0.4633979071037749, + "learning_rate": 1.7131191166947983e-05, + "loss": 0.8996, + "num_tokens": 19953346135.0, + "step": 4774 + }, + { + "epoch": 0.5674390968508616, + "grad_norm": 0.48864643896573773, + "learning_rate": 1.71299231562154e-05, + "loss": 0.8845, + "num_tokens": 19957506939.0, + "step": 4775 + }, + { + "epoch": 0.5675579322638146, + "grad_norm": 0.5243269956028972, + "learning_rate": 1.7128654918471037e-05, + "loss": 0.9128, + "num_tokens": 19961670085.0, + "step": 4776 + }, + { + "epoch": 0.5676767676767677, + "grad_norm": 0.4356107308875078, + "learning_rate": 1.712738645376186e-05, + "loss": 0.8936, + "num_tokens": 19965848351.0, + "step": 4777 + }, + { + "epoch": 0.5677956030897208, + "grad_norm": 0.4640823629886041, + "learning_rate": 1.712611776213485e-05, + "loss": 0.8346, + "num_tokens": 19970037438.0, + "step": 4778 + }, + { + "epoch": 0.5679144385026738, + "grad_norm": 0.45446884706840673, + "learning_rate": 1.7124848843636983e-05, + "loss": 0.851, + "num_tokens": 19974225442.0, + "step": 4779 + }, + { + "epoch": 0.5680332739156269, + "grad_norm": 0.4205802105685168, + "learning_rate": 1.7123579698315255e-05, + "loss": 0.8198, + "num_tokens": 19978389600.0, + "step": 4780 + }, + { + "epoch": 0.56815210932858, + "grad_norm": 0.4951143449195155, + "learning_rate": 1.712231032621667e-05, + "loss": 0.8594, + "num_tokens": 19982579777.0, + "step": 4781 + }, + { + "epoch": 0.568270944741533, + "grad_norm": 0.5204577616002183, + "learning_rate": 1.7121040727388226e-05, + "loss": 0.848, + "num_tokens": 19986738722.0, + "step": 4782 + }, + { + "epoch": 0.568389780154486, + "grad_norm": 0.44871774120827035, + "learning_rate": 1.7119770901876955e-05, + "loss": 0.8597, + "num_tokens": 19990896434.0, + "step": 4783 + }, + { + "epoch": 0.5685086155674391, + "grad_norm": 0.4833230980254193, + "learning_rate": 1.7118500849729872e-05, + "loss": 0.861, + "num_tokens": 19995085104.0, + "step": 4784 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 0.44175498900526033, + "learning_rate": 1.7117230570994015e-05, + "loss": 0.8053, + "num_tokens": 19999266235.0, + "step": 4785 + }, + { + "epoch": 0.5687462863933452, + "grad_norm": 0.4588345564528099, + "learning_rate": 1.7115960065716432e-05, + "loss": 0.8572, + "num_tokens": 20003454748.0, + "step": 4786 + }, + { + "epoch": 0.5688651218062982, + "grad_norm": 0.43362997930616815, + "learning_rate": 1.7114689333944166e-05, + "loss": 0.8744, + "num_tokens": 20007644152.0, + "step": 4787 + }, + { + "epoch": 0.5689839572192513, + "grad_norm": 0.4307569471022482, + "learning_rate": 1.7113418375724276e-05, + "loss": 0.8615, + "num_tokens": 20011834263.0, + "step": 4788 + }, + { + "epoch": 0.5691027926322044, + "grad_norm": 0.4193771261797077, + "learning_rate": 1.711214719110384e-05, + "loss": 0.8386, + "num_tokens": 20016018788.0, + "step": 4789 + }, + { + "epoch": 0.5692216280451574, + "grad_norm": 0.5077069456771707, + "learning_rate": 1.7110875780129925e-05, + "loss": 0.8968, + "num_tokens": 20020207596.0, + "step": 4790 + }, + { + "epoch": 0.5693404634581105, + "grad_norm": 0.48903170671446156, + "learning_rate": 1.7109604142849617e-05, + "loss": 0.8605, + "num_tokens": 20024397422.0, + "step": 4791 + }, + { + "epoch": 0.5694592988710636, + "grad_norm": 0.395407908723474, + "learning_rate": 1.7108332279310017e-05, + "loss": 0.8509, + "num_tokens": 20028586543.0, + "step": 4792 + }, + { + "epoch": 0.5695781342840166, + "grad_norm": 0.5428277226967364, + "learning_rate": 1.7107060189558213e-05, + "loss": 0.847, + "num_tokens": 20032777273.0, + "step": 4793 + }, + { + "epoch": 0.5696969696969697, + "grad_norm": 0.4617523790498623, + "learning_rate": 1.7105787873641324e-05, + "loss": 0.8744, + "num_tokens": 20036942413.0, + "step": 4794 + }, + { + "epoch": 0.5698158051099228, + "grad_norm": 0.4084229906385924, + "learning_rate": 1.7104515331606466e-05, + "loss": 0.8806, + "num_tokens": 20041126362.0, + "step": 4795 + }, + { + "epoch": 0.5699346405228758, + "grad_norm": 0.4836421568073155, + "learning_rate": 1.7103242563500765e-05, + "loss": 0.8871, + "num_tokens": 20045313355.0, + "step": 4796 + }, + { + "epoch": 0.5700534759358289, + "grad_norm": 0.4838116709983371, + "learning_rate": 1.7101969569371358e-05, + "loss": 0.8488, + "num_tokens": 20049502591.0, + "step": 4797 + }, + { + "epoch": 0.570172311348782, + "grad_norm": 0.4590438120791878, + "learning_rate": 1.7100696349265385e-05, + "loss": 0.8852, + "num_tokens": 20053691262.0, + "step": 4798 + }, + { + "epoch": 0.570291146761735, + "grad_norm": 0.5063066623075765, + "learning_rate": 1.709942290323e-05, + "loss": 0.8974, + "num_tokens": 20057878901.0, + "step": 4799 + }, + { + "epoch": 0.5704099821746881, + "grad_norm": 0.463417063264539, + "learning_rate": 1.709814923131236e-05, + "loss": 0.8722, + "num_tokens": 20062067296.0, + "step": 4800 + }, + { + "epoch": 0.5705288175876411, + "grad_norm": 0.5131040133198916, + "learning_rate": 1.7096875333559637e-05, + "loss": 0.8692, + "num_tokens": 20066241010.0, + "step": 4801 + }, + { + "epoch": 0.5706476530005942, + "grad_norm": 0.44186340431080556, + "learning_rate": 1.709560121001901e-05, + "loss": 0.8847, + "num_tokens": 20070430350.0, + "step": 4802 + }, + { + "epoch": 0.5707664884135473, + "grad_norm": 0.4677753146485937, + "learning_rate": 1.7094326860737653e-05, + "loss": 0.8636, + "num_tokens": 20074619548.0, + "step": 4803 + }, + { + "epoch": 0.5708853238265003, + "grad_norm": 0.410922998938208, + "learning_rate": 1.7093052285762773e-05, + "loss": 0.8619, + "num_tokens": 20078807286.0, + "step": 4804 + }, + { + "epoch": 0.5710041592394534, + "grad_norm": 0.433645526969928, + "learning_rate": 1.7091777485141565e-05, + "loss": 0.8519, + "num_tokens": 20082994358.0, + "step": 4805 + }, + { + "epoch": 0.5711229946524065, + "grad_norm": 0.4301918009250838, + "learning_rate": 1.7090502458921235e-05, + "loss": 0.8685, + "num_tokens": 20087182357.0, + "step": 4806 + }, + { + "epoch": 0.5712418300653594, + "grad_norm": 0.43203861363246704, + "learning_rate": 1.708922720714901e-05, + "loss": 0.8786, + "num_tokens": 20091370643.0, + "step": 4807 + }, + { + "epoch": 0.5713606654783125, + "grad_norm": 0.4117696131641011, + "learning_rate": 1.7087951729872118e-05, + "loss": 0.8829, + "num_tokens": 20095558185.0, + "step": 4808 + }, + { + "epoch": 0.5714795008912656, + "grad_norm": 0.5051897937280964, + "learning_rate": 1.7086676027137785e-05, + "loss": 0.8518, + "num_tokens": 20099746258.0, + "step": 4809 + }, + { + "epoch": 0.5715983363042186, + "grad_norm": 0.5160722572219266, + "learning_rate": 1.7085400098993256e-05, + "loss": 0.8718, + "num_tokens": 20103936169.0, + "step": 4810 + }, + { + "epoch": 0.5717171717171717, + "grad_norm": 0.4061887682074798, + "learning_rate": 1.708412394548579e-05, + "loss": 0.8628, + "num_tokens": 20108125148.0, + "step": 4811 + }, + { + "epoch": 0.5718360071301247, + "grad_norm": 0.5339827628107108, + "learning_rate": 1.7082847566662643e-05, + "loss": 0.8957, + "num_tokens": 20112315590.0, + "step": 4812 + }, + { + "epoch": 0.5719548425430778, + "grad_norm": 0.4494892223149869, + "learning_rate": 1.7081570962571084e-05, + "loss": 0.8666, + "num_tokens": 20116505131.0, + "step": 4813 + }, + { + "epoch": 0.5720736779560309, + "grad_norm": 0.4553959279678529, + "learning_rate": 1.708029413325839e-05, + "loss": 0.8556, + "num_tokens": 20120692596.0, + "step": 4814 + }, + { + "epoch": 0.5721925133689839, + "grad_norm": 0.4062306130499523, + "learning_rate": 1.707901707877185e-05, + "loss": 0.8686, + "num_tokens": 20124882203.0, + "step": 4815 + }, + { + "epoch": 0.572311348781937, + "grad_norm": 0.5172541491052934, + "learning_rate": 1.7077739799158748e-05, + "loss": 0.8758, + "num_tokens": 20129064008.0, + "step": 4816 + }, + { + "epoch": 0.5724301841948901, + "grad_norm": 0.5049609241455023, + "learning_rate": 1.7076462294466394e-05, + "loss": 0.8754, + "num_tokens": 20133245453.0, + "step": 4817 + }, + { + "epoch": 0.5725490196078431, + "grad_norm": 0.429023096146049, + "learning_rate": 1.70751845647421e-05, + "loss": 0.8676, + "num_tokens": 20137435138.0, + "step": 4818 + }, + { + "epoch": 0.5726678550207962, + "grad_norm": 0.504674496279051, + "learning_rate": 1.707390661003317e-05, + "loss": 0.8737, + "num_tokens": 20141625341.0, + "step": 4819 + }, + { + "epoch": 0.5727866904337493, + "grad_norm": 0.5084168002148088, + "learning_rate": 1.7072628430386954e-05, + "loss": 0.8448, + "num_tokens": 20145814313.0, + "step": 4820 + }, + { + "epoch": 0.5729055258467023, + "grad_norm": 0.46769795276618165, + "learning_rate": 1.7071350025850772e-05, + "loss": 0.8905, + "num_tokens": 20150003941.0, + "step": 4821 + }, + { + "epoch": 0.5730243612596554, + "grad_norm": 0.4772405181480031, + "learning_rate": 1.7070071396471967e-05, + "loss": 0.8881, + "num_tokens": 20154193553.0, + "step": 4822 + }, + { + "epoch": 0.5731431966726085, + "grad_norm": 0.4748144404042486, + "learning_rate": 1.7068792542297896e-05, + "loss": 0.8445, + "num_tokens": 20158383159.0, + "step": 4823 + }, + { + "epoch": 0.5732620320855615, + "grad_norm": 0.5355076953674163, + "learning_rate": 1.7067513463375917e-05, + "loss": 0.9016, + "num_tokens": 20162571849.0, + "step": 4824 + }, + { + "epoch": 0.5733808674985146, + "grad_norm": 0.45794618108935375, + "learning_rate": 1.7066234159753405e-05, + "loss": 0.8532, + "num_tokens": 20166753795.0, + "step": 4825 + }, + { + "epoch": 0.5734997029114676, + "grad_norm": 0.42081928108261607, + "learning_rate": 1.7064954631477727e-05, + "loss": 0.8857, + "num_tokens": 20170943521.0, + "step": 4826 + }, + { + "epoch": 0.5736185383244207, + "grad_norm": 0.5444056831559897, + "learning_rate": 1.706367487859627e-05, + "loss": 0.8626, + "num_tokens": 20175132711.0, + "step": 4827 + }, + { + "epoch": 0.5737373737373738, + "grad_norm": 0.46037223865447163, + "learning_rate": 1.706239490115644e-05, + "loss": 0.8493, + "num_tokens": 20179322641.0, + "step": 4828 + }, + { + "epoch": 0.5738562091503268, + "grad_norm": 0.41828794125045055, + "learning_rate": 1.706111469920562e-05, + "loss": 0.8578, + "num_tokens": 20183503714.0, + "step": 4829 + }, + { + "epoch": 0.5739750445632799, + "grad_norm": 0.4830665704340589, + "learning_rate": 1.7059834272791235e-05, + "loss": 0.8191, + "num_tokens": 20187694612.0, + "step": 4830 + }, + { + "epoch": 0.574093879976233, + "grad_norm": 0.5052639851248307, + "learning_rate": 1.7058553621960695e-05, + "loss": 0.8627, + "num_tokens": 20191883557.0, + "step": 4831 + }, + { + "epoch": 0.574212715389186, + "grad_norm": 0.4419198429007171, + "learning_rate": 1.7057272746761435e-05, + "loss": 0.8486, + "num_tokens": 20196072419.0, + "step": 4832 + }, + { + "epoch": 0.574331550802139, + "grad_norm": 0.5198200466970927, + "learning_rate": 1.705599164724088e-05, + "loss": 0.8548, + "num_tokens": 20200261078.0, + "step": 4833 + }, + { + "epoch": 0.5744503862150921, + "grad_norm": 0.42219484833167226, + "learning_rate": 1.705471032344648e-05, + "loss": 0.8463, + "num_tokens": 20204449970.0, + "step": 4834 + }, + { + "epoch": 0.5745692216280451, + "grad_norm": 0.47258506283469176, + "learning_rate": 1.7053428775425688e-05, + "loss": 0.8289, + "num_tokens": 20208607182.0, + "step": 4835 + }, + { + "epoch": 0.5746880570409982, + "grad_norm": 0.4630206970525551, + "learning_rate": 1.7052147003225956e-05, + "loss": 0.8612, + "num_tokens": 20212779837.0, + "step": 4836 + }, + { + "epoch": 0.5748068924539512, + "grad_norm": 0.48018352681655213, + "learning_rate": 1.705086500689476e-05, + "loss": 0.8567, + "num_tokens": 20216967926.0, + "step": 4837 + }, + { + "epoch": 0.5749257278669043, + "grad_norm": 0.3882603598633289, + "learning_rate": 1.7049582786479576e-05, + "loss": 0.8072, + "num_tokens": 20221123594.0, + "step": 4838 + }, + { + "epoch": 0.5750445632798574, + "grad_norm": 0.5084365070710714, + "learning_rate": 1.7048300342027883e-05, + "loss": 0.8844, + "num_tokens": 20225313264.0, + "step": 4839 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 0.43786251824475586, + "learning_rate": 1.7047017673587184e-05, + "loss": 0.8251, + "num_tokens": 20229503572.0, + "step": 4840 + }, + { + "epoch": 0.5752822341057635, + "grad_norm": 0.3913120356877067, + "learning_rate": 1.7045734781204973e-05, + "loss": 0.8718, + "num_tokens": 20233675972.0, + "step": 4841 + }, + { + "epoch": 0.5754010695187166, + "grad_norm": 0.46869789673835227, + "learning_rate": 1.704445166492876e-05, + "loss": 0.9033, + "num_tokens": 20237865243.0, + "step": 4842 + }, + { + "epoch": 0.5755199049316696, + "grad_norm": 0.5320344288286982, + "learning_rate": 1.704316832480607e-05, + "loss": 0.8654, + "num_tokens": 20242053409.0, + "step": 4843 + }, + { + "epoch": 0.5756387403446227, + "grad_norm": 0.4289367731175466, + "learning_rate": 1.704188476088442e-05, + "loss": 0.8414, + "num_tokens": 20246242319.0, + "step": 4844 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 0.48119775665660525, + "learning_rate": 1.704060097321135e-05, + "loss": 0.9023, + "num_tokens": 20250431550.0, + "step": 4845 + }, + { + "epoch": 0.5758764111705288, + "grad_norm": 0.4725638902699064, + "learning_rate": 1.70393169618344e-05, + "loss": 0.8583, + "num_tokens": 20254616979.0, + "step": 4846 + }, + { + "epoch": 0.5759952465834819, + "grad_norm": 0.3606375433704757, + "learning_rate": 1.703803272680113e-05, + "loss": 0.8446, + "num_tokens": 20258786972.0, + "step": 4847 + }, + { + "epoch": 0.576114081996435, + "grad_norm": 0.5669881934323892, + "learning_rate": 1.7036748268159088e-05, + "loss": 0.8727, + "num_tokens": 20262975489.0, + "step": 4848 + }, + { + "epoch": 0.576232917409388, + "grad_norm": 0.41172481615481676, + "learning_rate": 1.7035463585955846e-05, + "loss": 0.9191, + "num_tokens": 20267163390.0, + "step": 4849 + }, + { + "epoch": 0.5763517528223411, + "grad_norm": 0.43048077359100073, + "learning_rate": 1.7034178680238984e-05, + "loss": 0.8472, + "num_tokens": 20271295154.0, + "step": 4850 + }, + { + "epoch": 0.5764705882352941, + "grad_norm": 0.5259034364092937, + "learning_rate": 1.7032893551056083e-05, + "loss": 0.8737, + "num_tokens": 20275483727.0, + "step": 4851 + }, + { + "epoch": 0.5765894236482472, + "grad_norm": 0.48346184218357374, + "learning_rate": 1.7031608198454734e-05, + "loss": 0.8743, + "num_tokens": 20279673532.0, + "step": 4852 + }, + { + "epoch": 0.5767082590612003, + "grad_norm": 0.4773907399874668, + "learning_rate": 1.7030322622482543e-05, + "loss": 0.8431, + "num_tokens": 20283838486.0, + "step": 4853 + }, + { + "epoch": 0.5768270944741533, + "grad_norm": 0.43448159953173815, + "learning_rate": 1.7029036823187114e-05, + "loss": 0.8992, + "num_tokens": 20288028966.0, + "step": 4854 + }, + { + "epoch": 0.5769459298871064, + "grad_norm": 0.47212311453931216, + "learning_rate": 1.7027750800616067e-05, + "loss": 0.8551, + "num_tokens": 20292218304.0, + "step": 4855 + }, + { + "epoch": 0.5770647653000595, + "grad_norm": 0.5105658953372246, + "learning_rate": 1.7026464554817028e-05, + "loss": 0.8813, + "num_tokens": 20296400113.0, + "step": 4856 + }, + { + "epoch": 0.5771836007130124, + "grad_norm": 0.43726808668812933, + "learning_rate": 1.7025178085837632e-05, + "loss": 0.8166, + "num_tokens": 20300551898.0, + "step": 4857 + }, + { + "epoch": 0.5773024361259655, + "grad_norm": 0.3958554586483266, + "learning_rate": 1.702389139372552e-05, + "loss": 0.8653, + "num_tokens": 20304741155.0, + "step": 4858 + }, + { + "epoch": 0.5774212715389186, + "grad_norm": 0.43744290950201115, + "learning_rate": 1.702260447852834e-05, + "loss": 0.842, + "num_tokens": 20308906382.0, + "step": 4859 + }, + { + "epoch": 0.5775401069518716, + "grad_norm": 0.6030594704671423, + "learning_rate": 1.7021317340293758e-05, + "loss": 0.8311, + "num_tokens": 20313076355.0, + "step": 4860 + }, + { + "epoch": 0.5776589423648247, + "grad_norm": 0.3765806573568773, + "learning_rate": 1.702002997906943e-05, + "loss": 0.8577, + "num_tokens": 20317266061.0, + "step": 4861 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.5339559185818015, + "learning_rate": 1.701874239490304e-05, + "loss": 0.87, + "num_tokens": 20321454450.0, + "step": 4862 + }, + { + "epoch": 0.5778966131907308, + "grad_norm": 0.5569099860922182, + "learning_rate": 1.7017454587842267e-05, + "loss": 0.8676, + "num_tokens": 20325594793.0, + "step": 4863 + }, + { + "epoch": 0.5780154486036839, + "grad_norm": 0.38924913014967866, + "learning_rate": 1.701616655793481e-05, + "loss": 0.833, + "num_tokens": 20329785961.0, + "step": 4864 + }, + { + "epoch": 0.5781342840166369, + "grad_norm": 0.6001610555180033, + "learning_rate": 1.701487830522836e-05, + "loss": 0.881, + "num_tokens": 20333975717.0, + "step": 4865 + }, + { + "epoch": 0.57825311942959, + "grad_norm": 0.43989019398288004, + "learning_rate": 1.7013589829770635e-05, + "loss": 0.8491, + "num_tokens": 20338165814.0, + "step": 4866 + }, + { + "epoch": 0.5783719548425431, + "grad_norm": 0.5827869889597286, + "learning_rate": 1.7012301131609343e-05, + "loss": 0.8658, + "num_tokens": 20342301513.0, + "step": 4867 + }, + { + "epoch": 0.5784907902554961, + "grad_norm": 0.463266666319537, + "learning_rate": 1.701101221079221e-05, + "loss": 0.8213, + "num_tokens": 20346490530.0, + "step": 4868 + }, + { + "epoch": 0.5786096256684492, + "grad_norm": 0.5035719278300314, + "learning_rate": 1.7009723067366974e-05, + "loss": 0.8355, + "num_tokens": 20350647784.0, + "step": 4869 + }, + { + "epoch": 0.5787284610814023, + "grad_norm": 0.5408191540748665, + "learning_rate": 1.700843370138137e-05, + "loss": 0.8635, + "num_tokens": 20354837187.0, + "step": 4870 + }, + { + "epoch": 0.5788472964943553, + "grad_norm": 0.4633857878966789, + "learning_rate": 1.7007144112883155e-05, + "loss": 0.8213, + "num_tokens": 20359020763.0, + "step": 4871 + }, + { + "epoch": 0.5789661319073084, + "grad_norm": 0.4243922380708208, + "learning_rate": 1.7005854301920083e-05, + "loss": 0.8435, + "num_tokens": 20363186319.0, + "step": 4872 + }, + { + "epoch": 0.5790849673202615, + "grad_norm": 0.5561399278182277, + "learning_rate": 1.700456426853992e-05, + "loss": 0.89, + "num_tokens": 20367361709.0, + "step": 4873 + }, + { + "epoch": 0.5792038027332145, + "grad_norm": 0.3691896741282481, + "learning_rate": 1.7003274012790434e-05, + "loss": 0.8453, + "num_tokens": 20371526846.0, + "step": 4874 + }, + { + "epoch": 0.5793226381461676, + "grad_norm": 0.5197105212221151, + "learning_rate": 1.700198353471942e-05, + "loss": 0.8158, + "num_tokens": 20375689689.0, + "step": 4875 + }, + { + "epoch": 0.5794414735591206, + "grad_norm": 0.44021185440055594, + "learning_rate": 1.700069283437466e-05, + "loss": 0.8947, + "num_tokens": 20379859264.0, + "step": 4876 + }, + { + "epoch": 0.5795603089720737, + "grad_norm": 0.46065094325790074, + "learning_rate": 1.6999401911803955e-05, + "loss": 0.845, + "num_tokens": 20384049595.0, + "step": 4877 + }, + { + "epoch": 0.5796791443850268, + "grad_norm": 0.419974999098561, + "learning_rate": 1.6998110767055117e-05, + "loss": 0.8634, + "num_tokens": 20388215721.0, + "step": 4878 + }, + { + "epoch": 0.5797979797979798, + "grad_norm": 0.5165636520990983, + "learning_rate": 1.6996819400175954e-05, + "loss": 0.8505, + "num_tokens": 20392401500.0, + "step": 4879 + }, + { + "epoch": 0.5799168152109329, + "grad_norm": 0.45359235614925103, + "learning_rate": 1.699552781121429e-05, + "loss": 0.9122, + "num_tokens": 20396590478.0, + "step": 4880 + }, + { + "epoch": 0.580035650623886, + "grad_norm": 0.44809467845478623, + "learning_rate": 1.6994236000217964e-05, + "loss": 0.8465, + "num_tokens": 20400780048.0, + "step": 4881 + }, + { + "epoch": 0.580154486036839, + "grad_norm": 0.5328340727911492, + "learning_rate": 1.699294396723481e-05, + "loss": 0.869, + "num_tokens": 20404936979.0, + "step": 4882 + }, + { + "epoch": 0.580273321449792, + "grad_norm": 0.5402136206996059, + "learning_rate": 1.6991651712312677e-05, + "loss": 0.8504, + "num_tokens": 20409126023.0, + "step": 4883 + }, + { + "epoch": 0.5803921568627451, + "grad_norm": 0.38035451258089664, + "learning_rate": 1.6990359235499425e-05, + "loss": 0.8502, + "num_tokens": 20413316570.0, + "step": 4884 + }, + { + "epoch": 0.5805109922756981, + "grad_norm": 0.5535003391600261, + "learning_rate": 1.698906653684292e-05, + "loss": 0.8421, + "num_tokens": 20417479148.0, + "step": 4885 + }, + { + "epoch": 0.5806298276886512, + "grad_norm": 0.4229865429004102, + "learning_rate": 1.6987773616391025e-05, + "loss": 0.8278, + "num_tokens": 20421669177.0, + "step": 4886 + }, + { + "epoch": 0.5807486631016042, + "grad_norm": 0.6627748615158852, + "learning_rate": 1.6986480474191632e-05, + "loss": 0.8596, + "num_tokens": 20425835436.0, + "step": 4887 + }, + { + "epoch": 0.5808674985145573, + "grad_norm": 0.47063791140681005, + "learning_rate": 1.6985187110292623e-05, + "loss": 0.887, + "num_tokens": 20430024263.0, + "step": 4888 + }, + { + "epoch": 0.5809863339275104, + "grad_norm": 0.47295346502377333, + "learning_rate": 1.6983893524741904e-05, + "loss": 0.86, + "num_tokens": 20434214295.0, + "step": 4889 + }, + { + "epoch": 0.5811051693404634, + "grad_norm": 0.5341951133156821, + "learning_rate": 1.6982599717587373e-05, + "loss": 0.8607, + "num_tokens": 20438396699.0, + "step": 4890 + }, + { + "epoch": 0.5812240047534165, + "grad_norm": 0.4745363151923882, + "learning_rate": 1.6981305688876947e-05, + "loss": 0.9102, + "num_tokens": 20442557838.0, + "step": 4891 + }, + { + "epoch": 0.5813428401663696, + "grad_norm": 0.534772006625409, + "learning_rate": 1.6980011438658546e-05, + "loss": 0.9019, + "num_tokens": 20446736207.0, + "step": 4892 + }, + { + "epoch": 0.5814616755793226, + "grad_norm": 0.4605511597488605, + "learning_rate": 1.6978716966980105e-05, + "loss": 0.8165, + "num_tokens": 20450908420.0, + "step": 4893 + }, + { + "epoch": 0.5815805109922757, + "grad_norm": 0.4336394642765772, + "learning_rate": 1.6977422273889562e-05, + "loss": 0.9002, + "num_tokens": 20455096817.0, + "step": 4894 + }, + { + "epoch": 0.5816993464052288, + "grad_norm": 0.46480703312568444, + "learning_rate": 1.697612735943486e-05, + "loss": 0.8562, + "num_tokens": 20459287051.0, + "step": 4895 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 0.44605708110171477, + "learning_rate": 1.6974832223663952e-05, + "loss": 0.8524, + "num_tokens": 20463461476.0, + "step": 4896 + }, + { + "epoch": 0.5819370172311349, + "grad_norm": 0.5136032711719141, + "learning_rate": 1.697353686662481e-05, + "loss": 0.8515, + "num_tokens": 20467648615.0, + "step": 4897 + }, + { + "epoch": 0.582055852644088, + "grad_norm": 0.40314453852329973, + "learning_rate": 1.69722412883654e-05, + "loss": 0.8666, + "num_tokens": 20471838453.0, + "step": 4898 + }, + { + "epoch": 0.582174688057041, + "grad_norm": 0.5019050187836226, + "learning_rate": 1.6970945488933702e-05, + "loss": 0.8474, + "num_tokens": 20476027187.0, + "step": 4899 + }, + { + "epoch": 0.5822935234699941, + "grad_norm": 0.4020046847446556, + "learning_rate": 1.6969649468377705e-05, + "loss": 0.847, + "num_tokens": 20480196363.0, + "step": 4900 + }, + { + "epoch": 0.5824123588829471, + "grad_norm": 0.3945052960390166, + "learning_rate": 1.6968353226745404e-05, + "loss": 0.8243, + "num_tokens": 20484280826.0, + "step": 4901 + }, + { + "epoch": 0.5825311942959002, + "grad_norm": 0.4931502205932421, + "learning_rate": 1.6967056764084805e-05, + "loss": 0.8336, + "num_tokens": 20488421633.0, + "step": 4902 + }, + { + "epoch": 0.5826500297088533, + "grad_norm": 0.49388381692674393, + "learning_rate": 1.6965760080443915e-05, + "loss": 0.8691, + "num_tokens": 20492608584.0, + "step": 4903 + }, + { + "epoch": 0.5827688651218063, + "grad_norm": 0.47996861805335356, + "learning_rate": 1.696446317587076e-05, + "loss": 0.8265, + "num_tokens": 20496774526.0, + "step": 4904 + }, + { + "epoch": 0.5828877005347594, + "grad_norm": 0.509060346046411, + "learning_rate": 1.696316605041337e-05, + "loss": 0.8927, + "num_tokens": 20500933606.0, + "step": 4905 + }, + { + "epoch": 0.5830065359477125, + "grad_norm": 0.3864445021681726, + "learning_rate": 1.6961868704119777e-05, + "loss": 0.8392, + "num_tokens": 20505095803.0, + "step": 4906 + }, + { + "epoch": 0.5831253713606654, + "grad_norm": 0.5620882878368706, + "learning_rate": 1.6960571137038027e-05, + "loss": 0.8524, + "num_tokens": 20509281300.0, + "step": 4907 + }, + { + "epoch": 0.5832442067736185, + "grad_norm": 0.37045231360215203, + "learning_rate": 1.6959273349216178e-05, + "loss": 0.8126, + "num_tokens": 20513469376.0, + "step": 4908 + }, + { + "epoch": 0.5833630421865716, + "grad_norm": 0.5580292599084807, + "learning_rate": 1.6957975340702287e-05, + "loss": 0.8567, + "num_tokens": 20517656977.0, + "step": 4909 + }, + { + "epoch": 0.5834818775995246, + "grad_norm": 0.4793562979343664, + "learning_rate": 1.6956677111544424e-05, + "loss": 0.8538, + "num_tokens": 20521846257.0, + "step": 4910 + }, + { + "epoch": 0.5836007130124777, + "grad_norm": 0.520704116328559, + "learning_rate": 1.695537866179067e-05, + "loss": 0.8849, + "num_tokens": 20526027299.0, + "step": 4911 + }, + { + "epoch": 0.5837195484254307, + "grad_norm": 0.47184170526687547, + "learning_rate": 1.6954079991489104e-05, + "loss": 0.8689, + "num_tokens": 20530191836.0, + "step": 4912 + }, + { + "epoch": 0.5838383838383838, + "grad_norm": 0.4448070272333262, + "learning_rate": 1.6952781100687826e-05, + "loss": 0.8352, + "num_tokens": 20534381542.0, + "step": 4913 + }, + { + "epoch": 0.5839572192513369, + "grad_norm": 0.519445225141111, + "learning_rate": 1.695148198943494e-05, + "loss": 0.8786, + "num_tokens": 20538569535.0, + "step": 4914 + }, + { + "epoch": 0.5840760546642899, + "grad_norm": 0.4750616258444337, + "learning_rate": 1.695018265777855e-05, + "loss": 0.8578, + "num_tokens": 20542745150.0, + "step": 4915 + }, + { + "epoch": 0.584194890077243, + "grad_norm": 0.4961323289367813, + "learning_rate": 1.694888310576678e-05, + "loss": 0.8717, + "num_tokens": 20546935807.0, + "step": 4916 + }, + { + "epoch": 0.5843137254901961, + "grad_norm": 0.4344996961732364, + "learning_rate": 1.6947583333447754e-05, + "loss": 0.8261, + "num_tokens": 20551098165.0, + "step": 4917 + }, + { + "epoch": 0.5844325609031491, + "grad_norm": 0.6394396366992185, + "learning_rate": 1.694628334086961e-05, + "loss": 0.8604, + "num_tokens": 20555258520.0, + "step": 4918 + }, + { + "epoch": 0.5845513963161022, + "grad_norm": 0.47838290253789983, + "learning_rate": 1.6944983128080487e-05, + "loss": 0.836, + "num_tokens": 20559447974.0, + "step": 4919 + }, + { + "epoch": 0.5846702317290553, + "grad_norm": 0.5903813201682916, + "learning_rate": 1.6943682695128538e-05, + "loss": 0.8343, + "num_tokens": 20563635950.0, + "step": 4920 + }, + { + "epoch": 0.5847890671420083, + "grad_norm": 0.4927484714371107, + "learning_rate": 1.6942382042061928e-05, + "loss": 0.8409, + "num_tokens": 20567806209.0, + "step": 4921 + }, + { + "epoch": 0.5849079025549614, + "grad_norm": 0.5824562200726241, + "learning_rate": 1.6941081168928813e-05, + "loss": 0.8566, + "num_tokens": 20571995887.0, + "step": 4922 + }, + { + "epoch": 0.5850267379679145, + "grad_norm": 0.5019399883678198, + "learning_rate": 1.6939780075777375e-05, + "loss": 0.8537, + "num_tokens": 20576185951.0, + "step": 4923 + }, + { + "epoch": 0.5851455733808675, + "grad_norm": 0.6200865409942501, + "learning_rate": 1.6938478762655798e-05, + "loss": 0.8565, + "num_tokens": 20580375321.0, + "step": 4924 + }, + { + "epoch": 0.5852644087938206, + "grad_norm": 0.5441303536890951, + "learning_rate": 1.6937177229612277e-05, + "loss": 0.87, + "num_tokens": 20584565248.0, + "step": 4925 + }, + { + "epoch": 0.5853832442067736, + "grad_norm": 0.6173394806217538, + "learning_rate": 1.693587547669501e-05, + "loss": 0.8708, + "num_tokens": 20588713789.0, + "step": 4926 + }, + { + "epoch": 0.5855020796197267, + "grad_norm": 0.5407394591743332, + "learning_rate": 1.6934573503952198e-05, + "loss": 0.8685, + "num_tokens": 20592874974.0, + "step": 4927 + }, + { + "epoch": 0.5856209150326798, + "grad_norm": 0.6344473997827551, + "learning_rate": 1.693327131143207e-05, + "loss": 0.8752, + "num_tokens": 20597055392.0, + "step": 4928 + }, + { + "epoch": 0.5857397504456328, + "grad_norm": 0.4685244604521167, + "learning_rate": 1.6931968899182844e-05, + "loss": 0.8225, + "num_tokens": 20601244893.0, + "step": 4929 + }, + { + "epoch": 0.5858585858585859, + "grad_norm": 0.6212851231378531, + "learning_rate": 1.693066626725275e-05, + "loss": 0.8532, + "num_tokens": 20605433181.0, + "step": 4930 + }, + { + "epoch": 0.585977421271539, + "grad_norm": 0.5029757443084315, + "learning_rate": 1.6929363415690038e-05, + "loss": 0.8604, + "num_tokens": 20609597112.0, + "step": 4931 + }, + { + "epoch": 0.586096256684492, + "grad_norm": 0.528121252694007, + "learning_rate": 1.6928060344542942e-05, + "loss": 0.8647, + "num_tokens": 20613785840.0, + "step": 4932 + }, + { + "epoch": 0.586215092097445, + "grad_norm": 0.4958308868064047, + "learning_rate": 1.6926757053859736e-05, + "loss": 0.861, + "num_tokens": 20617972373.0, + "step": 4933 + }, + { + "epoch": 0.5863339275103981, + "grad_norm": 0.4950841121763535, + "learning_rate": 1.6925453543688674e-05, + "loss": 0.8743, + "num_tokens": 20622150215.0, + "step": 4934 + }, + { + "epoch": 0.5864527629233511, + "grad_norm": 0.4802054637020396, + "learning_rate": 1.6924149814078033e-05, + "loss": 0.8421, + "num_tokens": 20626340458.0, + "step": 4935 + }, + { + "epoch": 0.5865715983363042, + "grad_norm": 0.4948388317297893, + "learning_rate": 1.6922845865076094e-05, + "loss": 0.8937, + "num_tokens": 20630529533.0, + "step": 4936 + }, + { + "epoch": 0.5866904337492573, + "grad_norm": 0.47128449705670467, + "learning_rate": 1.6921541696731148e-05, + "loss": 0.8738, + "num_tokens": 20634719373.0, + "step": 4937 + }, + { + "epoch": 0.5868092691622103, + "grad_norm": 0.49378896019601887, + "learning_rate": 1.6920237309091493e-05, + "loss": 0.8634, + "num_tokens": 20638905872.0, + "step": 4938 + }, + { + "epoch": 0.5869281045751634, + "grad_norm": 0.4079987758441891, + "learning_rate": 1.6918932702205434e-05, + "loss": 0.8798, + "num_tokens": 20643078028.0, + "step": 4939 + }, + { + "epoch": 0.5870469399881164, + "grad_norm": 0.49151685186214666, + "learning_rate": 1.6917627876121278e-05, + "loss": 0.8408, + "num_tokens": 20647268327.0, + "step": 4940 + }, + { + "epoch": 0.5871657754010695, + "grad_norm": 0.4182225406767248, + "learning_rate": 1.691632283088736e-05, + "loss": 0.8764, + "num_tokens": 20651456758.0, + "step": 4941 + }, + { + "epoch": 0.5872846108140226, + "grad_norm": 0.4715603235927195, + "learning_rate": 1.6915017566552003e-05, + "loss": 0.8563, + "num_tokens": 20655644719.0, + "step": 4942 + }, + { + "epoch": 0.5874034462269756, + "grad_norm": 0.4321290893629255, + "learning_rate": 1.6913712083163545e-05, + "loss": 0.8479, + "num_tokens": 20659833103.0, + "step": 4943 + }, + { + "epoch": 0.5875222816399287, + "grad_norm": 0.51600843140261, + "learning_rate": 1.6912406380770337e-05, + "loss": 0.8763, + "num_tokens": 20664002503.0, + "step": 4944 + }, + { + "epoch": 0.5876411170528818, + "grad_norm": 0.418358706698104, + "learning_rate": 1.6911100459420728e-05, + "loss": 0.8252, + "num_tokens": 20668152775.0, + "step": 4945 + }, + { + "epoch": 0.5877599524658348, + "grad_norm": 0.509195872277187, + "learning_rate": 1.6909794319163084e-05, + "loss": 0.8657, + "num_tokens": 20672316416.0, + "step": 4946 + }, + { + "epoch": 0.5878787878787879, + "grad_norm": 0.4136762700759945, + "learning_rate": 1.690848796004577e-05, + "loss": 0.8616, + "num_tokens": 20676468267.0, + "step": 4947 + }, + { + "epoch": 0.587997623291741, + "grad_norm": 0.542515176013773, + "learning_rate": 1.6907181382117177e-05, + "loss": 0.8763, + "num_tokens": 20680657916.0, + "step": 4948 + }, + { + "epoch": 0.588116458704694, + "grad_norm": 0.49617121412551624, + "learning_rate": 1.6905874585425682e-05, + "loss": 0.8905, + "num_tokens": 20684839413.0, + "step": 4949 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.4594673584754614, + "learning_rate": 1.6904567570019682e-05, + "loss": 0.8581, + "num_tokens": 20689002644.0, + "step": 4950 + }, + { + "epoch": 0.5883541295306001, + "grad_norm": 0.4461073485356652, + "learning_rate": 1.6903260335947584e-05, + "loss": 0.8917, + "num_tokens": 20693190798.0, + "step": 4951 + }, + { + "epoch": 0.5884729649435532, + "grad_norm": 0.47832985449617205, + "learning_rate": 1.690195288325779e-05, + "loss": 0.8592, + "num_tokens": 20697346234.0, + "step": 4952 + }, + { + "epoch": 0.5885918003565063, + "grad_norm": 0.4782952622466162, + "learning_rate": 1.6900645211998732e-05, + "loss": 0.8391, + "num_tokens": 20701535845.0, + "step": 4953 + }, + { + "epoch": 0.5887106357694593, + "grad_norm": 0.5019061757822738, + "learning_rate": 1.6899337322218828e-05, + "loss": 0.8656, + "num_tokens": 20705726373.0, + "step": 4954 + }, + { + "epoch": 0.5888294711824124, + "grad_norm": 0.46845763143538793, + "learning_rate": 1.689802921396652e-05, + "loss": 0.8608, + "num_tokens": 20709914772.0, + "step": 4955 + }, + { + "epoch": 0.5889483065953655, + "grad_norm": 0.53377865561488, + "learning_rate": 1.6896720887290243e-05, + "loss": 0.8634, + "num_tokens": 20714104600.0, + "step": 4956 + }, + { + "epoch": 0.5890671420083184, + "grad_norm": 0.4366881377696255, + "learning_rate": 1.689541234223846e-05, + "loss": 0.8365, + "num_tokens": 20718247955.0, + "step": 4957 + }, + { + "epoch": 0.5891859774212715, + "grad_norm": 0.5003285031170038, + "learning_rate": 1.6894103578859622e-05, + "loss": 0.8633, + "num_tokens": 20722435950.0, + "step": 4958 + }, + { + "epoch": 0.5893048128342246, + "grad_norm": 0.47099541245281223, + "learning_rate": 1.68927945972022e-05, + "loss": 0.8573, + "num_tokens": 20726609596.0, + "step": 4959 + }, + { + "epoch": 0.5894236482471776, + "grad_norm": 0.42555109135784003, + "learning_rate": 1.689148539731467e-05, + "loss": 0.8802, + "num_tokens": 20730798924.0, + "step": 4960 + }, + { + "epoch": 0.5895424836601307, + "grad_norm": 0.4537918176381125, + "learning_rate": 1.689017597924552e-05, + "loss": 0.8695, + "num_tokens": 20734961615.0, + "step": 4961 + }, + { + "epoch": 0.5896613190730838, + "grad_norm": 0.4445377115987642, + "learning_rate": 1.688886634304323e-05, + "loss": 0.8583, + "num_tokens": 20739149569.0, + "step": 4962 + }, + { + "epoch": 0.5897801544860368, + "grad_norm": 0.3759835819607905, + "learning_rate": 1.6887556488756317e-05, + "loss": 0.8986, + "num_tokens": 20743339302.0, + "step": 4963 + }, + { + "epoch": 0.5898989898989899, + "grad_norm": 0.5896457551213714, + "learning_rate": 1.688624641643328e-05, + "loss": 0.8698, + "num_tokens": 20747499780.0, + "step": 4964 + }, + { + "epoch": 0.5900178253119429, + "grad_norm": 0.4913354706491809, + "learning_rate": 1.6884936126122633e-05, + "loss": 0.8614, + "num_tokens": 20751690017.0, + "step": 4965 + }, + { + "epoch": 0.590136660724896, + "grad_norm": 0.4699302188201778, + "learning_rate": 1.6883625617872904e-05, + "loss": 0.8332, + "num_tokens": 20755878038.0, + "step": 4966 + }, + { + "epoch": 0.5902554961378491, + "grad_norm": 0.3835624730004861, + "learning_rate": 1.688231489173263e-05, + "loss": 0.827, + "num_tokens": 20760067765.0, + "step": 4967 + }, + { + "epoch": 0.5903743315508021, + "grad_norm": 0.5047662351621128, + "learning_rate": 1.6881003947750343e-05, + "loss": 0.8681, + "num_tokens": 20764257263.0, + "step": 4968 + }, + { + "epoch": 0.5904931669637552, + "grad_norm": 0.4606307916742936, + "learning_rate": 1.6879692785974596e-05, + "loss": 0.8702, + "num_tokens": 20768445482.0, + "step": 4969 + }, + { + "epoch": 0.5906120023767083, + "grad_norm": 0.45584752687757363, + "learning_rate": 1.6878381406453942e-05, + "loss": 0.8474, + "num_tokens": 20772634487.0, + "step": 4970 + }, + { + "epoch": 0.5907308377896613, + "grad_norm": 0.4622533087754394, + "learning_rate": 1.687706980923695e-05, + "loss": 0.8652, + "num_tokens": 20776823570.0, + "step": 4971 + }, + { + "epoch": 0.5908496732026144, + "grad_norm": 0.4425451062012503, + "learning_rate": 1.6875757994372198e-05, + "loss": 0.845, + "num_tokens": 20781012359.0, + "step": 4972 + }, + { + "epoch": 0.5909685086155675, + "grad_norm": 0.5229885922200476, + "learning_rate": 1.6874445961908256e-05, + "loss": 0.8815, + "num_tokens": 20785201800.0, + "step": 4973 + }, + { + "epoch": 0.5910873440285205, + "grad_norm": 0.45029756176067287, + "learning_rate": 1.6873133711893723e-05, + "loss": 0.8754, + "num_tokens": 20789365881.0, + "step": 4974 + }, + { + "epoch": 0.5912061794414736, + "grad_norm": 0.4408926975319691, + "learning_rate": 1.687182124437719e-05, + "loss": 0.8706, + "num_tokens": 20793554795.0, + "step": 4975 + }, + { + "epoch": 0.5913250148544266, + "grad_norm": 0.4486601753882866, + "learning_rate": 1.6870508559407256e-05, + "loss": 0.8697, + "num_tokens": 20797743934.0, + "step": 4976 + }, + { + "epoch": 0.5914438502673797, + "grad_norm": 0.49501183417806477, + "learning_rate": 1.6869195657032548e-05, + "loss": 0.805, + "num_tokens": 20801933344.0, + "step": 4977 + }, + { + "epoch": 0.5915626856803328, + "grad_norm": 0.43402037924733283, + "learning_rate": 1.6867882537301678e-05, + "loss": 0.8713, + "num_tokens": 20806121531.0, + "step": 4978 + }, + { + "epoch": 0.5916815210932858, + "grad_norm": 0.42861434643516055, + "learning_rate": 1.6866569200263282e-05, + "loss": 0.8487, + "num_tokens": 20810289063.0, + "step": 4979 + }, + { + "epoch": 0.5918003565062389, + "grad_norm": 0.41825260682129317, + "learning_rate": 1.686525564596599e-05, + "loss": 0.8505, + "num_tokens": 20814478861.0, + "step": 4980 + }, + { + "epoch": 0.591919191919192, + "grad_norm": 0.47469404772210083, + "learning_rate": 1.686394187445845e-05, + "loss": 0.8272, + "num_tokens": 20818667878.0, + "step": 4981 + }, + { + "epoch": 0.592038027332145, + "grad_norm": 0.3923098147714334, + "learning_rate": 1.6862627885789317e-05, + "loss": 0.8502, + "num_tokens": 20822857537.0, + "step": 4982 + }, + { + "epoch": 0.592156862745098, + "grad_norm": 0.4165242588728114, + "learning_rate": 1.686131368000725e-05, + "loss": 0.8468, + "num_tokens": 20827015735.0, + "step": 4983 + }, + { + "epoch": 0.5922756981580511, + "grad_norm": 0.4142319842248309, + "learning_rate": 1.6859999257160922e-05, + "loss": 0.8991, + "num_tokens": 20831202871.0, + "step": 4984 + }, + { + "epoch": 0.5923945335710041, + "grad_norm": 0.40758228390184276, + "learning_rate": 1.6858684617299002e-05, + "loss": 0.8599, + "num_tokens": 20835384519.0, + "step": 4985 + }, + { + "epoch": 0.5925133689839572, + "grad_norm": 0.40816532932343524, + "learning_rate": 1.6857369760470192e-05, + "loss": 0.8542, + "num_tokens": 20839574878.0, + "step": 4986 + }, + { + "epoch": 0.5926322043969103, + "grad_norm": 0.4555928260444771, + "learning_rate": 1.685605468672317e-05, + "loss": 0.8876, + "num_tokens": 20843765467.0, + "step": 4987 + }, + { + "epoch": 0.5927510398098633, + "grad_norm": 0.4463261419913265, + "learning_rate": 1.6854739396106646e-05, + "loss": 0.875, + "num_tokens": 20847928297.0, + "step": 4988 + }, + { + "epoch": 0.5928698752228164, + "grad_norm": 0.45975795433379785, + "learning_rate": 1.6853423888669323e-05, + "loss": 0.9003, + "num_tokens": 20852091679.0, + "step": 4989 + }, + { + "epoch": 0.5929887106357694, + "grad_norm": 0.47031834266430705, + "learning_rate": 1.6852108164459923e-05, + "loss": 0.8456, + "num_tokens": 20856280278.0, + "step": 4990 + }, + { + "epoch": 0.5931075460487225, + "grad_norm": 0.49277353304154614, + "learning_rate": 1.6850792223527176e-05, + "loss": 0.8623, + "num_tokens": 20860415864.0, + "step": 4991 + }, + { + "epoch": 0.5932263814616756, + "grad_norm": 0.48923863007192087, + "learning_rate": 1.6849476065919803e-05, + "loss": 0.8604, + "num_tokens": 20864605176.0, + "step": 4992 + }, + { + "epoch": 0.5933452168746286, + "grad_norm": 0.41274205005739023, + "learning_rate": 1.684815969168656e-05, + "loss": 0.8541, + "num_tokens": 20868794281.0, + "step": 4993 + }, + { + "epoch": 0.5934640522875817, + "grad_norm": 0.3930731698165571, + "learning_rate": 1.684684310087619e-05, + "loss": 0.8551, + "num_tokens": 20872982939.0, + "step": 4994 + }, + { + "epoch": 0.5935828877005348, + "grad_norm": 0.4409203294936172, + "learning_rate": 1.684552629353745e-05, + "loss": 0.8437, + "num_tokens": 20877172252.0, + "step": 4995 + }, + { + "epoch": 0.5937017231134878, + "grad_norm": 0.4547253276458369, + "learning_rate": 1.6844209269719107e-05, + "loss": 0.8549, + "num_tokens": 20881359832.0, + "step": 4996 + }, + { + "epoch": 0.5938205585264409, + "grad_norm": 0.41450681393475924, + "learning_rate": 1.6842892029469935e-05, + "loss": 0.8476, + "num_tokens": 20885548290.0, + "step": 4997 + }, + { + "epoch": 0.593939393939394, + "grad_norm": 0.4691062689104547, + "learning_rate": 1.6841574572838716e-05, + "loss": 0.8877, + "num_tokens": 20889714752.0, + "step": 4998 + }, + { + "epoch": 0.594058229352347, + "grad_norm": 0.45385419815650485, + "learning_rate": 1.684025689987424e-05, + "loss": 0.8505, + "num_tokens": 20893893289.0, + "step": 4999 + }, + { + "epoch": 0.5941770647653001, + "grad_norm": 0.4599460189611278, + "learning_rate": 1.6838939010625302e-05, + "loss": 0.8699, + "num_tokens": 20898048616.0, + "step": 5000 + }, + { + "epoch": 0.5942959001782531, + "grad_norm": 0.5161049377878053, + "learning_rate": 1.6837620905140716e-05, + "loss": 0.8888, + "num_tokens": 20902237720.0, + "step": 5001 + }, + { + "epoch": 0.5944147355912062, + "grad_norm": 0.4518213756828146, + "learning_rate": 1.6836302583469287e-05, + "loss": 0.8396, + "num_tokens": 20906423901.0, + "step": 5002 + }, + { + "epoch": 0.5945335710041593, + "grad_norm": 0.4421351828222788, + "learning_rate": 1.683498404565984e-05, + "loss": 0.8624, + "num_tokens": 20910612647.0, + "step": 5003 + }, + { + "epoch": 0.5946524064171123, + "grad_norm": 0.4567166660099924, + "learning_rate": 1.6833665291761205e-05, + "loss": 0.8565, + "num_tokens": 20914785243.0, + "step": 5004 + }, + { + "epoch": 0.5947712418300654, + "grad_norm": 0.4413879260058764, + "learning_rate": 1.683234632182222e-05, + "loss": 0.8836, + "num_tokens": 20918956674.0, + "step": 5005 + }, + { + "epoch": 0.5948900772430185, + "grad_norm": 0.42090605175898094, + "learning_rate": 1.6831027135891734e-05, + "loss": 0.8572, + "num_tokens": 20923136854.0, + "step": 5006 + }, + { + "epoch": 0.5950089126559714, + "grad_norm": 0.4722100770160738, + "learning_rate": 1.6829707734018594e-05, + "loss": 0.8492, + "num_tokens": 20927289700.0, + "step": 5007 + }, + { + "epoch": 0.5951277480689245, + "grad_norm": 0.4500384248532513, + "learning_rate": 1.682838811625167e-05, + "loss": 0.8583, + "num_tokens": 20931478386.0, + "step": 5008 + }, + { + "epoch": 0.5952465834818776, + "grad_norm": 0.47282364246669817, + "learning_rate": 1.6827068282639823e-05, + "loss": 0.8793, + "num_tokens": 20935651363.0, + "step": 5009 + }, + { + "epoch": 0.5953654188948306, + "grad_norm": 0.4653218119078582, + "learning_rate": 1.6825748233231944e-05, + "loss": 0.862, + "num_tokens": 20939840483.0, + "step": 5010 + }, + { + "epoch": 0.5954842543077837, + "grad_norm": 0.49001118531241333, + "learning_rate": 1.6824427968076905e-05, + "loss": 0.8878, + "num_tokens": 20944030167.0, + "step": 5011 + }, + { + "epoch": 0.5956030897207368, + "grad_norm": 0.4638756160458046, + "learning_rate": 1.68231074872236e-05, + "loss": 0.8783, + "num_tokens": 20948217154.0, + "step": 5012 + }, + { + "epoch": 0.5957219251336898, + "grad_norm": 0.49047254154242564, + "learning_rate": 1.6821786790720944e-05, + "loss": 0.8238, + "num_tokens": 20952407031.0, + "step": 5013 + }, + { + "epoch": 0.5958407605466429, + "grad_norm": 0.43409705162697665, + "learning_rate": 1.6820465878617837e-05, + "loss": 0.8502, + "num_tokens": 20956594238.0, + "step": 5014 + }, + { + "epoch": 0.5959595959595959, + "grad_norm": 0.48978517843183805, + "learning_rate": 1.6819144750963202e-05, + "loss": 0.8797, + "num_tokens": 20960759012.0, + "step": 5015 + }, + { + "epoch": 0.596078431372549, + "grad_norm": 0.407511695897615, + "learning_rate": 1.6817823407805957e-05, + "loss": 0.84, + "num_tokens": 20964948943.0, + "step": 5016 + }, + { + "epoch": 0.5961972667855021, + "grad_norm": 0.389333121218131, + "learning_rate": 1.6816501849195043e-05, + "loss": 0.8548, + "num_tokens": 20969084222.0, + "step": 5017 + }, + { + "epoch": 0.5963161021984551, + "grad_norm": 0.48488885288249384, + "learning_rate": 1.68151800751794e-05, + "loss": 0.8776, + "num_tokens": 20973242571.0, + "step": 5018 + }, + { + "epoch": 0.5964349376114082, + "grad_norm": 0.4531398572748504, + "learning_rate": 1.681385808580798e-05, + "loss": 0.8675, + "num_tokens": 20977431967.0, + "step": 5019 + }, + { + "epoch": 0.5965537730243613, + "grad_norm": 0.46320672235287796, + "learning_rate": 1.681253588112973e-05, + "loss": 0.8508, + "num_tokens": 20981620181.0, + "step": 5020 + }, + { + "epoch": 0.5966726084373143, + "grad_norm": 0.4708237712512896, + "learning_rate": 1.681121346119363e-05, + "loss": 0.8579, + "num_tokens": 20985786188.0, + "step": 5021 + }, + { + "epoch": 0.5967914438502674, + "grad_norm": 0.556309438772959, + "learning_rate": 1.6809890826048644e-05, + "loss": 0.8187, + "num_tokens": 20989959411.0, + "step": 5022 + }, + { + "epoch": 0.5969102792632205, + "grad_norm": 0.47199983990125066, + "learning_rate": 1.680856797574376e-05, + "loss": 0.8343, + "num_tokens": 20994149323.0, + "step": 5023 + }, + { + "epoch": 0.5970291146761735, + "grad_norm": 0.4320377078322682, + "learning_rate": 1.680724491032796e-05, + "loss": 0.8679, + "num_tokens": 20998305100.0, + "step": 5024 + }, + { + "epoch": 0.5971479500891266, + "grad_norm": 0.48779690851960233, + "learning_rate": 1.680592162985025e-05, + "loss": 0.8748, + "num_tokens": 21002450591.0, + "step": 5025 + }, + { + "epoch": 0.5972667855020796, + "grad_norm": 0.41224935288368103, + "learning_rate": 1.6804598134359632e-05, + "loss": 0.8453, + "num_tokens": 21006640004.0, + "step": 5026 + }, + { + "epoch": 0.5973856209150327, + "grad_norm": 0.5257395180567406, + "learning_rate": 1.6803274423905117e-05, + "loss": 0.792, + "num_tokens": 21010829578.0, + "step": 5027 + }, + { + "epoch": 0.5975044563279858, + "grad_norm": 0.4964546551476353, + "learning_rate": 1.680195049853573e-05, + "loss": 0.8569, + "num_tokens": 21014998246.0, + "step": 5028 + }, + { + "epoch": 0.5976232917409388, + "grad_norm": 0.4695378831866478, + "learning_rate": 1.68006263583005e-05, + "loss": 0.8721, + "num_tokens": 21019187054.0, + "step": 5029 + }, + { + "epoch": 0.5977421271538919, + "grad_norm": 0.5094711481857844, + "learning_rate": 1.6799302003248462e-05, + "loss": 0.8596, + "num_tokens": 21023375353.0, + "step": 5030 + }, + { + "epoch": 0.597860962566845, + "grad_norm": 0.501361049075119, + "learning_rate": 1.6797977433428664e-05, + "loss": 0.8551, + "num_tokens": 21027526861.0, + "step": 5031 + }, + { + "epoch": 0.597979797979798, + "grad_norm": 0.4407406033280715, + "learning_rate": 1.679665264889016e-05, + "loss": 0.8323, + "num_tokens": 21031691859.0, + "step": 5032 + }, + { + "epoch": 0.598098633392751, + "grad_norm": 0.4420904823716241, + "learning_rate": 1.6795327649682004e-05, + "loss": 0.8936, + "num_tokens": 21035879263.0, + "step": 5033 + }, + { + "epoch": 0.5982174688057041, + "grad_norm": 0.41032100472043626, + "learning_rate": 1.6794002435853276e-05, + "loss": 0.8591, + "num_tokens": 21040069431.0, + "step": 5034 + }, + { + "epoch": 0.5983363042186571, + "grad_norm": 0.5150361492538783, + "learning_rate": 1.6792677007453046e-05, + "loss": 0.8756, + "num_tokens": 21044258772.0, + "step": 5035 + }, + { + "epoch": 0.5984551396316102, + "grad_norm": 0.6031903119337693, + "learning_rate": 1.6791351364530403e-05, + "loss": 0.8622, + "num_tokens": 21048448087.0, + "step": 5036 + }, + { + "epoch": 0.5985739750445633, + "grad_norm": 0.43365785942684926, + "learning_rate": 1.6790025507134433e-05, + "loss": 0.8594, + "num_tokens": 21052638290.0, + "step": 5037 + }, + { + "epoch": 0.5986928104575163, + "grad_norm": 0.5151795765837116, + "learning_rate": 1.6788699435314245e-05, + "loss": 0.8527, + "num_tokens": 21056806572.0, + "step": 5038 + }, + { + "epoch": 0.5988116458704694, + "grad_norm": 0.4374507909563123, + "learning_rate": 1.6787373149118945e-05, + "loss": 0.8874, + "num_tokens": 21060991431.0, + "step": 5039 + }, + { + "epoch": 0.5989304812834224, + "grad_norm": 0.5144344875442489, + "learning_rate": 1.678604664859765e-05, + "loss": 0.8921, + "num_tokens": 21065180740.0, + "step": 5040 + }, + { + "epoch": 0.5990493166963755, + "grad_norm": 0.5868087147537582, + "learning_rate": 1.6784719933799483e-05, + "loss": 0.8907, + "num_tokens": 21069342484.0, + "step": 5041 + }, + { + "epoch": 0.5991681521093286, + "grad_norm": 0.3562300675637194, + "learning_rate": 1.678339300477358e-05, + "loss": 0.8429, + "num_tokens": 21073509466.0, + "step": 5042 + }, + { + "epoch": 0.5992869875222816, + "grad_norm": 0.623101326756686, + "learning_rate": 1.6782065861569077e-05, + "loss": 0.8627, + "num_tokens": 21077668565.0, + "step": 5043 + }, + { + "epoch": 0.5994058229352347, + "grad_norm": 0.4916729428475121, + "learning_rate": 1.6780738504235126e-05, + "loss": 0.8859, + "num_tokens": 21081850859.0, + "step": 5044 + }, + { + "epoch": 0.5995246583481878, + "grad_norm": 0.556433337177546, + "learning_rate": 1.6779410932820886e-05, + "loss": 0.8129, + "num_tokens": 21086040781.0, + "step": 5045 + }, + { + "epoch": 0.5996434937611408, + "grad_norm": 0.44994233656734073, + "learning_rate": 1.6778083147375518e-05, + "loss": 0.87, + "num_tokens": 21090225544.0, + "step": 5046 + }, + { + "epoch": 0.5997623291740939, + "grad_norm": 0.5503457343986546, + "learning_rate": 1.6776755147948194e-05, + "loss": 0.8056, + "num_tokens": 21094415235.0, + "step": 5047 + }, + { + "epoch": 0.599881164587047, + "grad_norm": 0.4314129948336062, + "learning_rate": 1.6775426934588093e-05, + "loss": 0.8377, + "num_tokens": 21098550342.0, + "step": 5048 + }, + { + "epoch": 0.6, + "grad_norm": 0.5340705719913587, + "learning_rate": 1.677409850734441e-05, + "loss": 0.8774, + "num_tokens": 21102738948.0, + "step": 5049 + }, + { + "epoch": 0.6001188354129531, + "grad_norm": 0.40165746060047897, + "learning_rate": 1.6772769866266333e-05, + "loss": 0.8635, + "num_tokens": 21106928412.0, + "step": 5050 + }, + { + "epoch": 0.6002376708259061, + "grad_norm": 0.5242528943570834, + "learning_rate": 1.677144101140307e-05, + "loss": 0.8205, + "num_tokens": 21111116661.0, + "step": 5051 + }, + { + "epoch": 0.6003565062388592, + "grad_norm": 0.44340926686988624, + "learning_rate": 1.6770111942803836e-05, + "loss": 0.8797, + "num_tokens": 21115305082.0, + "step": 5052 + }, + { + "epoch": 0.6004753416518123, + "grad_norm": 0.48018831652834376, + "learning_rate": 1.6768782660517843e-05, + "loss": 0.8397, + "num_tokens": 21119495101.0, + "step": 5053 + }, + { + "epoch": 0.6005941770647653, + "grad_norm": 0.39504671465049546, + "learning_rate": 1.6767453164594324e-05, + "loss": 0.8916, + "num_tokens": 21123680677.0, + "step": 5054 + }, + { + "epoch": 0.6007130124777184, + "grad_norm": 0.535395469718244, + "learning_rate": 1.676612345508252e-05, + "loss": 0.8592, + "num_tokens": 21127863372.0, + "step": 5055 + }, + { + "epoch": 0.6008318478906715, + "grad_norm": 0.4663756773740232, + "learning_rate": 1.676479353203166e-05, + "loss": 0.8618, + "num_tokens": 21132054000.0, + "step": 5056 + }, + { + "epoch": 0.6009506833036244, + "grad_norm": 0.4118420901989926, + "learning_rate": 1.6763463395491008e-05, + "loss": 0.8658, + "num_tokens": 21136243374.0, + "step": 5057 + }, + { + "epoch": 0.6010695187165775, + "grad_norm": 0.42478621550960655, + "learning_rate": 1.676213304550982e-05, + "loss": 0.8644, + "num_tokens": 21140431837.0, + "step": 5058 + }, + { + "epoch": 0.6011883541295306, + "grad_norm": 0.6392883778975956, + "learning_rate": 1.6760802482137367e-05, + "loss": 0.8366, + "num_tokens": 21144599490.0, + "step": 5059 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 0.4002071122544292, + "learning_rate": 1.6759471705422915e-05, + "loss": 0.8868, + "num_tokens": 21148788066.0, + "step": 5060 + }, + { + "epoch": 0.6014260249554367, + "grad_norm": 0.5415100232976116, + "learning_rate": 1.6758140715415754e-05, + "loss": 0.8444, + "num_tokens": 21152976697.0, + "step": 5061 + }, + { + "epoch": 0.6015448603683898, + "grad_norm": 0.5137780097676407, + "learning_rate": 1.6756809512165173e-05, + "loss": 0.8423, + "num_tokens": 21157164921.0, + "step": 5062 + }, + { + "epoch": 0.6016636957813428, + "grad_norm": 0.43076161721131273, + "learning_rate": 1.6755478095720475e-05, + "loss": 0.9033, + "num_tokens": 21161352082.0, + "step": 5063 + }, + { + "epoch": 0.6017825311942959, + "grad_norm": 0.5626091858504044, + "learning_rate": 1.6754146466130957e-05, + "loss": 0.8597, + "num_tokens": 21165541010.0, + "step": 5064 + }, + { + "epoch": 0.6019013666072489, + "grad_norm": 0.4069535488321272, + "learning_rate": 1.6752814623445944e-05, + "loss": 0.8412, + "num_tokens": 21169729791.0, + "step": 5065 + }, + { + "epoch": 0.602020202020202, + "grad_norm": 0.531886591532625, + "learning_rate": 1.6751482567714757e-05, + "loss": 0.8432, + "num_tokens": 21173918773.0, + "step": 5066 + }, + { + "epoch": 0.6021390374331551, + "grad_norm": 0.4347175393383517, + "learning_rate": 1.675015029898672e-05, + "loss": 0.9015, + "num_tokens": 21178106152.0, + "step": 5067 + }, + { + "epoch": 0.6022578728461081, + "grad_norm": 0.3902284478580528, + "learning_rate": 1.674881781731118e-05, + "loss": 0.8559, + "num_tokens": 21182293756.0, + "step": 5068 + }, + { + "epoch": 0.6023767082590612, + "grad_norm": 0.463611045361818, + "learning_rate": 1.6747485122737474e-05, + "loss": 0.8674, + "num_tokens": 21186482151.0, + "step": 5069 + }, + { + "epoch": 0.6024955436720143, + "grad_norm": 0.48808487319714206, + "learning_rate": 1.6746152215314967e-05, + "loss": 0.8776, + "num_tokens": 21190606650.0, + "step": 5070 + }, + { + "epoch": 0.6026143790849673, + "grad_norm": 0.4679775533886467, + "learning_rate": 1.674481909509301e-05, + "loss": 0.8611, + "num_tokens": 21194794993.0, + "step": 5071 + }, + { + "epoch": 0.6027332144979204, + "grad_norm": 0.458909621165923, + "learning_rate": 1.674348576212098e-05, + "loss": 0.8619, + "num_tokens": 21198984328.0, + "step": 5072 + }, + { + "epoch": 0.6028520499108735, + "grad_norm": 0.5615434321168049, + "learning_rate": 1.6742152216448254e-05, + "loss": 0.8296, + "num_tokens": 21203174199.0, + "step": 5073 + }, + { + "epoch": 0.6029708853238265, + "grad_norm": 0.42937024610658037, + "learning_rate": 1.6740818458124214e-05, + "loss": 0.8554, + "num_tokens": 21207363014.0, + "step": 5074 + }, + { + "epoch": 0.6030897207367796, + "grad_norm": 0.5497376516636253, + "learning_rate": 1.673948448719826e-05, + "loss": 0.8584, + "num_tokens": 21211552885.0, + "step": 5075 + }, + { + "epoch": 0.6032085561497326, + "grad_norm": 0.5078250499719983, + "learning_rate": 1.673815030371979e-05, + "loss": 0.8381, + "num_tokens": 21215724216.0, + "step": 5076 + }, + { + "epoch": 0.6033273915626857, + "grad_norm": 0.5209778330149158, + "learning_rate": 1.673681590773821e-05, + "loss": 0.8771, + "num_tokens": 21219913203.0, + "step": 5077 + }, + { + "epoch": 0.6034462269756388, + "grad_norm": 0.4884385078213062, + "learning_rate": 1.6735481299302938e-05, + "loss": 0.8771, + "num_tokens": 21224103359.0, + "step": 5078 + }, + { + "epoch": 0.6035650623885918, + "grad_norm": 0.4084863155887681, + "learning_rate": 1.6734146478463408e-05, + "loss": 0.8433, + "num_tokens": 21228271759.0, + "step": 5079 + }, + { + "epoch": 0.6036838978015449, + "grad_norm": 0.5313385670463542, + "learning_rate": 1.6732811445269042e-05, + "loss": 0.8716, + "num_tokens": 21232460512.0, + "step": 5080 + }, + { + "epoch": 0.603802733214498, + "grad_norm": 0.4464557720920321, + "learning_rate": 1.6731476199769287e-05, + "loss": 0.8324, + "num_tokens": 21236626918.0, + "step": 5081 + }, + { + "epoch": 0.6039215686274509, + "grad_norm": 0.5196650410332374, + "learning_rate": 1.673014074201359e-05, + "loss": 0.8712, + "num_tokens": 21240797657.0, + "step": 5082 + }, + { + "epoch": 0.604040404040404, + "grad_norm": 0.4870487543010355, + "learning_rate": 1.6728805072051403e-05, + "loss": 0.8513, + "num_tokens": 21244987129.0, + "step": 5083 + }, + { + "epoch": 0.6041592394533571, + "grad_norm": 0.39737973273434013, + "learning_rate": 1.67274691899322e-05, + "loss": 0.8929, + "num_tokens": 21249176874.0, + "step": 5084 + }, + { + "epoch": 0.6042780748663101, + "grad_norm": 0.4575433862136099, + "learning_rate": 1.6726133095705442e-05, + "loss": 0.8197, + "num_tokens": 21253362290.0, + "step": 5085 + }, + { + "epoch": 0.6043969102792632, + "grad_norm": 0.43675526461533826, + "learning_rate": 1.672479678942062e-05, + "loss": 0.8395, + "num_tokens": 21257551836.0, + "step": 5086 + }, + { + "epoch": 0.6045157456922163, + "grad_norm": 0.4832037075091, + "learning_rate": 1.6723460271127217e-05, + "loss": 0.8268, + "num_tokens": 21261739619.0, + "step": 5087 + }, + { + "epoch": 0.6046345811051693, + "grad_norm": 0.39960252862934187, + "learning_rate": 1.6722123540874728e-05, + "loss": 0.8757, + "num_tokens": 21265928343.0, + "step": 5088 + }, + { + "epoch": 0.6047534165181224, + "grad_norm": 0.4619431017956233, + "learning_rate": 1.6720786598712658e-05, + "loss": 0.7979, + "num_tokens": 21270118809.0, + "step": 5089 + }, + { + "epoch": 0.6048722519310754, + "grad_norm": 0.5676506079377875, + "learning_rate": 1.6719449444690514e-05, + "loss": 0.8542, + "num_tokens": 21274309470.0, + "step": 5090 + }, + { + "epoch": 0.6049910873440285, + "grad_norm": 0.4642396731256056, + "learning_rate": 1.6718112078857823e-05, + "loss": 0.8381, + "num_tokens": 21278499711.0, + "step": 5091 + }, + { + "epoch": 0.6051099227569816, + "grad_norm": 0.4573688180984317, + "learning_rate": 1.6716774501264108e-05, + "loss": 0.876, + "num_tokens": 21282688475.0, + "step": 5092 + }, + { + "epoch": 0.6052287581699346, + "grad_norm": 0.5494443361194083, + "learning_rate": 1.6715436711958905e-05, + "loss": 0.8753, + "num_tokens": 21286879199.0, + "step": 5093 + }, + { + "epoch": 0.6053475935828877, + "grad_norm": 0.46037761432020313, + "learning_rate": 1.6714098710991752e-05, + "loss": 0.8843, + "num_tokens": 21291069022.0, + "step": 5094 + }, + { + "epoch": 0.6054664289958408, + "grad_norm": 0.49759302593282406, + "learning_rate": 1.6712760498412205e-05, + "loss": 0.862, + "num_tokens": 21295256763.0, + "step": 5095 + }, + { + "epoch": 0.6055852644087938, + "grad_norm": 0.5119204618909391, + "learning_rate": 1.671142207426982e-05, + "loss": 0.836, + "num_tokens": 21299445649.0, + "step": 5096 + }, + { + "epoch": 0.6057040998217469, + "grad_norm": 0.5632568131595281, + "learning_rate": 1.6710083438614173e-05, + "loss": 0.8835, + "num_tokens": 21303620084.0, + "step": 5097 + }, + { + "epoch": 0.6058229352347, + "grad_norm": 0.3926192084516809, + "learning_rate": 1.670874459149482e-05, + "loss": 0.8407, + "num_tokens": 21307810253.0, + "step": 5098 + }, + { + "epoch": 0.605941770647653, + "grad_norm": 0.6074171457446669, + "learning_rate": 1.670740553296136e-05, + "loss": 0.8576, + "num_tokens": 21311989172.0, + "step": 5099 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.4142644726966017, + "learning_rate": 1.670606626306337e-05, + "loss": 0.8754, + "num_tokens": 21316162306.0, + "step": 5100 + }, + { + "epoch": 0.6061794414735591, + "grad_norm": 0.5669766325993537, + "learning_rate": 1.6704726781850463e-05, + "loss": 0.8592, + "num_tokens": 21320351169.0, + "step": 5101 + }, + { + "epoch": 0.6062982768865122, + "grad_norm": 0.47929350916321384, + "learning_rate": 1.670338708937223e-05, + "loss": 0.8398, + "num_tokens": 21324538062.0, + "step": 5102 + }, + { + "epoch": 0.6064171122994653, + "grad_norm": 0.4640315575459775, + "learning_rate": 1.6702047185678288e-05, + "loss": 0.8704, + "num_tokens": 21328724182.0, + "step": 5103 + }, + { + "epoch": 0.6065359477124183, + "grad_norm": 0.5734000189943173, + "learning_rate": 1.670070707081826e-05, + "loss": 0.8535, + "num_tokens": 21332893831.0, + "step": 5104 + }, + { + "epoch": 0.6066547831253714, + "grad_norm": 0.47090067817793746, + "learning_rate": 1.6699366744841775e-05, + "loss": 0.8889, + "num_tokens": 21337076144.0, + "step": 5105 + }, + { + "epoch": 0.6067736185383245, + "grad_norm": 0.48768893871602603, + "learning_rate": 1.669802620779847e-05, + "loss": 0.8542, + "num_tokens": 21341246422.0, + "step": 5106 + }, + { + "epoch": 0.6068924539512774, + "grad_norm": 0.4893752423747508, + "learning_rate": 1.6696685459737986e-05, + "loss": 0.8444, + "num_tokens": 21345435704.0, + "step": 5107 + }, + { + "epoch": 0.6070112893642305, + "grad_norm": 0.456077434121241, + "learning_rate": 1.6695344500709984e-05, + "loss": 0.8645, + "num_tokens": 21349615585.0, + "step": 5108 + }, + { + "epoch": 0.6071301247771836, + "grad_norm": 0.41559358988771017, + "learning_rate": 1.6694003330764114e-05, + "loss": 0.9081, + "num_tokens": 21353784376.0, + "step": 5109 + }, + { + "epoch": 0.6072489601901366, + "grad_norm": 0.3955878707890613, + "learning_rate": 1.669266194995005e-05, + "loss": 0.8483, + "num_tokens": 21357973995.0, + "step": 5110 + }, + { + "epoch": 0.6073677956030897, + "grad_norm": 0.4448583770947135, + "learning_rate": 1.669132035831747e-05, + "loss": 0.867, + "num_tokens": 21362152694.0, + "step": 5111 + }, + { + "epoch": 0.6074866310160428, + "grad_norm": 0.44590703039138, + "learning_rate": 1.6689978555916045e-05, + "loss": 0.8798, + "num_tokens": 21366333807.0, + "step": 5112 + }, + { + "epoch": 0.6076054664289958, + "grad_norm": 0.4509421623341591, + "learning_rate": 1.6688636542795487e-05, + "loss": 0.8538, + "num_tokens": 21370477693.0, + "step": 5113 + }, + { + "epoch": 0.6077243018419489, + "grad_norm": 0.42333583385878454, + "learning_rate": 1.668729431900548e-05, + "loss": 0.8642, + "num_tokens": 21374643035.0, + "step": 5114 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 0.48532259968512653, + "learning_rate": 1.6685951884595733e-05, + "loss": 0.8494, + "num_tokens": 21378831041.0, + "step": 5115 + }, + { + "epoch": 0.607961972667855, + "grad_norm": 0.4782191487738652, + "learning_rate": 1.6684609239615965e-05, + "loss": 0.8826, + "num_tokens": 21383019395.0, + "step": 5116 + }, + { + "epoch": 0.6080808080808081, + "grad_norm": 0.38514504479978096, + "learning_rate": 1.6683266384115896e-05, + "loss": 0.8562, + "num_tokens": 21387154803.0, + "step": 5117 + }, + { + "epoch": 0.6081996434937611, + "grad_norm": 0.5283334933552186, + "learning_rate": 1.6681923318145256e-05, + "loss": 0.8177, + "num_tokens": 21391307768.0, + "step": 5118 + }, + { + "epoch": 0.6083184789067142, + "grad_norm": 0.4614726777298411, + "learning_rate": 1.6680580041753787e-05, + "loss": 0.8679, + "num_tokens": 21395474348.0, + "step": 5119 + }, + { + "epoch": 0.6084373143196673, + "grad_norm": 0.4857537380865805, + "learning_rate": 1.6679236554991233e-05, + "loss": 0.8189, + "num_tokens": 21399638082.0, + "step": 5120 + }, + { + "epoch": 0.6085561497326203, + "grad_norm": 0.4534936505118022, + "learning_rate": 1.6677892857907346e-05, + "loss": 0.8189, + "num_tokens": 21403807205.0, + "step": 5121 + }, + { + "epoch": 0.6086749851455734, + "grad_norm": 0.5374983791189571, + "learning_rate": 1.6676548950551886e-05, + "loss": 0.8499, + "num_tokens": 21407996275.0, + "step": 5122 + }, + { + "epoch": 0.6087938205585265, + "grad_norm": 0.5516956577574632, + "learning_rate": 1.6675204832974633e-05, + "loss": 0.8786, + "num_tokens": 21412162032.0, + "step": 5123 + }, + { + "epoch": 0.6089126559714795, + "grad_norm": 0.36586287430034353, + "learning_rate": 1.667386050522535e-05, + "loss": 0.8688, + "num_tokens": 21416336404.0, + "step": 5124 + }, + { + "epoch": 0.6090314913844326, + "grad_norm": 0.5260616648208993, + "learning_rate": 1.667251596735383e-05, + "loss": 0.8602, + "num_tokens": 21420526685.0, + "step": 5125 + }, + { + "epoch": 0.6091503267973856, + "grad_norm": 0.5295946111426293, + "learning_rate": 1.6671171219409866e-05, + "loss": 0.897, + "num_tokens": 21424714943.0, + "step": 5126 + }, + { + "epoch": 0.6092691622103387, + "grad_norm": 0.42301001788179643, + "learning_rate": 1.6669826261443257e-05, + "loss": 0.8631, + "num_tokens": 21428903816.0, + "step": 5127 + }, + { + "epoch": 0.6093879976232918, + "grad_norm": 0.506153763847762, + "learning_rate": 1.666848109350381e-05, + "loss": 0.8996, + "num_tokens": 21433093014.0, + "step": 5128 + }, + { + "epoch": 0.6095068330362448, + "grad_norm": 0.42771198240218283, + "learning_rate": 1.6667135715641343e-05, + "loss": 0.8061, + "num_tokens": 21437282269.0, + "step": 5129 + }, + { + "epoch": 0.6096256684491979, + "grad_norm": 0.40940733087917414, + "learning_rate": 1.666579012790568e-05, + "loss": 0.879, + "num_tokens": 21441471388.0, + "step": 5130 + }, + { + "epoch": 0.609744503862151, + "grad_norm": 0.4172943253802085, + "learning_rate": 1.6664444330346654e-05, + "loss": 0.8623, + "num_tokens": 21445631800.0, + "step": 5131 + }, + { + "epoch": 0.6098633392751039, + "grad_norm": 0.4227903434312636, + "learning_rate": 1.66630983230141e-05, + "loss": 0.8769, + "num_tokens": 21449812010.0, + "step": 5132 + }, + { + "epoch": 0.609982174688057, + "grad_norm": 0.4093664087240828, + "learning_rate": 1.6661752105957868e-05, + "loss": 0.8394, + "num_tokens": 21454002118.0, + "step": 5133 + }, + { + "epoch": 0.6101010101010101, + "grad_norm": 0.4342241856356005, + "learning_rate": 1.6660405679227813e-05, + "loss": 0.8896, + "num_tokens": 21458177747.0, + "step": 5134 + }, + { + "epoch": 0.6102198455139631, + "grad_norm": 0.4259317560728627, + "learning_rate": 1.6659059042873795e-05, + "loss": 0.8744, + "num_tokens": 21462366847.0, + "step": 5135 + }, + { + "epoch": 0.6103386809269162, + "grad_norm": 0.46290404045729583, + "learning_rate": 1.665771219694569e-05, + "loss": 0.8583, + "num_tokens": 21466553035.0, + "step": 5136 + }, + { + "epoch": 0.6104575163398693, + "grad_norm": 0.4807412132394931, + "learning_rate": 1.6656365141493374e-05, + "loss": 0.8769, + "num_tokens": 21470742719.0, + "step": 5137 + }, + { + "epoch": 0.6105763517528223, + "grad_norm": 0.4530775287983449, + "learning_rate": 1.665501787656673e-05, + "loss": 0.8484, + "num_tokens": 21474931291.0, + "step": 5138 + }, + { + "epoch": 0.6106951871657754, + "grad_norm": 0.4649809530167946, + "learning_rate": 1.6653670402215652e-05, + "loss": 0.8349, + "num_tokens": 21479089734.0, + "step": 5139 + }, + { + "epoch": 0.6108140225787284, + "grad_norm": 0.4614027246655255, + "learning_rate": 1.665232271849005e-05, + "loss": 0.9042, + "num_tokens": 21483256048.0, + "step": 5140 + }, + { + "epoch": 0.6109328579916815, + "grad_norm": 0.4535399014867388, + "learning_rate": 1.665097482543982e-05, + "loss": 0.8459, + "num_tokens": 21487423303.0, + "step": 5141 + }, + { + "epoch": 0.6110516934046346, + "grad_norm": 0.4065977292717709, + "learning_rate": 1.6649626723114885e-05, + "loss": 0.8518, + "num_tokens": 21491595838.0, + "step": 5142 + }, + { + "epoch": 0.6111705288175876, + "grad_norm": 0.4189724955861982, + "learning_rate": 1.664827841156518e-05, + "loss": 0.8944, + "num_tokens": 21495769689.0, + "step": 5143 + }, + { + "epoch": 0.6112893642305407, + "grad_norm": 0.48544910675870395, + "learning_rate": 1.6646929890840618e-05, + "loss": 0.8995, + "num_tokens": 21499925870.0, + "step": 5144 + }, + { + "epoch": 0.6114081996434938, + "grad_norm": 0.49449944694976944, + "learning_rate": 1.6645581160991155e-05, + "loss": 0.8517, + "num_tokens": 21504115339.0, + "step": 5145 + }, + { + "epoch": 0.6115270350564468, + "grad_norm": 0.4382074677165655, + "learning_rate": 1.6644232222066733e-05, + "loss": 0.8625, + "num_tokens": 21508296761.0, + "step": 5146 + }, + { + "epoch": 0.6116458704693999, + "grad_norm": 0.4557562381560974, + "learning_rate": 1.6642883074117307e-05, + "loss": 0.8522, + "num_tokens": 21512487043.0, + "step": 5147 + }, + { + "epoch": 0.611764705882353, + "grad_norm": 0.47470028546834325, + "learning_rate": 1.6641533717192842e-05, + "loss": 0.8294, + "num_tokens": 21516649425.0, + "step": 5148 + }, + { + "epoch": 0.611883541295306, + "grad_norm": 0.4130971782353422, + "learning_rate": 1.664018415134331e-05, + "loss": 0.8726, + "num_tokens": 21520837453.0, + "step": 5149 + }, + { + "epoch": 0.6120023767082591, + "grad_norm": 0.4758697115315365, + "learning_rate": 1.6638834376618686e-05, + "loss": 0.8791, + "num_tokens": 21525025426.0, + "step": 5150 + }, + { + "epoch": 0.6121212121212121, + "grad_norm": 0.4533445681708397, + "learning_rate": 1.6637484393068964e-05, + "loss": 0.8676, + "num_tokens": 21529215280.0, + "step": 5151 + }, + { + "epoch": 0.6122400475341652, + "grad_norm": 0.5371769955094319, + "learning_rate": 1.6636134200744133e-05, + "loss": 0.8695, + "num_tokens": 21533404878.0, + "step": 5152 + }, + { + "epoch": 0.6123588829471183, + "grad_norm": 0.40980087103809726, + "learning_rate": 1.6634783799694192e-05, + "loss": 0.8426, + "num_tokens": 21537573119.0, + "step": 5153 + }, + { + "epoch": 0.6124777183600713, + "grad_norm": 0.4594280635339266, + "learning_rate": 1.663343318996916e-05, + "loss": 0.8814, + "num_tokens": 21541763218.0, + "step": 5154 + }, + { + "epoch": 0.6125965537730244, + "grad_norm": 0.546172346170713, + "learning_rate": 1.663208237161905e-05, + "loss": 0.8661, + "num_tokens": 21545953296.0, + "step": 5155 + }, + { + "epoch": 0.6127153891859775, + "grad_norm": 0.3685570837944041, + "learning_rate": 1.6630731344693886e-05, + "loss": 0.8704, + "num_tokens": 21550140385.0, + "step": 5156 + }, + { + "epoch": 0.6128342245989304, + "grad_norm": 0.49473680536121184, + "learning_rate": 1.66293801092437e-05, + "loss": 0.8573, + "num_tokens": 21554330119.0, + "step": 5157 + }, + { + "epoch": 0.6129530600118835, + "grad_norm": 0.5129648857275362, + "learning_rate": 1.6628028665318536e-05, + "loss": 0.8919, + "num_tokens": 21558516824.0, + "step": 5158 + }, + { + "epoch": 0.6130718954248366, + "grad_norm": 0.42886389324654733, + "learning_rate": 1.6626677012968445e-05, + "loss": 0.8686, + "num_tokens": 21562707411.0, + "step": 5159 + }, + { + "epoch": 0.6131907308377896, + "grad_norm": 0.4132269075423239, + "learning_rate": 1.6625325152243476e-05, + "loss": 0.8415, + "num_tokens": 21566895697.0, + "step": 5160 + }, + { + "epoch": 0.6133095662507427, + "grad_norm": 0.44821648682558507, + "learning_rate": 1.6623973083193697e-05, + "loss": 0.8746, + "num_tokens": 21571084608.0, + "step": 5161 + }, + { + "epoch": 0.6134284016636958, + "grad_norm": 0.5268767994796645, + "learning_rate": 1.6622620805869186e-05, + "loss": 0.8289, + "num_tokens": 21575274373.0, + "step": 5162 + }, + { + "epoch": 0.6135472370766488, + "grad_norm": 0.4454242134660817, + "learning_rate": 1.662126832032001e-05, + "loss": 0.8323, + "num_tokens": 21579455394.0, + "step": 5163 + }, + { + "epoch": 0.6136660724896019, + "grad_norm": 0.5150855555755779, + "learning_rate": 1.6619915626596263e-05, + "loss": 0.8667, + "num_tokens": 21583644363.0, + "step": 5164 + }, + { + "epoch": 0.6137849079025549, + "grad_norm": 0.4332363239184508, + "learning_rate": 1.6618562724748037e-05, + "loss": 0.8259, + "num_tokens": 21587830954.0, + "step": 5165 + }, + { + "epoch": 0.613903743315508, + "grad_norm": 0.45509260447432026, + "learning_rate": 1.6617209614825443e-05, + "loss": 0.8682, + "num_tokens": 21592022020.0, + "step": 5166 + }, + { + "epoch": 0.6140225787284611, + "grad_norm": 0.41167188470123095, + "learning_rate": 1.6615856296878578e-05, + "loss": 0.8812, + "num_tokens": 21596211135.0, + "step": 5167 + }, + { + "epoch": 0.6141414141414141, + "grad_norm": 0.4417542036134702, + "learning_rate": 1.661450277095757e-05, + "loss": 0.8812, + "num_tokens": 21600394824.0, + "step": 5168 + }, + { + "epoch": 0.6142602495543672, + "grad_norm": 0.5277427491278733, + "learning_rate": 1.661314903711254e-05, + "loss": 0.8425, + "num_tokens": 21604585166.0, + "step": 5169 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 0.3567321635695198, + "learning_rate": 1.6611795095393625e-05, + "loss": 0.8374, + "num_tokens": 21608774513.0, + "step": 5170 + }, + { + "epoch": 0.6144979203802733, + "grad_norm": 0.5399309973333928, + "learning_rate": 1.6610440945850957e-05, + "loss": 0.9001, + "num_tokens": 21612962368.0, + "step": 5171 + }, + { + "epoch": 0.6146167557932264, + "grad_norm": 0.4384175736301113, + "learning_rate": 1.66090865885347e-05, + "loss": 0.8756, + "num_tokens": 21617151740.0, + "step": 5172 + }, + { + "epoch": 0.6147355912061795, + "grad_norm": 0.4966601023198051, + "learning_rate": 1.6607732023495e-05, + "loss": 0.8987, + "num_tokens": 21621335258.0, + "step": 5173 + }, + { + "epoch": 0.6148544266191325, + "grad_norm": 0.4233022421932162, + "learning_rate": 1.6606377250782018e-05, + "loss": 0.837, + "num_tokens": 21625510226.0, + "step": 5174 + }, + { + "epoch": 0.6149732620320856, + "grad_norm": 0.4413849994134027, + "learning_rate": 1.6605022270445936e-05, + "loss": 0.8255, + "num_tokens": 21629699312.0, + "step": 5175 + }, + { + "epoch": 0.6150920974450387, + "grad_norm": 0.43264608558121864, + "learning_rate": 1.6603667082536928e-05, + "loss": 0.8769, + "num_tokens": 21633857379.0, + "step": 5176 + }, + { + "epoch": 0.6152109328579917, + "grad_norm": 0.4824440460537776, + "learning_rate": 1.6602311687105177e-05, + "loss": 0.8447, + "num_tokens": 21638021009.0, + "step": 5177 + }, + { + "epoch": 0.6153297682709448, + "grad_norm": 0.5808136935870387, + "learning_rate": 1.6600956084200888e-05, + "loss": 0.8429, + "num_tokens": 21642210897.0, + "step": 5178 + }, + { + "epoch": 0.6154486036838978, + "grad_norm": 0.39795472919550123, + "learning_rate": 1.6599600273874256e-05, + "loss": 0.8479, + "num_tokens": 21646400911.0, + "step": 5179 + }, + { + "epoch": 0.6155674390968509, + "grad_norm": 0.731607227525772, + "learning_rate": 1.659824425617549e-05, + "loss": 0.8586, + "num_tokens": 21650568064.0, + "step": 5180 + }, + { + "epoch": 0.615686274509804, + "grad_norm": 0.5156847269306682, + "learning_rate": 1.6596888031154815e-05, + "loss": 0.9023, + "num_tokens": 21654730094.0, + "step": 5181 + }, + { + "epoch": 0.6158051099227569, + "grad_norm": 0.7075860649162479, + "learning_rate": 1.659553159886245e-05, + "loss": 0.8548, + "num_tokens": 21658916116.0, + "step": 5182 + }, + { + "epoch": 0.61592394533571, + "grad_norm": 0.56013455003488, + "learning_rate": 1.6594174959348632e-05, + "loss": 0.8673, + "num_tokens": 21663104994.0, + "step": 5183 + }, + { + "epoch": 0.6160427807486631, + "grad_norm": 0.7171042115652465, + "learning_rate": 1.65928181126636e-05, + "loss": 0.8477, + "num_tokens": 21667278887.0, + "step": 5184 + }, + { + "epoch": 0.6161616161616161, + "grad_norm": 0.5611105739639416, + "learning_rate": 1.6591461058857604e-05, + "loss": 0.8718, + "num_tokens": 21671397976.0, + "step": 5185 + }, + { + "epoch": 0.6162804515745692, + "grad_norm": 0.6994549189393197, + "learning_rate": 1.65901037979809e-05, + "loss": 0.8583, + "num_tokens": 21675587920.0, + "step": 5186 + }, + { + "epoch": 0.6163992869875223, + "grad_norm": 0.5768279891619873, + "learning_rate": 1.6588746330083754e-05, + "loss": 0.8477, + "num_tokens": 21679761393.0, + "step": 5187 + }, + { + "epoch": 0.6165181224004753, + "grad_norm": 0.613034968360028, + "learning_rate": 1.658738865521643e-05, + "loss": 0.8724, + "num_tokens": 21683924237.0, + "step": 5188 + }, + { + "epoch": 0.6166369578134284, + "grad_norm": 0.5439696720376204, + "learning_rate": 1.658603077342922e-05, + "loss": 0.8573, + "num_tokens": 21688112929.0, + "step": 5189 + }, + { + "epoch": 0.6167557932263814, + "grad_norm": 0.6335541075311869, + "learning_rate": 1.65846726847724e-05, + "loss": 0.8299, + "num_tokens": 21692302221.0, + "step": 5190 + }, + { + "epoch": 0.6168746286393345, + "grad_norm": 0.5071535207572728, + "learning_rate": 1.6583314389296267e-05, + "loss": 0.8578, + "num_tokens": 21696478960.0, + "step": 5191 + }, + { + "epoch": 0.6169934640522876, + "grad_norm": 0.611129968444119, + "learning_rate": 1.6581955887051122e-05, + "loss": 0.8461, + "num_tokens": 21700654490.0, + "step": 5192 + }, + { + "epoch": 0.6171122994652406, + "grad_norm": 0.4776619230625896, + "learning_rate": 1.6580597178087285e-05, + "loss": 0.8499, + "num_tokens": 21704844050.0, + "step": 5193 + }, + { + "epoch": 0.6172311348781937, + "grad_norm": 0.5765738715062457, + "learning_rate": 1.6579238262455062e-05, + "loss": 0.8754, + "num_tokens": 21709033844.0, + "step": 5194 + }, + { + "epoch": 0.6173499702911468, + "grad_norm": 0.4819821935135368, + "learning_rate": 1.657787914020478e-05, + "loss": 0.8247, + "num_tokens": 21713216569.0, + "step": 5195 + }, + { + "epoch": 0.6174688057040998, + "grad_norm": 0.492603225520089, + "learning_rate": 1.6576519811386777e-05, + "loss": 0.8435, + "num_tokens": 21717378394.0, + "step": 5196 + }, + { + "epoch": 0.6175876411170529, + "grad_norm": 0.5362303328455115, + "learning_rate": 1.657516027605139e-05, + "loss": 0.8704, + "num_tokens": 21721566302.0, + "step": 5197 + }, + { + "epoch": 0.617706476530006, + "grad_norm": 0.4423172260100702, + "learning_rate": 1.6573800534248966e-05, + "loss": 0.867, + "num_tokens": 21725756557.0, + "step": 5198 + }, + { + "epoch": 0.617825311942959, + "grad_norm": 0.5906192185580437, + "learning_rate": 1.6572440586029865e-05, + "loss": 0.8288, + "num_tokens": 21729923374.0, + "step": 5199 + }, + { + "epoch": 0.6179441473559121, + "grad_norm": 0.4265173949012464, + "learning_rate": 1.657108043144445e-05, + "loss": 0.8587, + "num_tokens": 21734113111.0, + "step": 5200 + }, + { + "epoch": 0.6180629827688652, + "grad_norm": 0.6495994840181668, + "learning_rate": 1.656972007054309e-05, + "loss": 0.8865, + "num_tokens": 21738302896.0, + "step": 5201 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 0.4741428465256464, + "learning_rate": 1.656835950337616e-05, + "loss": 0.8532, + "num_tokens": 21742491881.0, + "step": 5202 + }, + { + "epoch": 0.6183006535947713, + "grad_norm": 0.6587361775711109, + "learning_rate": 1.6566998729994055e-05, + "loss": 0.8727, + "num_tokens": 21746681641.0, + "step": 5203 + }, + { + "epoch": 0.6184194890077243, + "grad_norm": 0.5187588727787497, + "learning_rate": 1.656563775044716e-05, + "loss": 0.8943, + "num_tokens": 21750869503.0, + "step": 5204 + }, + { + "epoch": 0.6185383244206774, + "grad_norm": 0.6625566794249786, + "learning_rate": 1.6564276564785887e-05, + "loss": 0.8599, + "num_tokens": 21755050365.0, + "step": 5205 + }, + { + "epoch": 0.6186571598336305, + "grad_norm": 0.5752721718513203, + "learning_rate": 1.6562915173060633e-05, + "loss": 0.8438, + "num_tokens": 21759233730.0, + "step": 5206 + }, + { + "epoch": 0.6187759952465834, + "grad_norm": 0.6161087482820613, + "learning_rate": 1.6561553575321826e-05, + "loss": 0.8316, + "num_tokens": 21763396104.0, + "step": 5207 + }, + { + "epoch": 0.6188948306595365, + "grad_norm": 0.5762043858409199, + "learning_rate": 1.6560191771619886e-05, + "loss": 0.8948, + "num_tokens": 21767583928.0, + "step": 5208 + }, + { + "epoch": 0.6190136660724896, + "grad_norm": 0.5384783360005732, + "learning_rate": 1.6558829762005243e-05, + "loss": 0.8628, + "num_tokens": 21771750257.0, + "step": 5209 + }, + { + "epoch": 0.6191325014854426, + "grad_norm": 0.5229794200588905, + "learning_rate": 1.655746754652834e-05, + "loss": 0.8708, + "num_tokens": 21775919084.0, + "step": 5210 + }, + { + "epoch": 0.6192513368983957, + "grad_norm": 0.5007049906205187, + "learning_rate": 1.6556105125239624e-05, + "loss": 0.8549, + "num_tokens": 21780107660.0, + "step": 5211 + }, + { + "epoch": 0.6193701723113488, + "grad_norm": 0.5265063475329047, + "learning_rate": 1.655474249818955e-05, + "loss": 0.8505, + "num_tokens": 21784239584.0, + "step": 5212 + }, + { + "epoch": 0.6194890077243018, + "grad_norm": 0.4808011260088851, + "learning_rate": 1.655337966542858e-05, + "loss": 0.8528, + "num_tokens": 21788391574.0, + "step": 5213 + }, + { + "epoch": 0.6196078431372549, + "grad_norm": 0.4327372364713091, + "learning_rate": 1.6552016627007185e-05, + "loss": 0.8929, + "num_tokens": 21792579954.0, + "step": 5214 + }, + { + "epoch": 0.6197266785502079, + "grad_norm": 0.40466078992269705, + "learning_rate": 1.6550653382975842e-05, + "loss": 0.858, + "num_tokens": 21796718482.0, + "step": 5215 + }, + { + "epoch": 0.619845513963161, + "grad_norm": 0.4570293947800885, + "learning_rate": 1.6549289933385037e-05, + "loss": 0.8861, + "num_tokens": 21800907111.0, + "step": 5216 + }, + { + "epoch": 0.6199643493761141, + "grad_norm": 0.43501670488241634, + "learning_rate": 1.654792627828526e-05, + "loss": 0.8141, + "num_tokens": 21805072533.0, + "step": 5217 + }, + { + "epoch": 0.6200831847890671, + "grad_norm": 0.43633607158202975, + "learning_rate": 1.6546562417727018e-05, + "loss": 0.9007, + "num_tokens": 21809238635.0, + "step": 5218 + }, + { + "epoch": 0.6202020202020202, + "grad_norm": 0.3689971783201624, + "learning_rate": 1.6545198351760816e-05, + "loss": 0.8354, + "num_tokens": 21813426728.0, + "step": 5219 + }, + { + "epoch": 0.6203208556149733, + "grad_norm": 0.4777130446509899, + "learning_rate": 1.6543834080437175e-05, + "loss": 0.86, + "num_tokens": 21817593703.0, + "step": 5220 + }, + { + "epoch": 0.6204396910279263, + "grad_norm": 0.4060722702267377, + "learning_rate": 1.6542469603806604e-05, + "loss": 0.9049, + "num_tokens": 21821781317.0, + "step": 5221 + }, + { + "epoch": 0.6205585264408794, + "grad_norm": 0.5181890693551021, + "learning_rate": 1.654110492191965e-05, + "loss": 0.8513, + "num_tokens": 21825970038.0, + "step": 5222 + }, + { + "epoch": 0.6206773618538325, + "grad_norm": 0.36734274348791723, + "learning_rate": 1.6539740034826847e-05, + "loss": 0.841, + "num_tokens": 21830158191.0, + "step": 5223 + }, + { + "epoch": 0.6207961972667855, + "grad_norm": 0.4977098974997541, + "learning_rate": 1.6538374942578738e-05, + "loss": 0.8453, + "num_tokens": 21834345997.0, + "step": 5224 + }, + { + "epoch": 0.6209150326797386, + "grad_norm": 0.4235183244323756, + "learning_rate": 1.653700964522588e-05, + "loss": 0.8858, + "num_tokens": 21838520764.0, + "step": 5225 + }, + { + "epoch": 0.6210338680926917, + "grad_norm": 0.5102425448245747, + "learning_rate": 1.6535644142818833e-05, + "loss": 0.8516, + "num_tokens": 21842708635.0, + "step": 5226 + }, + { + "epoch": 0.6211527035056447, + "grad_norm": 0.4319345336622126, + "learning_rate": 1.6534278435408167e-05, + "loss": 0.8285, + "num_tokens": 21846898031.0, + "step": 5227 + }, + { + "epoch": 0.6212715389185978, + "grad_norm": 0.44330904550711187, + "learning_rate": 1.653291252304446e-05, + "loss": 0.8708, + "num_tokens": 21851088245.0, + "step": 5228 + }, + { + "epoch": 0.6213903743315508, + "grad_norm": 0.47715527584245204, + "learning_rate": 1.653154640577829e-05, + "loss": 0.8479, + "num_tokens": 21855248453.0, + "step": 5229 + }, + { + "epoch": 0.6215092097445039, + "grad_norm": 0.5303532080943214, + "learning_rate": 1.653018008366026e-05, + "loss": 0.8787, + "num_tokens": 21859412235.0, + "step": 5230 + }, + { + "epoch": 0.621628045157457, + "grad_norm": 0.47020584453319086, + "learning_rate": 1.6528813556740962e-05, + "loss": 0.8541, + "num_tokens": 21863597101.0, + "step": 5231 + }, + { + "epoch": 0.6217468805704099, + "grad_norm": 0.43439131156022986, + "learning_rate": 1.6527446825071002e-05, + "loss": 0.8632, + "num_tokens": 21867771481.0, + "step": 5232 + }, + { + "epoch": 0.621865715983363, + "grad_norm": 0.4664394568485326, + "learning_rate": 1.6526079888701002e-05, + "loss": 0.8838, + "num_tokens": 21871941127.0, + "step": 5233 + }, + { + "epoch": 0.6219845513963161, + "grad_norm": 0.4184484556261784, + "learning_rate": 1.6524712747681574e-05, + "loss": 0.8808, + "num_tokens": 21876129165.0, + "step": 5234 + }, + { + "epoch": 0.6221033868092691, + "grad_norm": 0.41860026313855725, + "learning_rate": 1.652334540206336e-05, + "loss": 0.8433, + "num_tokens": 21880305554.0, + "step": 5235 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.43022017253754646, + "learning_rate": 1.6521977851896986e-05, + "loss": 0.8703, + "num_tokens": 21884483355.0, + "step": 5236 + }, + { + "epoch": 0.6223410576351753, + "grad_norm": 0.48756350000898613, + "learning_rate": 1.6520610097233104e-05, + "loss": 0.8332, + "num_tokens": 21888672055.0, + "step": 5237 + }, + { + "epoch": 0.6224598930481283, + "grad_norm": 0.43549955282186664, + "learning_rate": 1.6519242138122364e-05, + "loss": 0.852, + "num_tokens": 21892861366.0, + "step": 5238 + }, + { + "epoch": 0.6225787284610814, + "grad_norm": 0.5023522972547236, + "learning_rate": 1.6517873974615428e-05, + "loss": 0.8208, + "num_tokens": 21897052085.0, + "step": 5239 + }, + { + "epoch": 0.6226975638740344, + "grad_norm": 0.4499237413518223, + "learning_rate": 1.6516505606762955e-05, + "loss": 0.8599, + "num_tokens": 21901214235.0, + "step": 5240 + }, + { + "epoch": 0.6228163992869875, + "grad_norm": 0.4066503968241558, + "learning_rate": 1.6515137034615636e-05, + "loss": 0.8432, + "num_tokens": 21905369664.0, + "step": 5241 + }, + { + "epoch": 0.6229352346999406, + "grad_norm": 0.46635617775699684, + "learning_rate": 1.6513768258224144e-05, + "loss": 0.8926, + "num_tokens": 21909551751.0, + "step": 5242 + }, + { + "epoch": 0.6230540701128936, + "grad_norm": 0.42489913438671684, + "learning_rate": 1.6512399277639167e-05, + "loss": 0.8385, + "num_tokens": 21913741523.0, + "step": 5243 + }, + { + "epoch": 0.6231729055258467, + "grad_norm": 0.4607699407590576, + "learning_rate": 1.651103009291141e-05, + "loss": 0.8385, + "num_tokens": 21917893824.0, + "step": 5244 + }, + { + "epoch": 0.6232917409387998, + "grad_norm": 0.5375116133987206, + "learning_rate": 1.6509660704091576e-05, + "loss": 0.8686, + "num_tokens": 21922076625.0, + "step": 5245 + }, + { + "epoch": 0.6234105763517528, + "grad_norm": 0.37936978772570173, + "learning_rate": 1.6508291111230376e-05, + "loss": 0.8768, + "num_tokens": 21926266162.0, + "step": 5246 + }, + { + "epoch": 0.6235294117647059, + "grad_norm": 0.42404986401324113, + "learning_rate": 1.6506921314378532e-05, + "loss": 0.92, + "num_tokens": 21930438220.0, + "step": 5247 + }, + { + "epoch": 0.623648247177659, + "grad_norm": 0.4035582116442733, + "learning_rate": 1.6505551313586775e-05, + "loss": 0.8693, + "num_tokens": 21934628382.0, + "step": 5248 + }, + { + "epoch": 0.623767082590612, + "grad_norm": 0.4346050240391289, + "learning_rate": 1.6504181108905836e-05, + "loss": 0.8628, + "num_tokens": 21938817399.0, + "step": 5249 + }, + { + "epoch": 0.6238859180035651, + "grad_norm": 0.4407888908143348, + "learning_rate": 1.650281070038646e-05, + "loss": 0.8379, + "num_tokens": 21942989506.0, + "step": 5250 + }, + { + "epoch": 0.6240047534165182, + "grad_norm": 0.44419915711127966, + "learning_rate": 1.65014400880794e-05, + "loss": 0.8673, + "num_tokens": 21947153131.0, + "step": 5251 + }, + { + "epoch": 0.6241235888294712, + "grad_norm": 0.45849309382791553, + "learning_rate": 1.6500069272035412e-05, + "loss": 0.8421, + "num_tokens": 21951338783.0, + "step": 5252 + }, + { + "epoch": 0.6242424242424243, + "grad_norm": 0.5320326043894685, + "learning_rate": 1.649869825230526e-05, + "loss": 0.844, + "num_tokens": 21955527274.0, + "step": 5253 + }, + { + "epoch": 0.6243612596553773, + "grad_norm": 0.48240825417379307, + "learning_rate": 1.6497327028939724e-05, + "loss": 0.8818, + "num_tokens": 21959715339.0, + "step": 5254 + }, + { + "epoch": 0.6244800950683304, + "grad_norm": 0.4422093251708625, + "learning_rate": 1.6495955601989578e-05, + "loss": 0.8379, + "num_tokens": 21963885573.0, + "step": 5255 + }, + { + "epoch": 0.6245989304812835, + "grad_norm": 0.4889665063626786, + "learning_rate": 1.6494583971505614e-05, + "loss": 0.8502, + "num_tokens": 21968054257.0, + "step": 5256 + }, + { + "epoch": 0.6247177658942364, + "grad_norm": 0.38528220516720185, + "learning_rate": 1.649321213753863e-05, + "loss": 0.8683, + "num_tokens": 21972210208.0, + "step": 5257 + }, + { + "epoch": 0.6248366013071895, + "grad_norm": 0.4548614101987305, + "learning_rate": 1.649184010013943e-05, + "loss": 0.8571, + "num_tokens": 21976389513.0, + "step": 5258 + }, + { + "epoch": 0.6249554367201426, + "grad_norm": 0.46490497129335184, + "learning_rate": 1.649046785935882e-05, + "loss": 0.888, + "num_tokens": 21980579418.0, + "step": 5259 + }, + { + "epoch": 0.6250742721330956, + "grad_norm": 0.5147314094799618, + "learning_rate": 1.648909541524762e-05, + "loss": 0.8537, + "num_tokens": 21984757804.0, + "step": 5260 + }, + { + "epoch": 0.6251931075460487, + "grad_norm": 0.45732657561782153, + "learning_rate": 1.6487722767856663e-05, + "loss": 0.8621, + "num_tokens": 21988948962.0, + "step": 5261 + }, + { + "epoch": 0.6253119429590018, + "grad_norm": 0.48262586712093675, + "learning_rate": 1.648634991723677e-05, + "loss": 0.847, + "num_tokens": 21993138879.0, + "step": 5262 + }, + { + "epoch": 0.6254307783719548, + "grad_norm": 0.4068816939762007, + "learning_rate": 1.6484976863438796e-05, + "loss": 0.8286, + "num_tokens": 21997296801.0, + "step": 5263 + }, + { + "epoch": 0.6255496137849079, + "grad_norm": 0.5493110674221708, + "learning_rate": 1.648360360651358e-05, + "loss": 0.8665, + "num_tokens": 22001470044.0, + "step": 5264 + }, + { + "epoch": 0.6256684491978609, + "grad_norm": 0.4035472219511694, + "learning_rate": 1.6482230146511978e-05, + "loss": 0.806, + "num_tokens": 22005660110.0, + "step": 5265 + }, + { + "epoch": 0.625787284610814, + "grad_norm": 0.43579750427413405, + "learning_rate": 1.6480856483484868e-05, + "loss": 0.8869, + "num_tokens": 22009848971.0, + "step": 5266 + }, + { + "epoch": 0.6259061200237671, + "grad_norm": 0.5940555770837392, + "learning_rate": 1.6479482617483103e-05, + "loss": 0.8045, + "num_tokens": 22014017107.0, + "step": 5267 + }, + { + "epoch": 0.6260249554367201, + "grad_norm": 0.37207560905744297, + "learning_rate": 1.647810854855757e-05, + "loss": 0.8553, + "num_tokens": 22018205420.0, + "step": 5268 + }, + { + "epoch": 0.6261437908496732, + "grad_norm": 0.579484529408913, + "learning_rate": 1.647673427675916e-05, + "loss": 0.8841, + "num_tokens": 22022394561.0, + "step": 5269 + }, + { + "epoch": 0.6262626262626263, + "grad_norm": 0.4383024588886652, + "learning_rate": 1.647535980213876e-05, + "loss": 0.8274, + "num_tokens": 22026583224.0, + "step": 5270 + }, + { + "epoch": 0.6263814616755793, + "grad_norm": 0.518973955452835, + "learning_rate": 1.647398512474728e-05, + "loss": 0.8645, + "num_tokens": 22030773360.0, + "step": 5271 + }, + { + "epoch": 0.6265002970885324, + "grad_norm": 0.4725819607675166, + "learning_rate": 1.6472610244635613e-05, + "loss": 0.8771, + "num_tokens": 22034962783.0, + "step": 5272 + }, + { + "epoch": 0.6266191325014855, + "grad_norm": 0.44507661229135914, + "learning_rate": 1.6471235161854693e-05, + "loss": 0.8329, + "num_tokens": 22039134017.0, + "step": 5273 + }, + { + "epoch": 0.6267379679144385, + "grad_norm": 0.5189027425328675, + "learning_rate": 1.6469859876455434e-05, + "loss": 0.8084, + "num_tokens": 22043324464.0, + "step": 5274 + }, + { + "epoch": 0.6268568033273916, + "grad_norm": 0.393421974140865, + "learning_rate": 1.6468484388488767e-05, + "loss": 0.8722, + "num_tokens": 22047513582.0, + "step": 5275 + }, + { + "epoch": 0.6269756387403447, + "grad_norm": 0.40848213412830114, + "learning_rate": 1.6467108698005637e-05, + "loss": 0.8656, + "num_tokens": 22051703782.0, + "step": 5276 + }, + { + "epoch": 0.6270944741532977, + "grad_norm": 0.4998585075570851, + "learning_rate": 1.6465732805056984e-05, + "loss": 0.8658, + "num_tokens": 22055826258.0, + "step": 5277 + }, + { + "epoch": 0.6272133095662508, + "grad_norm": 0.5044879695518717, + "learning_rate": 1.646435670969377e-05, + "loss": 0.8668, + "num_tokens": 22060016285.0, + "step": 5278 + }, + { + "epoch": 0.6273321449792038, + "grad_norm": 0.5222618177195382, + "learning_rate": 1.6462980411966946e-05, + "loss": 0.8256, + "num_tokens": 22064205359.0, + "step": 5279 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.3849126410082032, + "learning_rate": 1.6461603911927492e-05, + "loss": 0.856, + "num_tokens": 22068393537.0, + "step": 5280 + }, + { + "epoch": 0.62756981580511, + "grad_norm": 0.39218998809115274, + "learning_rate": 1.6460227209626373e-05, + "loss": 0.8266, + "num_tokens": 22072582497.0, + "step": 5281 + }, + { + "epoch": 0.6276886512180629, + "grad_norm": 0.554634699300648, + "learning_rate": 1.6458850305114582e-05, + "loss": 0.8737, + "num_tokens": 22076771027.0, + "step": 5282 + }, + { + "epoch": 0.627807486631016, + "grad_norm": 0.409822714830519, + "learning_rate": 1.645747319844311e-05, + "loss": 0.8624, + "num_tokens": 22080960233.0, + "step": 5283 + }, + { + "epoch": 0.6279263220439691, + "grad_norm": 0.45146288662265627, + "learning_rate": 1.645609588966295e-05, + "loss": 0.8279, + "num_tokens": 22085149732.0, + "step": 5284 + }, + { + "epoch": 0.6280451574569221, + "grad_norm": 0.4070062644553455, + "learning_rate": 1.645471837882511e-05, + "loss": 0.8303, + "num_tokens": 22089339996.0, + "step": 5285 + }, + { + "epoch": 0.6281639928698752, + "grad_norm": 0.5142266547445331, + "learning_rate": 1.645334066598061e-05, + "loss": 0.8642, + "num_tokens": 22093518649.0, + "step": 5286 + }, + { + "epoch": 0.6282828282828283, + "grad_norm": 0.4440529038592415, + "learning_rate": 1.6451962751180463e-05, + "loss": 0.8456, + "num_tokens": 22097708589.0, + "step": 5287 + }, + { + "epoch": 0.6284016636957813, + "grad_norm": 0.41808636842143904, + "learning_rate": 1.6450584634475705e-05, + "loss": 0.864, + "num_tokens": 22101897198.0, + "step": 5288 + }, + { + "epoch": 0.6285204991087344, + "grad_norm": 0.4629851376662272, + "learning_rate": 1.6449206315917368e-05, + "loss": 0.8401, + "num_tokens": 22106063848.0, + "step": 5289 + }, + { + "epoch": 0.6286393345216874, + "grad_norm": 0.4567914153066482, + "learning_rate": 1.6447827795556498e-05, + "loss": 0.9026, + "num_tokens": 22110251538.0, + "step": 5290 + }, + { + "epoch": 0.6287581699346405, + "grad_norm": 0.4220692437886287, + "learning_rate": 1.6446449073444143e-05, + "loss": 0.8804, + "num_tokens": 22114440067.0, + "step": 5291 + }, + { + "epoch": 0.6288770053475936, + "grad_norm": 0.39549016339030485, + "learning_rate": 1.6445070149631366e-05, + "loss": 0.8849, + "num_tokens": 22118612916.0, + "step": 5292 + }, + { + "epoch": 0.6289958407605466, + "grad_norm": 0.4094167010265144, + "learning_rate": 1.644369102416923e-05, + "loss": 0.8594, + "num_tokens": 22122801902.0, + "step": 5293 + }, + { + "epoch": 0.6291146761734997, + "grad_norm": 0.5059589492115472, + "learning_rate": 1.6442311697108807e-05, + "loss": 0.8884, + "num_tokens": 22126991569.0, + "step": 5294 + }, + { + "epoch": 0.6292335115864528, + "grad_norm": 0.4507334846164256, + "learning_rate": 1.6440932168501187e-05, + "loss": 0.8494, + "num_tokens": 22131180285.0, + "step": 5295 + }, + { + "epoch": 0.6293523469994058, + "grad_norm": 0.3378691918938435, + "learning_rate": 1.643955243839745e-05, + "loss": 0.8405, + "num_tokens": 22135368194.0, + "step": 5296 + }, + { + "epoch": 0.6294711824123589, + "grad_norm": 0.46729244373388124, + "learning_rate": 1.6438172506848694e-05, + "loss": 0.8839, + "num_tokens": 22139556303.0, + "step": 5297 + }, + { + "epoch": 0.629590017825312, + "grad_norm": 0.5579418064285844, + "learning_rate": 1.6436792373906025e-05, + "loss": 0.8817, + "num_tokens": 22143744800.0, + "step": 5298 + }, + { + "epoch": 0.629708853238265, + "grad_norm": 0.43153116277913306, + "learning_rate": 1.643541203962055e-05, + "loss": 0.8441, + "num_tokens": 22147927513.0, + "step": 5299 + }, + { + "epoch": 0.6298276886512181, + "grad_norm": 0.3853013328899722, + "learning_rate": 1.6434031504043388e-05, + "loss": 0.8835, + "num_tokens": 22152104178.0, + "step": 5300 + }, + { + "epoch": 0.6299465240641712, + "grad_norm": 0.49072015364040406, + "learning_rate": 1.643265076722567e-05, + "loss": 0.8375, + "num_tokens": 22156294103.0, + "step": 5301 + }, + { + "epoch": 0.6300653594771242, + "grad_norm": 0.4886211592278736, + "learning_rate": 1.6431269829218525e-05, + "loss": 0.8509, + "num_tokens": 22160479784.0, + "step": 5302 + }, + { + "epoch": 0.6301841948900773, + "grad_norm": 0.43552770053788925, + "learning_rate": 1.6429888690073094e-05, + "loss": 0.8374, + "num_tokens": 22164637849.0, + "step": 5303 + }, + { + "epoch": 0.6303030303030303, + "grad_norm": 0.49226095975831274, + "learning_rate": 1.6428507349840524e-05, + "loss": 0.8674, + "num_tokens": 22168797306.0, + "step": 5304 + }, + { + "epoch": 0.6304218657159834, + "grad_norm": 0.4290283376460498, + "learning_rate": 1.6427125808571975e-05, + "loss": 0.8455, + "num_tokens": 22172984847.0, + "step": 5305 + }, + { + "epoch": 0.6305407011289365, + "grad_norm": 0.4246840335850721, + "learning_rate": 1.642574406631861e-05, + "loss": 0.8763, + "num_tokens": 22177174634.0, + "step": 5306 + }, + { + "epoch": 0.6306595365418894, + "grad_norm": 0.39195498729156064, + "learning_rate": 1.6424362123131595e-05, + "loss": 0.8564, + "num_tokens": 22181362968.0, + "step": 5307 + }, + { + "epoch": 0.6307783719548425, + "grad_norm": 0.42636279124453974, + "learning_rate": 1.6422979979062112e-05, + "loss": 0.8268, + "num_tokens": 22185521659.0, + "step": 5308 + }, + { + "epoch": 0.6308972073677956, + "grad_norm": 0.4458979225953897, + "learning_rate": 1.642159763416134e-05, + "loss": 0.8641, + "num_tokens": 22189710524.0, + "step": 5309 + }, + { + "epoch": 0.6310160427807486, + "grad_norm": 0.49951400285790815, + "learning_rate": 1.642021508848048e-05, + "loss": 0.8848, + "num_tokens": 22193898570.0, + "step": 5310 + }, + { + "epoch": 0.6311348781937017, + "grad_norm": 0.4067464931667428, + "learning_rate": 1.641883234207073e-05, + "loss": 0.8768, + "num_tokens": 22198088072.0, + "step": 5311 + }, + { + "epoch": 0.6312537136066548, + "grad_norm": 0.53526377413582, + "learning_rate": 1.6417449394983295e-05, + "loss": 0.8052, + "num_tokens": 22202278017.0, + "step": 5312 + }, + { + "epoch": 0.6313725490196078, + "grad_norm": 0.4959681735288025, + "learning_rate": 1.6416066247269392e-05, + "loss": 0.8571, + "num_tokens": 22206418154.0, + "step": 5313 + }, + { + "epoch": 0.6314913844325609, + "grad_norm": 0.3802228001428259, + "learning_rate": 1.6414682898980246e-05, + "loss": 0.8853, + "num_tokens": 22210607976.0, + "step": 5314 + }, + { + "epoch": 0.6316102198455139, + "grad_norm": 0.45153548128982657, + "learning_rate": 1.641329935016708e-05, + "loss": 0.8525, + "num_tokens": 22214786958.0, + "step": 5315 + }, + { + "epoch": 0.631729055258467, + "grad_norm": 0.44402910302931375, + "learning_rate": 1.641191560088114e-05, + "loss": 0.7956, + "num_tokens": 22218974412.0, + "step": 5316 + }, + { + "epoch": 0.6318478906714201, + "grad_norm": 0.49671311756578207, + "learning_rate": 1.6410531651173668e-05, + "loss": 0.853, + "num_tokens": 22223147624.0, + "step": 5317 + }, + { + "epoch": 0.6319667260843731, + "grad_norm": 0.4637071366504775, + "learning_rate": 1.6409147501095913e-05, + "loss": 0.8352, + "num_tokens": 22227337673.0, + "step": 5318 + }, + { + "epoch": 0.6320855614973262, + "grad_norm": 0.36245140615438975, + "learning_rate": 1.640776315069914e-05, + "loss": 0.8561, + "num_tokens": 22231507494.0, + "step": 5319 + }, + { + "epoch": 0.6322043969102793, + "grad_norm": 0.48685204672821064, + "learning_rate": 1.6406378600034612e-05, + "loss": 0.8381, + "num_tokens": 22235662812.0, + "step": 5320 + }, + { + "epoch": 0.6323232323232323, + "grad_norm": 0.437100370260299, + "learning_rate": 1.6404993849153607e-05, + "loss": 0.8639, + "num_tokens": 22239838547.0, + "step": 5321 + }, + { + "epoch": 0.6324420677361854, + "grad_norm": 0.3821845270108673, + "learning_rate": 1.6403608898107403e-05, + "loss": 0.8346, + "num_tokens": 22244010713.0, + "step": 5322 + }, + { + "epoch": 0.6325609031491385, + "grad_norm": 0.491625914536758, + "learning_rate": 1.640222374694729e-05, + "loss": 0.8194, + "num_tokens": 22248201272.0, + "step": 5323 + }, + { + "epoch": 0.6326797385620915, + "grad_norm": 0.41010535329618136, + "learning_rate": 1.6400838395724568e-05, + "loss": 0.9274, + "num_tokens": 22252390432.0, + "step": 5324 + }, + { + "epoch": 0.6327985739750446, + "grad_norm": 0.40385534949591634, + "learning_rate": 1.6399452844490537e-05, + "loss": 0.8543, + "num_tokens": 22256579389.0, + "step": 5325 + }, + { + "epoch": 0.6329174093879977, + "grad_norm": 0.3948153316220406, + "learning_rate": 1.6398067093296516e-05, + "loss": 0.8978, + "num_tokens": 22260768609.0, + "step": 5326 + }, + { + "epoch": 0.6330362448009507, + "grad_norm": 0.4890826783481907, + "learning_rate": 1.6396681142193816e-05, + "loss": 0.8522, + "num_tokens": 22264913517.0, + "step": 5327 + }, + { + "epoch": 0.6331550802139038, + "grad_norm": 0.34136615425468014, + "learning_rate": 1.639529499123377e-05, + "loss": 0.8274, + "num_tokens": 22269102295.0, + "step": 5328 + }, + { + "epoch": 0.6332739156268568, + "grad_norm": 0.4409880864432364, + "learning_rate": 1.6393908640467704e-05, + "loss": 0.8472, + "num_tokens": 22273291990.0, + "step": 5329 + }, + { + "epoch": 0.6333927510398099, + "grad_norm": 0.4855747656068538, + "learning_rate": 1.6392522089946964e-05, + "loss": 0.8475, + "num_tokens": 22277481637.0, + "step": 5330 + }, + { + "epoch": 0.633511586452763, + "grad_norm": 0.372822984486927, + "learning_rate": 1.6391135339722905e-05, + "loss": 0.8345, + "num_tokens": 22281671367.0, + "step": 5331 + }, + { + "epoch": 0.6336304218657159, + "grad_norm": 0.5599057678886031, + "learning_rate": 1.6389748389846868e-05, + "loss": 0.876, + "num_tokens": 22285829450.0, + "step": 5332 + }, + { + "epoch": 0.633749257278669, + "grad_norm": 0.4501458757454767, + "learning_rate": 1.6388361240370228e-05, + "loss": 0.8682, + "num_tokens": 22289991880.0, + "step": 5333 + }, + { + "epoch": 0.6338680926916221, + "grad_norm": 0.5607276246929318, + "learning_rate": 1.6386973891344355e-05, + "loss": 0.8778, + "num_tokens": 22294181260.0, + "step": 5334 + }, + { + "epoch": 0.6339869281045751, + "grad_norm": 0.42973585884926446, + "learning_rate": 1.6385586342820627e-05, + "loss": 0.8538, + "num_tokens": 22298359595.0, + "step": 5335 + }, + { + "epoch": 0.6341057635175282, + "grad_norm": 0.5718332667077959, + "learning_rate": 1.638419859485042e-05, + "loss": 0.8687, + "num_tokens": 22302547291.0, + "step": 5336 + }, + { + "epoch": 0.6342245989304813, + "grad_norm": 0.4746446975028021, + "learning_rate": 1.638281064748514e-05, + "loss": 0.8474, + "num_tokens": 22306731668.0, + "step": 5337 + }, + { + "epoch": 0.6343434343434343, + "grad_norm": 0.4984284262157028, + "learning_rate": 1.638142250077618e-05, + "loss": 0.8166, + "num_tokens": 22310921088.0, + "step": 5338 + }, + { + "epoch": 0.6344622697563874, + "grad_norm": 0.45594709162207203, + "learning_rate": 1.638003415477495e-05, + "loss": 0.8672, + "num_tokens": 22315110094.0, + "step": 5339 + }, + { + "epoch": 0.6345811051693404, + "grad_norm": 0.4838221148927669, + "learning_rate": 1.6378645609532867e-05, + "loss": 0.8702, + "num_tokens": 22319266530.0, + "step": 5340 + }, + { + "epoch": 0.6346999405822935, + "grad_norm": 0.48466674997332304, + "learning_rate": 1.637725686510135e-05, + "loss": 0.8508, + "num_tokens": 22323438138.0, + "step": 5341 + }, + { + "epoch": 0.6348187759952466, + "grad_norm": 0.5391312801899455, + "learning_rate": 1.637586792153183e-05, + "loss": 0.8655, + "num_tokens": 22327601084.0, + "step": 5342 + }, + { + "epoch": 0.6349376114081996, + "grad_norm": 0.42319877195015027, + "learning_rate": 1.6374478778875744e-05, + "loss": 0.8842, + "num_tokens": 22331781750.0, + "step": 5343 + }, + { + "epoch": 0.6350564468211527, + "grad_norm": 0.5130010043807555, + "learning_rate": 1.637308943718454e-05, + "loss": 0.8573, + "num_tokens": 22335920709.0, + "step": 5344 + }, + { + "epoch": 0.6351752822341058, + "grad_norm": 0.4435514320027966, + "learning_rate": 1.6371699896509665e-05, + "loss": 0.8675, + "num_tokens": 22340110995.0, + "step": 5345 + }, + { + "epoch": 0.6352941176470588, + "grad_norm": 0.5224992929012257, + "learning_rate": 1.6370310156902583e-05, + "loss": 0.8471, + "num_tokens": 22344301077.0, + "step": 5346 + }, + { + "epoch": 0.6354129530600119, + "grad_norm": 0.4577129658271342, + "learning_rate": 1.6368920218414757e-05, + "loss": 0.8612, + "num_tokens": 22348467447.0, + "step": 5347 + }, + { + "epoch": 0.635531788472965, + "grad_norm": 0.44868153469225325, + "learning_rate": 1.636753008109766e-05, + "loss": 0.8769, + "num_tokens": 22352656916.0, + "step": 5348 + }, + { + "epoch": 0.635650623885918, + "grad_norm": 0.4433952258631549, + "learning_rate": 1.636613974500278e-05, + "loss": 0.8475, + "num_tokens": 22356846529.0, + "step": 5349 + }, + { + "epoch": 0.6357694592988711, + "grad_norm": 0.4575565357719062, + "learning_rate": 1.63647492101816e-05, + "loss": 0.8546, + "num_tokens": 22361024180.0, + "step": 5350 + }, + { + "epoch": 0.6358882947118242, + "grad_norm": 0.3821852205803296, + "learning_rate": 1.6363358476685614e-05, + "loss": 0.8632, + "num_tokens": 22365180886.0, + "step": 5351 + }, + { + "epoch": 0.6360071301247772, + "grad_norm": 0.4424921926435696, + "learning_rate": 1.6361967544566338e-05, + "loss": 0.8953, + "num_tokens": 22369369632.0, + "step": 5352 + }, + { + "epoch": 0.6361259655377303, + "grad_norm": 0.4143774910434393, + "learning_rate": 1.6360576413875272e-05, + "loss": 0.8695, + "num_tokens": 22373558842.0, + "step": 5353 + }, + { + "epoch": 0.6362448009506833, + "grad_norm": 0.4987361631147512, + "learning_rate": 1.6359185084663935e-05, + "loss": 0.8493, + "num_tokens": 22377747658.0, + "step": 5354 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.3975428893246186, + "learning_rate": 1.6357793556983854e-05, + "loss": 0.8992, + "num_tokens": 22381921719.0, + "step": 5355 + }, + { + "epoch": 0.6364824717765895, + "grad_norm": 0.4318565219525651, + "learning_rate": 1.6356401830886562e-05, + "loss": 0.8307, + "num_tokens": 22386112118.0, + "step": 5356 + }, + { + "epoch": 0.6366013071895424, + "grad_norm": 0.404354392242026, + "learning_rate": 1.63550099064236e-05, + "loss": 0.8369, + "num_tokens": 22390300514.0, + "step": 5357 + }, + { + "epoch": 0.6367201426024955, + "grad_norm": 0.5144531677253793, + "learning_rate": 1.635361778364652e-05, + "loss": 0.8445, + "num_tokens": 22394477180.0, + "step": 5358 + }, + { + "epoch": 0.6368389780154486, + "grad_norm": 0.45542547339269995, + "learning_rate": 1.635222546260687e-05, + "loss": 0.8597, + "num_tokens": 22398630259.0, + "step": 5359 + }, + { + "epoch": 0.6369578134284016, + "grad_norm": 0.45575195488957815, + "learning_rate": 1.635083294335621e-05, + "loss": 0.8501, + "num_tokens": 22402818987.0, + "step": 5360 + }, + { + "epoch": 0.6370766488413547, + "grad_norm": 0.39120364141782105, + "learning_rate": 1.634944022594612e-05, + "loss": 0.8405, + "num_tokens": 22406984396.0, + "step": 5361 + }, + { + "epoch": 0.6371954842543078, + "grad_norm": 0.4251827630131539, + "learning_rate": 1.6348047310428168e-05, + "loss": 0.8776, + "num_tokens": 22411172853.0, + "step": 5362 + }, + { + "epoch": 0.6373143196672608, + "grad_norm": 0.46036967667446715, + "learning_rate": 1.6346654196853945e-05, + "loss": 0.8566, + "num_tokens": 22415333656.0, + "step": 5363 + }, + { + "epoch": 0.6374331550802139, + "grad_norm": 0.46139407793062287, + "learning_rate": 1.6345260885275036e-05, + "loss": 0.8437, + "num_tokens": 22419468026.0, + "step": 5364 + }, + { + "epoch": 0.6375519904931669, + "grad_norm": 0.5025012776887691, + "learning_rate": 1.6343867375743047e-05, + "loss": 0.8308, + "num_tokens": 22423657787.0, + "step": 5365 + }, + { + "epoch": 0.63767082590612, + "grad_norm": 0.3915688497446017, + "learning_rate": 1.6342473668309582e-05, + "loss": 0.8277, + "num_tokens": 22427818844.0, + "step": 5366 + }, + { + "epoch": 0.6377896613190731, + "grad_norm": 0.41333987034096525, + "learning_rate": 1.634107976302625e-05, + "loss": 0.859, + "num_tokens": 22432007536.0, + "step": 5367 + }, + { + "epoch": 0.6379084967320261, + "grad_norm": 0.4742161203489636, + "learning_rate": 1.6339685659944678e-05, + "loss": 0.8558, + "num_tokens": 22436196650.0, + "step": 5368 + }, + { + "epoch": 0.6380273321449792, + "grad_norm": 0.4774558753829724, + "learning_rate": 1.6338291359116495e-05, + "loss": 0.8569, + "num_tokens": 22440385234.0, + "step": 5369 + }, + { + "epoch": 0.6381461675579323, + "grad_norm": 0.4231845987564853, + "learning_rate": 1.6336896860593327e-05, + "loss": 0.8807, + "num_tokens": 22444574427.0, + "step": 5370 + }, + { + "epoch": 0.6382650029708853, + "grad_norm": 0.40224655178630675, + "learning_rate": 1.6335502164426827e-05, + "loss": 0.8676, + "num_tokens": 22448764508.0, + "step": 5371 + }, + { + "epoch": 0.6383838383838384, + "grad_norm": 0.5110401334483169, + "learning_rate": 1.6334107270668642e-05, + "loss": 0.8815, + "num_tokens": 22452953902.0, + "step": 5372 + }, + { + "epoch": 0.6385026737967915, + "grad_norm": 0.4621223612988321, + "learning_rate": 1.6332712179370428e-05, + "loss": 0.8668, + "num_tokens": 22457143391.0, + "step": 5373 + }, + { + "epoch": 0.6386215092097445, + "grad_norm": 0.38873602358900106, + "learning_rate": 1.6331316890583855e-05, + "loss": 0.8808, + "num_tokens": 22461329731.0, + "step": 5374 + }, + { + "epoch": 0.6387403446226976, + "grad_norm": 0.41962016645123196, + "learning_rate": 1.632992140436059e-05, + "loss": 0.8664, + "num_tokens": 22465494917.0, + "step": 5375 + }, + { + "epoch": 0.6388591800356507, + "grad_norm": 0.4803157305803753, + "learning_rate": 1.6328525720752318e-05, + "loss": 0.8209, + "num_tokens": 22469652141.0, + "step": 5376 + }, + { + "epoch": 0.6389780154486037, + "grad_norm": 0.40182609910658734, + "learning_rate": 1.632712983981072e-05, + "loss": 0.8493, + "num_tokens": 22473842073.0, + "step": 5377 + }, + { + "epoch": 0.6390968508615568, + "grad_norm": 0.4656462441527636, + "learning_rate": 1.632573376158749e-05, + "loss": 0.8365, + "num_tokens": 22478011161.0, + "step": 5378 + }, + { + "epoch": 0.6392156862745098, + "grad_norm": 0.47718023571345614, + "learning_rate": 1.6324337486134333e-05, + "loss": 0.8282, + "num_tokens": 22482200443.0, + "step": 5379 + }, + { + "epoch": 0.6393345216874629, + "grad_norm": 0.3897513040865674, + "learning_rate": 1.632294101350296e-05, + "loss": 0.8855, + "num_tokens": 22486363957.0, + "step": 5380 + }, + { + "epoch": 0.639453357100416, + "grad_norm": 0.5517996327985649, + "learning_rate": 1.6321544343745082e-05, + "loss": 0.8786, + "num_tokens": 22490553287.0, + "step": 5381 + }, + { + "epoch": 0.6395721925133689, + "grad_norm": 0.45920786120558643, + "learning_rate": 1.6320147476912426e-05, + "loss": 0.8695, + "num_tokens": 22494711132.0, + "step": 5382 + }, + { + "epoch": 0.639691027926322, + "grad_norm": 0.4885477771950422, + "learning_rate": 1.6318750413056718e-05, + "loss": 0.8294, + "num_tokens": 22498890318.0, + "step": 5383 + }, + { + "epoch": 0.6398098633392751, + "grad_norm": 0.5131386649280033, + "learning_rate": 1.6317353152229702e-05, + "loss": 0.8494, + "num_tokens": 22503077976.0, + "step": 5384 + }, + { + "epoch": 0.6399286987522281, + "grad_norm": 0.4655691750099321, + "learning_rate": 1.631595569448312e-05, + "loss": 0.8539, + "num_tokens": 22507231497.0, + "step": 5385 + }, + { + "epoch": 0.6400475341651812, + "grad_norm": 0.49662984836992724, + "learning_rate": 1.631455803986872e-05, + "loss": 0.8431, + "num_tokens": 22511419473.0, + "step": 5386 + }, + { + "epoch": 0.6401663695781343, + "grad_norm": 0.41075476141640876, + "learning_rate": 1.6313160188438273e-05, + "loss": 0.8939, + "num_tokens": 22515607761.0, + "step": 5387 + }, + { + "epoch": 0.6402852049910873, + "grad_norm": 0.4970260079101995, + "learning_rate": 1.6311762140243538e-05, + "loss": 0.8307, + "num_tokens": 22519766262.0, + "step": 5388 + }, + { + "epoch": 0.6404040404040404, + "grad_norm": 0.45232651070411184, + "learning_rate": 1.631036389533629e-05, + "loss": 0.8216, + "num_tokens": 22523956479.0, + "step": 5389 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 0.4565291879480511, + "learning_rate": 1.6308965453768316e-05, + "loss": 0.8318, + "num_tokens": 22528133631.0, + "step": 5390 + }, + { + "epoch": 0.6406417112299465, + "grad_norm": 0.5387158016646997, + "learning_rate": 1.63075668155914e-05, + "loss": 0.8415, + "num_tokens": 22532295064.0, + "step": 5391 + }, + { + "epoch": 0.6407605466428996, + "grad_norm": 0.41844536071878163, + "learning_rate": 1.6306167980857336e-05, + "loss": 0.865, + "num_tokens": 22536484243.0, + "step": 5392 + }, + { + "epoch": 0.6408793820558526, + "grad_norm": 0.49026600770598266, + "learning_rate": 1.6304768949617933e-05, + "loss": 0.8187, + "num_tokens": 22540656792.0, + "step": 5393 + }, + { + "epoch": 0.6409982174688057, + "grad_norm": 0.3883285318911305, + "learning_rate": 1.6303369721925e-05, + "loss": 0.8561, + "num_tokens": 22544845557.0, + "step": 5394 + }, + { + "epoch": 0.6411170528817588, + "grad_norm": 0.41701794260790287, + "learning_rate": 1.6301970297830356e-05, + "loss": 0.8637, + "num_tokens": 22549018185.0, + "step": 5395 + }, + { + "epoch": 0.6412358882947118, + "grad_norm": 0.5086006075546502, + "learning_rate": 1.6300570677385823e-05, + "loss": 0.8382, + "num_tokens": 22553207990.0, + "step": 5396 + }, + { + "epoch": 0.6413547237076649, + "grad_norm": 0.37978786852035734, + "learning_rate": 1.6299170860643238e-05, + "loss": 0.88, + "num_tokens": 22557398503.0, + "step": 5397 + }, + { + "epoch": 0.641473559120618, + "grad_norm": 0.4354300782321767, + "learning_rate": 1.629777084765444e-05, + "loss": 0.8791, + "num_tokens": 22561557546.0, + "step": 5398 + }, + { + "epoch": 0.641592394533571, + "grad_norm": 0.4460864912910596, + "learning_rate": 1.6296370638471274e-05, + "loss": 0.8618, + "num_tokens": 22565728478.0, + "step": 5399 + }, + { + "epoch": 0.6417112299465241, + "grad_norm": 0.42013595422884087, + "learning_rate": 1.62949702331456e-05, + "loss": 0.8583, + "num_tokens": 22569918337.0, + "step": 5400 + }, + { + "epoch": 0.6418300653594772, + "grad_norm": 0.41983740964299926, + "learning_rate": 1.629356963172927e-05, + "loss": 0.845, + "num_tokens": 22574102197.0, + "step": 5401 + }, + { + "epoch": 0.6419489007724302, + "grad_norm": 0.44224334653283387, + "learning_rate": 1.6292168834274156e-05, + "loss": 0.8532, + "num_tokens": 22578262934.0, + "step": 5402 + }, + { + "epoch": 0.6420677361853833, + "grad_norm": 0.4753293024198291, + "learning_rate": 1.6290767840832144e-05, + "loss": 0.8308, + "num_tokens": 22582425020.0, + "step": 5403 + }, + { + "epoch": 0.6421865715983363, + "grad_norm": 0.3750022628442083, + "learning_rate": 1.6289366651455107e-05, + "loss": 0.8299, + "num_tokens": 22586613821.0, + "step": 5404 + }, + { + "epoch": 0.6423054070112894, + "grad_norm": 0.4742158020115027, + "learning_rate": 1.6287965266194942e-05, + "loss": 0.8635, + "num_tokens": 22590803198.0, + "step": 5405 + }, + { + "epoch": 0.6424242424242425, + "grad_norm": 0.5220085986801115, + "learning_rate": 1.6286563685103537e-05, + "loss": 0.8423, + "num_tokens": 22594991259.0, + "step": 5406 + }, + { + "epoch": 0.6425430778371954, + "grad_norm": 0.4658107949476847, + "learning_rate": 1.6285161908232814e-05, + "loss": 0.8763, + "num_tokens": 22599181715.0, + "step": 5407 + }, + { + "epoch": 0.6426619132501485, + "grad_norm": 0.46141574408325575, + "learning_rate": 1.6283759935634666e-05, + "loss": 0.8389, + "num_tokens": 22603371158.0, + "step": 5408 + }, + { + "epoch": 0.6427807486631016, + "grad_norm": 0.3866408459782744, + "learning_rate": 1.628235776736103e-05, + "loss": 0.8889, + "num_tokens": 22607559821.0, + "step": 5409 + }, + { + "epoch": 0.6428995840760546, + "grad_norm": 0.5686298391378083, + "learning_rate": 1.628095540346382e-05, + "loss": 0.8581, + "num_tokens": 22611749071.0, + "step": 5410 + }, + { + "epoch": 0.6430184194890077, + "grad_norm": 0.47522601303249645, + "learning_rate": 1.6279552843994983e-05, + "loss": 0.8077, + "num_tokens": 22615899007.0, + "step": 5411 + }, + { + "epoch": 0.6431372549019608, + "grad_norm": 0.42767746418085173, + "learning_rate": 1.6278150089006452e-05, + "loss": 0.8478, + "num_tokens": 22620065837.0, + "step": 5412 + }, + { + "epoch": 0.6432560903149138, + "grad_norm": 0.45420171459708064, + "learning_rate": 1.6276747138550177e-05, + "loss": 0.8438, + "num_tokens": 22624253709.0, + "step": 5413 + }, + { + "epoch": 0.6433749257278669, + "grad_norm": 0.45994035669243954, + "learning_rate": 1.627534399267811e-05, + "loss": 0.8857, + "num_tokens": 22628442523.0, + "step": 5414 + }, + { + "epoch": 0.64349376114082, + "grad_norm": 0.40241891389781864, + "learning_rate": 1.627394065144222e-05, + "loss": 0.8713, + "num_tokens": 22632632550.0, + "step": 5415 + }, + { + "epoch": 0.643612596553773, + "grad_norm": 0.4232099643341697, + "learning_rate": 1.627253711489448e-05, + "loss": 0.879, + "num_tokens": 22636790557.0, + "step": 5416 + }, + { + "epoch": 0.6437314319667261, + "grad_norm": 0.3633725072948837, + "learning_rate": 1.6271133383086864e-05, + "loss": 0.8242, + "num_tokens": 22640968757.0, + "step": 5417 + }, + { + "epoch": 0.6438502673796791, + "grad_norm": 0.40277135077075377, + "learning_rate": 1.6269729456071352e-05, + "loss": 0.8301, + "num_tokens": 22645159527.0, + "step": 5418 + }, + { + "epoch": 0.6439691027926322, + "grad_norm": 0.39507476328844193, + "learning_rate": 1.626832533389994e-05, + "loss": 0.8564, + "num_tokens": 22649349841.0, + "step": 5419 + }, + { + "epoch": 0.6440879382055853, + "grad_norm": 0.4200869936282641, + "learning_rate": 1.6266921016624633e-05, + "loss": 0.8701, + "num_tokens": 22653509290.0, + "step": 5420 + }, + { + "epoch": 0.6442067736185383, + "grad_norm": 0.47446501834470306, + "learning_rate": 1.6265516504297432e-05, + "loss": 0.8402, + "num_tokens": 22657697187.0, + "step": 5421 + }, + { + "epoch": 0.6443256090314914, + "grad_norm": 0.4755363182773405, + "learning_rate": 1.626411179697035e-05, + "loss": 0.8717, + "num_tokens": 22661885275.0, + "step": 5422 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.47423054596304653, + "learning_rate": 1.6262706894695412e-05, + "loss": 0.8275, + "num_tokens": 22666059512.0, + "step": 5423 + }, + { + "epoch": 0.6445632798573975, + "grad_norm": 0.5222289579872373, + "learning_rate": 1.6261301797524643e-05, + "loss": 0.8698, + "num_tokens": 22670221551.0, + "step": 5424 + }, + { + "epoch": 0.6446821152703506, + "grad_norm": 0.4278319686683832, + "learning_rate": 1.625989650551008e-05, + "loss": 0.8823, + "num_tokens": 22674381783.0, + "step": 5425 + }, + { + "epoch": 0.6448009506833037, + "grad_norm": 0.44585458367156794, + "learning_rate": 1.6258491018703764e-05, + "loss": 0.8492, + "num_tokens": 22678571892.0, + "step": 5426 + }, + { + "epoch": 0.6449197860962567, + "grad_norm": 0.3992421691968428, + "learning_rate": 1.6257085337157744e-05, + "loss": 0.8426, + "num_tokens": 22682717044.0, + "step": 5427 + }, + { + "epoch": 0.6450386215092098, + "grad_norm": 0.43389134064711765, + "learning_rate": 1.6255679460924085e-05, + "loss": 0.8381, + "num_tokens": 22686884323.0, + "step": 5428 + }, + { + "epoch": 0.6451574569221628, + "grad_norm": 0.4462326296581198, + "learning_rate": 1.6254273390054844e-05, + "loss": 0.8697, + "num_tokens": 22691072540.0, + "step": 5429 + }, + { + "epoch": 0.6452762923351159, + "grad_norm": 0.41905909935282915, + "learning_rate": 1.625286712460209e-05, + "loss": 0.8369, + "num_tokens": 22695229004.0, + "step": 5430 + }, + { + "epoch": 0.645395127748069, + "grad_norm": 0.442223296919691, + "learning_rate": 1.625146066461791e-05, + "loss": 0.8569, + "num_tokens": 22699394612.0, + "step": 5431 + }, + { + "epoch": 0.6455139631610219, + "grad_norm": 0.45437020698536823, + "learning_rate": 1.6250054010154386e-05, + "loss": 0.8831, + "num_tokens": 22703584013.0, + "step": 5432 + }, + { + "epoch": 0.645632798573975, + "grad_norm": 0.447582779273997, + "learning_rate": 1.624864716126361e-05, + "loss": 0.8507, + "num_tokens": 22707769221.0, + "step": 5433 + }, + { + "epoch": 0.6457516339869281, + "grad_norm": 0.40145575186405597, + "learning_rate": 1.6247240117997685e-05, + "loss": 0.8449, + "num_tokens": 22711931628.0, + "step": 5434 + }, + { + "epoch": 0.6458704693998811, + "grad_norm": 0.3788552244403945, + "learning_rate": 1.6245832880408716e-05, + "loss": 0.8659, + "num_tokens": 22716120859.0, + "step": 5435 + }, + { + "epoch": 0.6459893048128342, + "grad_norm": 0.44468059526387727, + "learning_rate": 1.6244425448548816e-05, + "loss": 0.8417, + "num_tokens": 22720307915.0, + "step": 5436 + }, + { + "epoch": 0.6461081402257873, + "grad_norm": 0.4656668397037806, + "learning_rate": 1.6243017822470113e-05, + "loss": 0.8174, + "num_tokens": 22724496558.0, + "step": 5437 + }, + { + "epoch": 0.6462269756387403, + "grad_norm": 0.5408182986228702, + "learning_rate": 1.6241610002224733e-05, + "loss": 0.8429, + "num_tokens": 22728686247.0, + "step": 5438 + }, + { + "epoch": 0.6463458110516934, + "grad_norm": 0.4088571402839097, + "learning_rate": 1.6240201987864813e-05, + "loss": 0.8139, + "num_tokens": 22732850513.0, + "step": 5439 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 0.43774992370543686, + "learning_rate": 1.623879377944249e-05, + "loss": 0.8344, + "num_tokens": 22737039563.0, + "step": 5440 + }, + { + "epoch": 0.6465834818775995, + "grad_norm": 0.42311514698735114, + "learning_rate": 1.623738537700993e-05, + "loss": 0.8078, + "num_tokens": 22741192357.0, + "step": 5441 + }, + { + "epoch": 0.6467023172905526, + "grad_norm": 0.4920061272770727, + "learning_rate": 1.6235976780619275e-05, + "loss": 0.8356, + "num_tokens": 22745380850.0, + "step": 5442 + }, + { + "epoch": 0.6468211527035056, + "grad_norm": 0.5112374095923249, + "learning_rate": 1.6234567990322697e-05, + "loss": 0.8369, + "num_tokens": 22749569577.0, + "step": 5443 + }, + { + "epoch": 0.6469399881164587, + "grad_norm": 0.4750505279109048, + "learning_rate": 1.6233159006172367e-05, + "loss": 0.8792, + "num_tokens": 22753749928.0, + "step": 5444 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.4373624152544894, + "learning_rate": 1.623174982822047e-05, + "loss": 0.8676, + "num_tokens": 22757938872.0, + "step": 5445 + }, + { + "epoch": 0.6471776589423648, + "grad_norm": 0.4712292014165184, + "learning_rate": 1.6230340456519183e-05, + "loss": 0.871, + "num_tokens": 22762127029.0, + "step": 5446 + }, + { + "epoch": 0.6472964943553179, + "grad_norm": 0.41944335472957117, + "learning_rate": 1.622893089112071e-05, + "loss": 0.8855, + "num_tokens": 22766316010.0, + "step": 5447 + }, + { + "epoch": 0.647415329768271, + "grad_norm": 0.4446421756601019, + "learning_rate": 1.6227521132077245e-05, + "loss": 0.8487, + "num_tokens": 22770499487.0, + "step": 5448 + }, + { + "epoch": 0.647534165181224, + "grad_norm": 0.5772337915430809, + "learning_rate": 1.6226111179440993e-05, + "loss": 0.8389, + "num_tokens": 22774662763.0, + "step": 5449 + }, + { + "epoch": 0.6476530005941771, + "grad_norm": 0.3884446987208896, + "learning_rate": 1.622470103326418e-05, + "loss": 0.857, + "num_tokens": 22778851925.0, + "step": 5450 + }, + { + "epoch": 0.6477718360071302, + "grad_norm": 0.4618482098717204, + "learning_rate": 1.6223290693599018e-05, + "loss": 0.8967, + "num_tokens": 22782996934.0, + "step": 5451 + }, + { + "epoch": 0.6478906714200832, + "grad_norm": 0.4550651175156573, + "learning_rate": 1.6221880160497744e-05, + "loss": 0.8668, + "num_tokens": 22787186163.0, + "step": 5452 + }, + { + "epoch": 0.6480095068330363, + "grad_norm": 0.5227371488118531, + "learning_rate": 1.6220469434012588e-05, + "loss": 0.8739, + "num_tokens": 22791376338.0, + "step": 5453 + }, + { + "epoch": 0.6481283422459893, + "grad_norm": 0.4385976374189597, + "learning_rate": 1.6219058514195803e-05, + "loss": 0.8283, + "num_tokens": 22795564910.0, + "step": 5454 + }, + { + "epoch": 0.6482471776589424, + "grad_norm": 0.5445881695021393, + "learning_rate": 1.6217647401099635e-05, + "loss": 0.8651, + "num_tokens": 22799727623.0, + "step": 5455 + }, + { + "epoch": 0.6483660130718955, + "grad_norm": 0.4681021378672755, + "learning_rate": 1.621623609477634e-05, + "loss": 0.8788, + "num_tokens": 22803917337.0, + "step": 5456 + }, + { + "epoch": 0.6484848484848484, + "grad_norm": 0.36825651404372517, + "learning_rate": 1.6214824595278186e-05, + "loss": 0.8002, + "num_tokens": 22808107094.0, + "step": 5457 + }, + { + "epoch": 0.6486036838978015, + "grad_norm": 0.4859352420091165, + "learning_rate": 1.6213412902657448e-05, + "loss": 0.8325, + "num_tokens": 22812296749.0, + "step": 5458 + }, + { + "epoch": 0.6487225193107546, + "grad_norm": 0.4249366336778836, + "learning_rate": 1.62120010169664e-05, + "loss": 0.8444, + "num_tokens": 22816486279.0, + "step": 5459 + }, + { + "epoch": 0.6488413547237076, + "grad_norm": 0.4749699136135772, + "learning_rate": 1.6210588938257336e-05, + "loss": 0.855, + "num_tokens": 22820660882.0, + "step": 5460 + }, + { + "epoch": 0.6489601901366607, + "grad_norm": 0.47114567903334503, + "learning_rate": 1.6209176666582544e-05, + "loss": 0.8229, + "num_tokens": 22824849987.0, + "step": 5461 + }, + { + "epoch": 0.6490790255496138, + "grad_norm": 0.42100633401424553, + "learning_rate": 1.6207764201994328e-05, + "loss": 0.86, + "num_tokens": 22829039119.0, + "step": 5462 + }, + { + "epoch": 0.6491978609625668, + "grad_norm": 0.39925948720617993, + "learning_rate": 1.6206351544544994e-05, + "loss": 0.8037, + "num_tokens": 22833227908.0, + "step": 5463 + }, + { + "epoch": 0.6493166963755199, + "grad_norm": 0.4369753052329453, + "learning_rate": 1.620493869428686e-05, + "loss": 0.8412, + "num_tokens": 22837417225.0, + "step": 5464 + }, + { + "epoch": 0.649435531788473, + "grad_norm": 0.44839154216949684, + "learning_rate": 1.6203525651272246e-05, + "loss": 0.857, + "num_tokens": 22841593533.0, + "step": 5465 + }, + { + "epoch": 0.649554367201426, + "grad_norm": 0.3870331468470227, + "learning_rate": 1.620211241555349e-05, + "loss": 0.8487, + "num_tokens": 22845757355.0, + "step": 5466 + }, + { + "epoch": 0.6496732026143791, + "grad_norm": 0.42055395300159265, + "learning_rate": 1.6200698987182917e-05, + "loss": 0.8748, + "num_tokens": 22849946732.0, + "step": 5467 + }, + { + "epoch": 0.6497920380273321, + "grad_norm": 0.4976249322734748, + "learning_rate": 1.619928536621288e-05, + "loss": 0.8593, + "num_tokens": 22854133610.0, + "step": 5468 + }, + { + "epoch": 0.6499108734402852, + "grad_norm": 0.4133754716254176, + "learning_rate": 1.6197871552695724e-05, + "loss": 0.8394, + "num_tokens": 22858314076.0, + "step": 5469 + }, + { + "epoch": 0.6500297088532383, + "grad_norm": 0.4616939452399103, + "learning_rate": 1.619645754668381e-05, + "loss": 0.8524, + "num_tokens": 22862504047.0, + "step": 5470 + }, + { + "epoch": 0.6501485442661913, + "grad_norm": 0.4366927956693493, + "learning_rate": 1.6195043348229503e-05, + "loss": 0.8897, + "num_tokens": 22866694146.0, + "step": 5471 + }, + { + "epoch": 0.6502673796791444, + "grad_norm": 0.45297267615977516, + "learning_rate": 1.6193628957385176e-05, + "loss": 0.8247, + "num_tokens": 22870850932.0, + "step": 5472 + }, + { + "epoch": 0.6503862150920975, + "grad_norm": 0.44081526295654355, + "learning_rate": 1.6192214374203214e-05, + "loss": 0.8326, + "num_tokens": 22875040079.0, + "step": 5473 + }, + { + "epoch": 0.6505050505050505, + "grad_norm": 0.44119074069421443, + "learning_rate": 1.619079959873599e-05, + "loss": 0.8683, + "num_tokens": 22879227704.0, + "step": 5474 + }, + { + "epoch": 0.6506238859180036, + "grad_norm": 0.4316086837521458, + "learning_rate": 1.6189384631035908e-05, + "loss": 0.8553, + "num_tokens": 22883417320.0, + "step": 5475 + }, + { + "epoch": 0.6507427213309567, + "grad_norm": 0.4262859412179257, + "learning_rate": 1.6187969471155372e-05, + "loss": 0.8392, + "num_tokens": 22887588995.0, + "step": 5476 + }, + { + "epoch": 0.6508615567439097, + "grad_norm": 0.4167467258136977, + "learning_rate": 1.6186554119146784e-05, + "loss": 0.8508, + "num_tokens": 22891779234.0, + "step": 5477 + }, + { + "epoch": 0.6509803921568628, + "grad_norm": 0.4757451599250853, + "learning_rate": 1.618513857506256e-05, + "loss": 0.8766, + "num_tokens": 22895968960.0, + "step": 5478 + }, + { + "epoch": 0.6510992275698158, + "grad_norm": 0.548338252802339, + "learning_rate": 1.6183722838955122e-05, + "loss": 0.8885, + "num_tokens": 22900131247.0, + "step": 5479 + }, + { + "epoch": 0.6512180629827689, + "grad_norm": 0.40230915404261763, + "learning_rate": 1.6182306910876903e-05, + "loss": 0.8601, + "num_tokens": 22904318437.0, + "step": 5480 + }, + { + "epoch": 0.651336898395722, + "grad_norm": 0.4533424223150634, + "learning_rate": 1.6180890790880333e-05, + "loss": 0.8471, + "num_tokens": 22908505708.0, + "step": 5481 + }, + { + "epoch": 0.6514557338086749, + "grad_norm": 0.4577760852585351, + "learning_rate": 1.6179474479017866e-05, + "loss": 0.8857, + "num_tokens": 22912694029.0, + "step": 5482 + }, + { + "epoch": 0.651574569221628, + "grad_norm": 0.4148981538014189, + "learning_rate": 1.6178057975341945e-05, + "loss": 0.8371, + "num_tokens": 22916884249.0, + "step": 5483 + }, + { + "epoch": 0.6516934046345811, + "grad_norm": 0.5143577891493827, + "learning_rate": 1.6176641279905028e-05, + "loss": 0.8198, + "num_tokens": 22921057711.0, + "step": 5484 + }, + { + "epoch": 0.6518122400475341, + "grad_norm": 0.39060334675889596, + "learning_rate": 1.6175224392759582e-05, + "loss": 0.8782, + "num_tokens": 22925240767.0, + "step": 5485 + }, + { + "epoch": 0.6519310754604872, + "grad_norm": 0.4055351832987001, + "learning_rate": 1.617380731395808e-05, + "loss": 0.8284, + "num_tokens": 22929429545.0, + "step": 5486 + }, + { + "epoch": 0.6520499108734403, + "grad_norm": 0.4083432376510424, + "learning_rate": 1.6172390043552998e-05, + "loss": 0.8585, + "num_tokens": 22933620241.0, + "step": 5487 + }, + { + "epoch": 0.6521687462863933, + "grad_norm": 0.4137798031367514, + "learning_rate": 1.6170972581596828e-05, + "loss": 0.8611, + "num_tokens": 22937809071.0, + "step": 5488 + }, + { + "epoch": 0.6522875816993464, + "grad_norm": 0.41589395675601515, + "learning_rate": 1.6169554928142057e-05, + "loss": 0.8898, + "num_tokens": 22941999036.0, + "step": 5489 + }, + { + "epoch": 0.6524064171122995, + "grad_norm": 0.4508440884249111, + "learning_rate": 1.616813708324119e-05, + "loss": 0.8671, + "num_tokens": 22946176162.0, + "step": 5490 + }, + { + "epoch": 0.6525252525252525, + "grad_norm": 0.4513939794390083, + "learning_rate": 1.616671904694673e-05, + "loss": 0.8383, + "num_tokens": 22950366452.0, + "step": 5491 + }, + { + "epoch": 0.6526440879382056, + "grad_norm": 0.4288613432126881, + "learning_rate": 1.6165300819311198e-05, + "loss": 0.8579, + "num_tokens": 22954525438.0, + "step": 5492 + }, + { + "epoch": 0.6527629233511586, + "grad_norm": 0.39852909310394946, + "learning_rate": 1.616388240038711e-05, + "loss": 0.8589, + "num_tokens": 22958699186.0, + "step": 5493 + }, + { + "epoch": 0.6528817587641117, + "grad_norm": 0.4440674870055223, + "learning_rate": 1.6162463790226996e-05, + "loss": 0.825, + "num_tokens": 22962885342.0, + "step": 5494 + }, + { + "epoch": 0.6530005941770648, + "grad_norm": 0.4126191249375008, + "learning_rate": 1.6161044988883397e-05, + "loss": 0.8725, + "num_tokens": 22967073995.0, + "step": 5495 + }, + { + "epoch": 0.6531194295900178, + "grad_norm": 0.4432242167590437, + "learning_rate": 1.6159625996408853e-05, + "loss": 0.8392, + "num_tokens": 22971263207.0, + "step": 5496 + }, + { + "epoch": 0.6532382650029709, + "grad_norm": 0.5123931394819271, + "learning_rate": 1.6158206812855907e-05, + "loss": 0.8623, + "num_tokens": 22975422617.0, + "step": 5497 + }, + { + "epoch": 0.653357100415924, + "grad_norm": 0.3975876513159514, + "learning_rate": 1.615678743827713e-05, + "loss": 0.8471, + "num_tokens": 22979587702.0, + "step": 5498 + }, + { + "epoch": 0.653475935828877, + "grad_norm": 0.40770622549550956, + "learning_rate": 1.6155367872725074e-05, + "loss": 0.8193, + "num_tokens": 22983776195.0, + "step": 5499 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 0.4636530123126729, + "learning_rate": 1.6153948116252317e-05, + "loss": 0.8566, + "num_tokens": 22987965115.0, + "step": 5500 + }, + { + "epoch": 0.6537136066547832, + "grad_norm": 0.4511677958425915, + "learning_rate": 1.615252816891143e-05, + "loss": 0.8732, + "num_tokens": 22992150459.0, + "step": 5501 + }, + { + "epoch": 0.6538324420677362, + "grad_norm": 0.4278871021578524, + "learning_rate": 1.615110803075501e-05, + "loss": 0.8728, + "num_tokens": 22996338099.0, + "step": 5502 + }, + { + "epoch": 0.6539512774806893, + "grad_norm": 0.38801349668340285, + "learning_rate": 1.614968770183564e-05, + "loss": 0.8557, + "num_tokens": 23000522363.0, + "step": 5503 + }, + { + "epoch": 0.6540701128936423, + "grad_norm": 0.38917657635652364, + "learning_rate": 1.6148267182205927e-05, + "loss": 0.8792, + "num_tokens": 23004679512.0, + "step": 5504 + }, + { + "epoch": 0.6541889483065954, + "grad_norm": 0.4325280465289374, + "learning_rate": 1.614684647191847e-05, + "loss": 0.8536, + "num_tokens": 23008869589.0, + "step": 5505 + }, + { + "epoch": 0.6543077837195485, + "grad_norm": 0.48724782545299383, + "learning_rate": 1.6145425571025887e-05, + "loss": 0.892, + "num_tokens": 23013015010.0, + "step": 5506 + }, + { + "epoch": 0.6544266191325014, + "grad_norm": 0.4506383584058613, + "learning_rate": 1.6144004479580796e-05, + "loss": 0.8551, + "num_tokens": 23017204187.0, + "step": 5507 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 0.5022359233399143, + "learning_rate": 1.614258319763583e-05, + "loss": 0.8521, + "num_tokens": 23021378129.0, + "step": 5508 + }, + { + "epoch": 0.6546642899584076, + "grad_norm": 0.38190517934230495, + "learning_rate": 1.614116172524362e-05, + "loss": 0.8615, + "num_tokens": 23025569072.0, + "step": 5509 + }, + { + "epoch": 0.6547831253713606, + "grad_norm": 0.4543241506074016, + "learning_rate": 1.6139740062456808e-05, + "loss": 0.8499, + "num_tokens": 23029758397.0, + "step": 5510 + }, + { + "epoch": 0.6549019607843137, + "grad_norm": 0.4146284441633611, + "learning_rate": 1.6138318209328042e-05, + "loss": 0.8329, + "num_tokens": 23033947313.0, + "step": 5511 + }, + { + "epoch": 0.6550207961972668, + "grad_norm": 0.4139810809474348, + "learning_rate": 1.613689616590998e-05, + "loss": 0.8419, + "num_tokens": 23038136421.0, + "step": 5512 + }, + { + "epoch": 0.6551396316102198, + "grad_norm": 0.46709974080405353, + "learning_rate": 1.6135473932255288e-05, + "loss": 0.8664, + "num_tokens": 23042326233.0, + "step": 5513 + }, + { + "epoch": 0.6552584670231729, + "grad_norm": 0.4706755787370298, + "learning_rate": 1.613405150841663e-05, + "loss": 0.8483, + "num_tokens": 23046515699.0, + "step": 5514 + }, + { + "epoch": 0.655377302436126, + "grad_norm": 0.5173473690911923, + "learning_rate": 1.613262889444669e-05, + "loss": 0.9063, + "num_tokens": 23050704304.0, + "step": 5515 + }, + { + "epoch": 0.655496137849079, + "grad_norm": 0.3914907485304097, + "learning_rate": 1.613120609039814e-05, + "loss": 0.8426, + "num_tokens": 23054892119.0, + "step": 5516 + }, + { + "epoch": 0.6556149732620321, + "grad_norm": 0.4199424769408155, + "learning_rate": 1.612978309632369e-05, + "loss": 0.866, + "num_tokens": 23059080054.0, + "step": 5517 + }, + { + "epoch": 0.6557338086749851, + "grad_norm": 0.4015223119793881, + "learning_rate": 1.612835991227602e-05, + "loss": 0.8536, + "num_tokens": 23063267979.0, + "step": 5518 + }, + { + "epoch": 0.6558526440879382, + "grad_norm": 0.5130122194070428, + "learning_rate": 1.6126936538307844e-05, + "loss": 0.8802, + "num_tokens": 23067458232.0, + "step": 5519 + }, + { + "epoch": 0.6559714795008913, + "grad_norm": 0.4327537253282635, + "learning_rate": 1.6125512974471876e-05, + "loss": 0.8359, + "num_tokens": 23071648421.0, + "step": 5520 + }, + { + "epoch": 0.6560903149138443, + "grad_norm": 0.3967092403211605, + "learning_rate": 1.6124089220820832e-05, + "loss": 0.8984, + "num_tokens": 23075814616.0, + "step": 5521 + }, + { + "epoch": 0.6562091503267974, + "grad_norm": 0.37699980727556465, + "learning_rate": 1.612266527740744e-05, + "loss": 0.8273, + "num_tokens": 23079972945.0, + "step": 5522 + }, + { + "epoch": 0.6563279857397505, + "grad_norm": 0.40296197778507276, + "learning_rate": 1.612124114428443e-05, + "loss": 0.8326, + "num_tokens": 23084161995.0, + "step": 5523 + }, + { + "epoch": 0.6564468211527035, + "grad_norm": 0.4510055260137553, + "learning_rate": 1.6119816821504545e-05, + "loss": 0.8897, + "num_tokens": 23088318991.0, + "step": 5524 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 0.40351328656850877, + "learning_rate": 1.6118392309120533e-05, + "loss": 0.8696, + "num_tokens": 23092507362.0, + "step": 5525 + }, + { + "epoch": 0.6566844919786097, + "grad_norm": 0.36143420160063444, + "learning_rate": 1.611696760718515e-05, + "loss": 0.8586, + "num_tokens": 23096695828.0, + "step": 5526 + }, + { + "epoch": 0.6568033273915627, + "grad_norm": 0.4975006355116618, + "learning_rate": 1.6115542715751158e-05, + "loss": 0.8475, + "num_tokens": 23100885229.0, + "step": 5527 + }, + { + "epoch": 0.6569221628045158, + "grad_norm": 0.41180824179336245, + "learning_rate": 1.6114117634871318e-05, + "loss": 0.8349, + "num_tokens": 23105043799.0, + "step": 5528 + }, + { + "epoch": 0.6570409982174688, + "grad_norm": 0.437685065508369, + "learning_rate": 1.611269236459841e-05, + "loss": 0.8704, + "num_tokens": 23109232417.0, + "step": 5529 + }, + { + "epoch": 0.6571598336304219, + "grad_norm": 0.4232340267901342, + "learning_rate": 1.6111266904985223e-05, + "loss": 0.8721, + "num_tokens": 23113386549.0, + "step": 5530 + }, + { + "epoch": 0.657278669043375, + "grad_norm": 0.45090200789256196, + "learning_rate": 1.6109841256084536e-05, + "loss": 0.8325, + "num_tokens": 23117576908.0, + "step": 5531 + }, + { + "epoch": 0.6573975044563279, + "grad_norm": 0.4455139774665111, + "learning_rate": 1.6108415417949154e-05, + "loss": 0.8809, + "num_tokens": 23121765287.0, + "step": 5532 + }, + { + "epoch": 0.657516339869281, + "grad_norm": 0.40556593641036726, + "learning_rate": 1.6106989390631876e-05, + "loss": 0.8524, + "num_tokens": 23125942086.0, + "step": 5533 + }, + { + "epoch": 0.6576351752822341, + "grad_norm": 0.41233026668031764, + "learning_rate": 1.6105563174185515e-05, + "loss": 0.8498, + "num_tokens": 23130119985.0, + "step": 5534 + }, + { + "epoch": 0.6577540106951871, + "grad_norm": 0.3700825808001483, + "learning_rate": 1.6104136768662883e-05, + "loss": 0.859, + "num_tokens": 23134309291.0, + "step": 5535 + }, + { + "epoch": 0.6578728461081402, + "grad_norm": 0.41890586653015943, + "learning_rate": 1.6102710174116812e-05, + "loss": 0.8384, + "num_tokens": 23138499727.0, + "step": 5536 + }, + { + "epoch": 0.6579916815210933, + "grad_norm": 0.4911780342633364, + "learning_rate": 1.610128339060013e-05, + "loss": 0.8524, + "num_tokens": 23142688975.0, + "step": 5537 + }, + { + "epoch": 0.6581105169340463, + "grad_norm": 0.4558223268182663, + "learning_rate": 1.609985641816568e-05, + "loss": 0.8572, + "num_tokens": 23146860663.0, + "step": 5538 + }, + { + "epoch": 0.6582293523469994, + "grad_norm": 0.4276036294270825, + "learning_rate": 1.6098429256866296e-05, + "loss": 0.8533, + "num_tokens": 23151050424.0, + "step": 5539 + }, + { + "epoch": 0.6583481877599525, + "grad_norm": 0.44540687828581915, + "learning_rate": 1.6097001906754843e-05, + "loss": 0.8607, + "num_tokens": 23155239372.0, + "step": 5540 + }, + { + "epoch": 0.6584670231729055, + "grad_norm": 0.4134623714442648, + "learning_rate": 1.6095574367884177e-05, + "loss": 0.8189, + "num_tokens": 23159429914.0, + "step": 5541 + }, + { + "epoch": 0.6585858585858586, + "grad_norm": 0.42579911151106226, + "learning_rate": 1.609414664030716e-05, + "loss": 0.8777, + "num_tokens": 23163607990.0, + "step": 5542 + }, + { + "epoch": 0.6587046939988116, + "grad_norm": 0.4236519916181914, + "learning_rate": 1.6092718724076674e-05, + "loss": 0.7946, + "num_tokens": 23167769903.0, + "step": 5543 + }, + { + "epoch": 0.6588235294117647, + "grad_norm": 0.4100087823596464, + "learning_rate": 1.609129061924559e-05, + "loss": 0.8675, + "num_tokens": 23171958334.0, + "step": 5544 + }, + { + "epoch": 0.6589423648247178, + "grad_norm": 0.3997446892243682, + "learning_rate": 1.60898623258668e-05, + "loss": 0.8437, + "num_tokens": 23176147663.0, + "step": 5545 + }, + { + "epoch": 0.6590612002376708, + "grad_norm": 0.4471153633852715, + "learning_rate": 1.6088433843993204e-05, + "loss": 0.8532, + "num_tokens": 23180330104.0, + "step": 5546 + }, + { + "epoch": 0.6591800356506239, + "grad_norm": 0.47406698917911094, + "learning_rate": 1.6087005173677696e-05, + "loss": 0.8734, + "num_tokens": 23184509770.0, + "step": 5547 + }, + { + "epoch": 0.659298871063577, + "grad_norm": 0.39205800984901396, + "learning_rate": 1.6085576314973184e-05, + "loss": 0.8569, + "num_tokens": 23188699716.0, + "step": 5548 + }, + { + "epoch": 0.65941770647653, + "grad_norm": 0.4252998549234597, + "learning_rate": 1.608414726793259e-05, + "loss": 0.8801, + "num_tokens": 23192857509.0, + "step": 5549 + }, + { + "epoch": 0.6595365418894831, + "grad_norm": 0.390811461587091, + "learning_rate": 1.6082718032608834e-05, + "loss": 0.8454, + "num_tokens": 23197026232.0, + "step": 5550 + }, + { + "epoch": 0.6596553773024362, + "grad_norm": 0.4327912489568424, + "learning_rate": 1.608128860905484e-05, + "loss": 0.8621, + "num_tokens": 23201215047.0, + "step": 5551 + }, + { + "epoch": 0.6597742127153892, + "grad_norm": 0.5249814657635558, + "learning_rate": 1.6079858997323548e-05, + "loss": 0.8711, + "num_tokens": 23205403659.0, + "step": 5552 + }, + { + "epoch": 0.6598930481283423, + "grad_norm": 0.4210554370380413, + "learning_rate": 1.6078429197467908e-05, + "loss": 0.8481, + "num_tokens": 23209578108.0, + "step": 5553 + }, + { + "epoch": 0.6600118835412953, + "grad_norm": 0.33972754512977155, + "learning_rate": 1.6076999209540856e-05, + "loss": 0.8524, + "num_tokens": 23213767005.0, + "step": 5554 + }, + { + "epoch": 0.6601307189542484, + "grad_norm": 0.44071846999951725, + "learning_rate": 1.6075569033595367e-05, + "loss": 0.8709, + "num_tokens": 23217934213.0, + "step": 5555 + }, + { + "epoch": 0.6602495543672015, + "grad_norm": 0.42593249781547815, + "learning_rate": 1.6074138669684387e-05, + "loss": 0.8529, + "num_tokens": 23222096956.0, + "step": 5556 + }, + { + "epoch": 0.6603683897801544, + "grad_norm": 0.4139975934565984, + "learning_rate": 1.6072708117860906e-05, + "loss": 0.8803, + "num_tokens": 23226286277.0, + "step": 5557 + }, + { + "epoch": 0.6604872251931075, + "grad_norm": 0.42825220930737873, + "learning_rate": 1.607127737817788e-05, + "loss": 0.8552, + "num_tokens": 23230468156.0, + "step": 5558 + }, + { + "epoch": 0.6606060606060606, + "grad_norm": 0.4468886340588197, + "learning_rate": 1.6069846450688314e-05, + "loss": 0.8618, + "num_tokens": 23234655735.0, + "step": 5559 + }, + { + "epoch": 0.6607248960190136, + "grad_norm": 0.48255338349438937, + "learning_rate": 1.6068415335445187e-05, + "loss": 0.8352, + "num_tokens": 23238843760.0, + "step": 5560 + }, + { + "epoch": 0.6608437314319667, + "grad_norm": 0.4028475371344039, + "learning_rate": 1.606698403250151e-05, + "loss": 0.8384, + "num_tokens": 23243033393.0, + "step": 5561 + }, + { + "epoch": 0.6609625668449198, + "grad_norm": 0.44978950212914465, + "learning_rate": 1.606555254191028e-05, + "loss": 0.8547, + "num_tokens": 23247223182.0, + "step": 5562 + }, + { + "epoch": 0.6610814022578728, + "grad_norm": 0.4139414711144919, + "learning_rate": 1.6064120863724513e-05, + "loss": 0.8705, + "num_tokens": 23251411649.0, + "step": 5563 + }, + { + "epoch": 0.6612002376708259, + "grad_norm": 0.4728837489402758, + "learning_rate": 1.6062688997997226e-05, + "loss": 0.8542, + "num_tokens": 23255601131.0, + "step": 5564 + }, + { + "epoch": 0.661319073083779, + "grad_norm": 0.3930879919611146, + "learning_rate": 1.606125694478145e-05, + "loss": 0.8641, + "num_tokens": 23259756995.0, + "step": 5565 + }, + { + "epoch": 0.661437908496732, + "grad_norm": 0.42720763252548505, + "learning_rate": 1.6059824704130216e-05, + "loss": 0.8537, + "num_tokens": 23263946061.0, + "step": 5566 + }, + { + "epoch": 0.6615567439096851, + "grad_norm": 0.44059719241627787, + "learning_rate": 1.6058392276096562e-05, + "loss": 0.8508, + "num_tokens": 23268134452.0, + "step": 5567 + }, + { + "epoch": 0.6616755793226381, + "grad_norm": 0.4497402430894493, + "learning_rate": 1.6056959660733543e-05, + "loss": 0.851, + "num_tokens": 23272232980.0, + "step": 5568 + }, + { + "epoch": 0.6617944147355912, + "grad_norm": 0.37105160590655684, + "learning_rate": 1.6055526858094212e-05, + "loss": 0.8186, + "num_tokens": 23276423151.0, + "step": 5569 + }, + { + "epoch": 0.6619132501485443, + "grad_norm": 0.5121180965605968, + "learning_rate": 1.6054093868231624e-05, + "loss": 0.8512, + "num_tokens": 23280590305.0, + "step": 5570 + }, + { + "epoch": 0.6620320855614973, + "grad_norm": 0.39928989701301576, + "learning_rate": 1.6052660691198855e-05, + "loss": 0.8532, + "num_tokens": 23284779536.0, + "step": 5571 + }, + { + "epoch": 0.6621509209744504, + "grad_norm": 0.3999408582073041, + "learning_rate": 1.6051227327048976e-05, + "loss": 0.8297, + "num_tokens": 23288967805.0, + "step": 5572 + }, + { + "epoch": 0.6622697563874035, + "grad_norm": 0.4641629558750647, + "learning_rate": 1.604979377583507e-05, + "loss": 0.8608, + "num_tokens": 23293128448.0, + "step": 5573 + }, + { + "epoch": 0.6623885918003565, + "grad_norm": 0.40915393864728594, + "learning_rate": 1.6048360037610226e-05, + "loss": 0.8261, + "num_tokens": 23297299878.0, + "step": 5574 + }, + { + "epoch": 0.6625074272133096, + "grad_norm": 0.5021682666401898, + "learning_rate": 1.6046926112427542e-05, + "loss": 0.8615, + "num_tokens": 23301489668.0, + "step": 5575 + }, + { + "epoch": 0.6626262626262627, + "grad_norm": 0.46068938873918513, + "learning_rate": 1.604549200034012e-05, + "loss": 0.8906, + "num_tokens": 23305657711.0, + "step": 5576 + }, + { + "epoch": 0.6627450980392157, + "grad_norm": 0.36892862658527076, + "learning_rate": 1.604405770140107e-05, + "loss": 0.8485, + "num_tokens": 23309821414.0, + "step": 5577 + }, + { + "epoch": 0.6628639334521688, + "grad_norm": 0.39827630111327056, + "learning_rate": 1.6042623215663507e-05, + "loss": 0.8049, + "num_tokens": 23314010891.0, + "step": 5578 + }, + { + "epoch": 0.6629827688651218, + "grad_norm": 0.5787833736486305, + "learning_rate": 1.604118854318056e-05, + "loss": 0.8368, + "num_tokens": 23318187553.0, + "step": 5579 + }, + { + "epoch": 0.6631016042780749, + "grad_norm": 0.41550345555657103, + "learning_rate": 1.6039753684005353e-05, + "loss": 0.8637, + "num_tokens": 23322318827.0, + "step": 5580 + }, + { + "epoch": 0.663220439691028, + "grad_norm": 0.4190396699294279, + "learning_rate": 1.603831863819103e-05, + "loss": 0.8573, + "num_tokens": 23326508209.0, + "step": 5581 + }, + { + "epoch": 0.6633392751039809, + "grad_norm": 0.5093188166908384, + "learning_rate": 1.603688340579073e-05, + "loss": 0.8587, + "num_tokens": 23330699233.0, + "step": 5582 + }, + { + "epoch": 0.663458110516934, + "grad_norm": 0.4841113442458322, + "learning_rate": 1.603544798685761e-05, + "loss": 0.8629, + "num_tokens": 23334889615.0, + "step": 5583 + }, + { + "epoch": 0.6635769459298871, + "grad_norm": 0.39898949237512005, + "learning_rate": 1.6034012381444827e-05, + "loss": 0.8053, + "num_tokens": 23339064859.0, + "step": 5584 + }, + { + "epoch": 0.6636957813428401, + "grad_norm": 0.4363212737470057, + "learning_rate": 1.603257658960554e-05, + "loss": 0.8856, + "num_tokens": 23343252259.0, + "step": 5585 + }, + { + "epoch": 0.6638146167557932, + "grad_norm": 0.49039177587439636, + "learning_rate": 1.6031140611392934e-05, + "loss": 0.834, + "num_tokens": 23347428711.0, + "step": 5586 + }, + { + "epoch": 0.6639334521687463, + "grad_norm": 0.48806561946601074, + "learning_rate": 1.6029704446860174e-05, + "loss": 0.7936, + "num_tokens": 23351618919.0, + "step": 5587 + }, + { + "epoch": 0.6640522875816993, + "grad_norm": 0.5144808262105713, + "learning_rate": 1.6028268096060454e-05, + "loss": 0.8509, + "num_tokens": 23355807681.0, + "step": 5588 + }, + { + "epoch": 0.6641711229946524, + "grad_norm": 0.47019594065579834, + "learning_rate": 1.6026831559046968e-05, + "loss": 0.861, + "num_tokens": 23359994049.0, + "step": 5589 + }, + { + "epoch": 0.6642899584076055, + "grad_norm": 0.44040837382100073, + "learning_rate": 1.602539483587291e-05, + "loss": 0.8932, + "num_tokens": 23364181827.0, + "step": 5590 + }, + { + "epoch": 0.6644087938205585, + "grad_norm": 0.4793868946912566, + "learning_rate": 1.6023957926591493e-05, + "loss": 0.8087, + "num_tokens": 23368346408.0, + "step": 5591 + }, + { + "epoch": 0.6645276292335116, + "grad_norm": 0.47223049382524257, + "learning_rate": 1.6022520831255928e-05, + "loss": 0.8379, + "num_tokens": 23372535550.0, + "step": 5592 + }, + { + "epoch": 0.6646464646464646, + "grad_norm": 0.4163117581071323, + "learning_rate": 1.602108354991943e-05, + "loss": 0.8707, + "num_tokens": 23376724725.0, + "step": 5593 + }, + { + "epoch": 0.6647653000594177, + "grad_norm": 0.4253313135835659, + "learning_rate": 1.6019646082635234e-05, + "loss": 0.8181, + "num_tokens": 23380885156.0, + "step": 5594 + }, + { + "epoch": 0.6648841354723708, + "grad_norm": 0.4540079749345441, + "learning_rate": 1.6018208429456574e-05, + "loss": 0.8781, + "num_tokens": 23385048990.0, + "step": 5595 + }, + { + "epoch": 0.6650029708853238, + "grad_norm": 0.42203770505428884, + "learning_rate": 1.6016770590436686e-05, + "loss": 0.8556, + "num_tokens": 23389231617.0, + "step": 5596 + }, + { + "epoch": 0.6651218062982769, + "grad_norm": 0.558349271828941, + "learning_rate": 1.6015332565628822e-05, + "loss": 0.8188, + "num_tokens": 23393421552.0, + "step": 5597 + }, + { + "epoch": 0.66524064171123, + "grad_norm": 0.4407912240919123, + "learning_rate": 1.6013894355086236e-05, + "loss": 0.834, + "num_tokens": 23397611843.0, + "step": 5598 + }, + { + "epoch": 0.665359477124183, + "grad_norm": 0.4323991905365238, + "learning_rate": 1.601245595886219e-05, + "loss": 0.842, + "num_tokens": 23401775152.0, + "step": 5599 + }, + { + "epoch": 0.6654783125371361, + "grad_norm": 0.4814589833242647, + "learning_rate": 1.601101737700995e-05, + "loss": 0.8812, + "num_tokens": 23405955351.0, + "step": 5600 + }, + { + "epoch": 0.6655971479500892, + "grad_norm": 0.4112156954734809, + "learning_rate": 1.6009578609582795e-05, + "loss": 0.8492, + "num_tokens": 23410144396.0, + "step": 5601 + }, + { + "epoch": 0.6657159833630422, + "grad_norm": 0.3676417680253344, + "learning_rate": 1.600813965663401e-05, + "loss": 0.8742, + "num_tokens": 23414332359.0, + "step": 5602 + }, + { + "epoch": 0.6658348187759953, + "grad_norm": 0.5136802266701954, + "learning_rate": 1.600670051821688e-05, + "loss": 0.8621, + "num_tokens": 23418522069.0, + "step": 5603 + }, + { + "epoch": 0.6659536541889483, + "grad_norm": 0.4343642233939913, + "learning_rate": 1.6005261194384697e-05, + "loss": 0.8396, + "num_tokens": 23422710956.0, + "step": 5604 + }, + { + "epoch": 0.6660724896019014, + "grad_norm": 0.41959044572719256, + "learning_rate": 1.600382168519077e-05, + "loss": 0.8483, + "num_tokens": 23426899823.0, + "step": 5605 + }, + { + "epoch": 0.6661913250148545, + "grad_norm": 0.5002056609600216, + "learning_rate": 1.600238199068841e-05, + "loss": 0.8421, + "num_tokens": 23431089035.0, + "step": 5606 + }, + { + "epoch": 0.6663101604278074, + "grad_norm": 0.41690896121022186, + "learning_rate": 1.6000942110930927e-05, + "loss": 0.8468, + "num_tokens": 23435278721.0, + "step": 5607 + }, + { + "epoch": 0.6664289958407605, + "grad_norm": 0.4778555953279378, + "learning_rate": 1.5999502045971653e-05, + "loss": 0.854, + "num_tokens": 23439447658.0, + "step": 5608 + }, + { + "epoch": 0.6665478312537136, + "grad_norm": 0.4145735004224206, + "learning_rate": 1.5998061795863916e-05, + "loss": 0.8407, + "num_tokens": 23443638222.0, + "step": 5609 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.37936477174380756, + "learning_rate": 1.5996621360661048e-05, + "loss": 0.8654, + "num_tokens": 23447828162.0, + "step": 5610 + }, + { + "epoch": 0.6667855020796197, + "grad_norm": 0.4199563400756048, + "learning_rate": 1.59951807404164e-05, + "loss": 0.8438, + "num_tokens": 23452017084.0, + "step": 5611 + }, + { + "epoch": 0.6669043374925728, + "grad_norm": 0.407204702500291, + "learning_rate": 1.599373993518332e-05, + "loss": 0.8769, + "num_tokens": 23456204571.0, + "step": 5612 + }, + { + "epoch": 0.6670231729055258, + "grad_norm": 0.40189441484897115, + "learning_rate": 1.5992298945015164e-05, + "loss": 0.843, + "num_tokens": 23460369477.0, + "step": 5613 + }, + { + "epoch": 0.6671420083184789, + "grad_norm": 0.45398130348545845, + "learning_rate": 1.5990857769965297e-05, + "loss": 0.8361, + "num_tokens": 23464557929.0, + "step": 5614 + }, + { + "epoch": 0.667260843731432, + "grad_norm": 0.41583930446545, + "learning_rate": 1.5989416410087095e-05, + "loss": 0.8522, + "num_tokens": 23468748828.0, + "step": 5615 + }, + { + "epoch": 0.667379679144385, + "grad_norm": 0.4623484269057192, + "learning_rate": 1.5987974865433937e-05, + "loss": 0.8873, + "num_tokens": 23472913159.0, + "step": 5616 + }, + { + "epoch": 0.6674985145573381, + "grad_norm": 0.4389725871649992, + "learning_rate": 1.5986533136059198e-05, + "loss": 0.832, + "num_tokens": 23477102063.0, + "step": 5617 + }, + { + "epoch": 0.6676173499702911, + "grad_norm": 0.4090557442458226, + "learning_rate": 1.5985091222016283e-05, + "loss": 0.8298, + "num_tokens": 23481292677.0, + "step": 5618 + }, + { + "epoch": 0.6677361853832442, + "grad_norm": 0.4422074089240785, + "learning_rate": 1.598364912335858e-05, + "loss": 0.8803, + "num_tokens": 23485480127.0, + "step": 5619 + }, + { + "epoch": 0.6678550207961973, + "grad_norm": 0.4095400378191082, + "learning_rate": 1.5982206840139506e-05, + "loss": 0.8764, + "num_tokens": 23489667418.0, + "step": 5620 + }, + { + "epoch": 0.6679738562091503, + "grad_norm": 0.40389876374893, + "learning_rate": 1.598076437241246e-05, + "loss": 0.8496, + "num_tokens": 23493848046.0, + "step": 5621 + }, + { + "epoch": 0.6680926916221034, + "grad_norm": 0.5253361293215657, + "learning_rate": 1.5979321720230874e-05, + "loss": 0.8077, + "num_tokens": 23498036488.0, + "step": 5622 + }, + { + "epoch": 0.6682115270350565, + "grad_norm": 0.4533252197768469, + "learning_rate": 1.597787888364817e-05, + "loss": 0.8393, + "num_tokens": 23502200383.0, + "step": 5623 + }, + { + "epoch": 0.6683303624480095, + "grad_norm": 0.417558551324792, + "learning_rate": 1.5976435862717776e-05, + "loss": 0.8338, + "num_tokens": 23506389701.0, + "step": 5624 + }, + { + "epoch": 0.6684491978609626, + "grad_norm": 0.41387090203158483, + "learning_rate": 1.597499265749314e-05, + "loss": 0.8145, + "num_tokens": 23510569496.0, + "step": 5625 + }, + { + "epoch": 0.6685680332739157, + "grad_norm": 0.45693286551885703, + "learning_rate": 1.5973549268027702e-05, + "loss": 0.8414, + "num_tokens": 23514757953.0, + "step": 5626 + }, + { + "epoch": 0.6686868686868687, + "grad_norm": 0.5203039995733377, + "learning_rate": 1.5972105694374923e-05, + "loss": 0.8481, + "num_tokens": 23518938518.0, + "step": 5627 + }, + { + "epoch": 0.6688057040998218, + "grad_norm": 0.4483485990966089, + "learning_rate": 1.5970661936588253e-05, + "loss": 0.8319, + "num_tokens": 23523114520.0, + "step": 5628 + }, + { + "epoch": 0.6689245395127748, + "grad_norm": 0.40435057928249535, + "learning_rate": 1.596921799472117e-05, + "loss": 0.8396, + "num_tokens": 23527304819.0, + "step": 5629 + }, + { + "epoch": 0.6690433749257279, + "grad_norm": 0.4206387986448419, + "learning_rate": 1.5967773868827145e-05, + "loss": 0.8305, + "num_tokens": 23531478353.0, + "step": 5630 + }, + { + "epoch": 0.669162210338681, + "grad_norm": 0.37251327197185713, + "learning_rate": 1.5966329558959653e-05, + "loss": 0.884, + "num_tokens": 23535666609.0, + "step": 5631 + }, + { + "epoch": 0.6692810457516339, + "grad_norm": 0.5615345440801427, + "learning_rate": 1.5964885065172188e-05, + "loss": 0.8776, + "num_tokens": 23539855906.0, + "step": 5632 + }, + { + "epoch": 0.669399881164587, + "grad_norm": 0.3836828959820469, + "learning_rate": 1.596344038751824e-05, + "loss": 0.8306, + "num_tokens": 23543985995.0, + "step": 5633 + }, + { + "epoch": 0.6695187165775401, + "grad_norm": 0.546538277175052, + "learning_rate": 1.5961995526051317e-05, + "loss": 0.818, + "num_tokens": 23548175736.0, + "step": 5634 + }, + { + "epoch": 0.6696375519904931, + "grad_norm": 0.3860083769818996, + "learning_rate": 1.5960550480824918e-05, + "loss": 0.8489, + "num_tokens": 23552362296.0, + "step": 5635 + }, + { + "epoch": 0.6697563874034462, + "grad_norm": 0.40580251774849574, + "learning_rate": 1.5959105251892563e-05, + "loss": 0.846, + "num_tokens": 23556552762.0, + "step": 5636 + }, + { + "epoch": 0.6698752228163993, + "grad_norm": 0.4663293451300597, + "learning_rate": 1.5957659839307776e-05, + "loss": 0.8498, + "num_tokens": 23560718490.0, + "step": 5637 + }, + { + "epoch": 0.6699940582293523, + "grad_norm": 0.5226204800648163, + "learning_rate": 1.595621424312408e-05, + "loss": 0.8799, + "num_tokens": 23564907203.0, + "step": 5638 + }, + { + "epoch": 0.6701128936423054, + "grad_norm": 0.4032070107522989, + "learning_rate": 1.595476846339502e-05, + "loss": 0.8328, + "num_tokens": 23569078389.0, + "step": 5639 + }, + { + "epoch": 0.6702317290552585, + "grad_norm": 0.41511216024037, + "learning_rate": 1.5953322500174123e-05, + "loss": 0.8484, + "num_tokens": 23573267420.0, + "step": 5640 + }, + { + "epoch": 0.6703505644682115, + "grad_norm": 0.4737998522542681, + "learning_rate": 1.595187635351495e-05, + "loss": 0.834, + "num_tokens": 23577456024.0, + "step": 5641 + }, + { + "epoch": 0.6704693998811646, + "grad_norm": 0.46296136907055147, + "learning_rate": 1.595043002347105e-05, + "loss": 0.8888, + "num_tokens": 23581616509.0, + "step": 5642 + }, + { + "epoch": 0.6705882352941176, + "grad_norm": 0.38218665884016845, + "learning_rate": 1.5948983510095993e-05, + "loss": 0.8319, + "num_tokens": 23585807095.0, + "step": 5643 + }, + { + "epoch": 0.6707070707070707, + "grad_norm": 0.411825994298974, + "learning_rate": 1.5947536813443342e-05, + "loss": 0.8705, + "num_tokens": 23589995603.0, + "step": 5644 + }, + { + "epoch": 0.6708259061200238, + "grad_norm": 0.4266530263147384, + "learning_rate": 1.5946089933566676e-05, + "loss": 0.8604, + "num_tokens": 23594185226.0, + "step": 5645 + }, + { + "epoch": 0.6709447415329768, + "grad_norm": 0.3494596107519197, + "learning_rate": 1.5944642870519576e-05, + "loss": 0.8778, + "num_tokens": 23598373420.0, + "step": 5646 + }, + { + "epoch": 0.6710635769459299, + "grad_norm": 0.42098403344560353, + "learning_rate": 1.594319562435563e-05, + "loss": 0.8373, + "num_tokens": 23602561903.0, + "step": 5647 + }, + { + "epoch": 0.671182412358883, + "grad_norm": 0.5072431806197608, + "learning_rate": 1.5941748195128437e-05, + "loss": 0.8866, + "num_tokens": 23606725178.0, + "step": 5648 + }, + { + "epoch": 0.671301247771836, + "grad_norm": 0.40724295153435924, + "learning_rate": 1.5940300582891605e-05, + "loss": 0.9237, + "num_tokens": 23610880802.0, + "step": 5649 + }, + { + "epoch": 0.6714200831847891, + "grad_norm": 0.4721357776054166, + "learning_rate": 1.593885278769873e-05, + "loss": 0.8423, + "num_tokens": 23615038043.0, + "step": 5650 + }, + { + "epoch": 0.6715389185977422, + "grad_norm": 0.4045120592265934, + "learning_rate": 1.5937404809603444e-05, + "loss": 0.844, + "num_tokens": 23619220611.0, + "step": 5651 + }, + { + "epoch": 0.6716577540106952, + "grad_norm": 0.49243520500164917, + "learning_rate": 1.5935956648659363e-05, + "loss": 0.8401, + "num_tokens": 23623408370.0, + "step": 5652 + }, + { + "epoch": 0.6717765894236483, + "grad_norm": 0.3956898449352601, + "learning_rate": 1.5934508304920118e-05, + "loss": 0.8617, + "num_tokens": 23627597652.0, + "step": 5653 + }, + { + "epoch": 0.6718954248366014, + "grad_norm": 0.4644606111218673, + "learning_rate": 1.5933059778439345e-05, + "loss": 0.8278, + "num_tokens": 23631786296.0, + "step": 5654 + }, + { + "epoch": 0.6720142602495544, + "grad_norm": 0.4675999876278622, + "learning_rate": 1.5931611069270692e-05, + "loss": 0.8293, + "num_tokens": 23635944363.0, + "step": 5655 + }, + { + "epoch": 0.6721330956625075, + "grad_norm": 0.40467428940347006, + "learning_rate": 1.5930162177467803e-05, + "loss": 0.832, + "num_tokens": 23640133054.0, + "step": 5656 + }, + { + "epoch": 0.6722519310754604, + "grad_norm": 0.4432130904282717, + "learning_rate": 1.5928713103084344e-05, + "loss": 0.8507, + "num_tokens": 23644323304.0, + "step": 5657 + }, + { + "epoch": 0.6723707664884135, + "grad_norm": 0.381277926824085, + "learning_rate": 1.5927263846173976e-05, + "loss": 0.891, + "num_tokens": 23648492161.0, + "step": 5658 + }, + { + "epoch": 0.6724896019013666, + "grad_norm": 0.4700570042702472, + "learning_rate": 1.5925814406790362e-05, + "loss": 0.8291, + "num_tokens": 23652679799.0, + "step": 5659 + }, + { + "epoch": 0.6726084373143196, + "grad_norm": 0.4249325722148892, + "learning_rate": 1.592436478498719e-05, + "loss": 0.8541, + "num_tokens": 23656869032.0, + "step": 5660 + }, + { + "epoch": 0.6727272727272727, + "grad_norm": 0.5114620728695547, + "learning_rate": 1.592291498081814e-05, + "loss": 0.8223, + "num_tokens": 23661047593.0, + "step": 5661 + }, + { + "epoch": 0.6728461081402258, + "grad_norm": 0.483596760578355, + "learning_rate": 1.59214649943369e-05, + "loss": 0.8498, + "num_tokens": 23665231219.0, + "step": 5662 + }, + { + "epoch": 0.6729649435531788, + "grad_norm": 0.3912106048795565, + "learning_rate": 1.592001482559717e-05, + "loss": 0.8932, + "num_tokens": 23669420354.0, + "step": 5663 + }, + { + "epoch": 0.6730837789661319, + "grad_norm": 0.39543214826859024, + "learning_rate": 1.5918564474652668e-05, + "loss": 0.8548, + "num_tokens": 23673609031.0, + "step": 5664 + }, + { + "epoch": 0.673202614379085, + "grad_norm": 0.45821567511091393, + "learning_rate": 1.5917113941557084e-05, + "loss": 0.8586, + "num_tokens": 23677798705.0, + "step": 5665 + }, + { + "epoch": 0.673321449792038, + "grad_norm": 0.4212479465134937, + "learning_rate": 1.5915663226364147e-05, + "loss": 0.821, + "num_tokens": 23681985773.0, + "step": 5666 + }, + { + "epoch": 0.6734402852049911, + "grad_norm": 0.3742231471226021, + "learning_rate": 1.591421232912758e-05, + "loss": 0.8425, + "num_tokens": 23686147331.0, + "step": 5667 + }, + { + "epoch": 0.6735591206179441, + "grad_norm": 0.38970370075495253, + "learning_rate": 1.5912761249901114e-05, + "loss": 0.848, + "num_tokens": 23690335671.0, + "step": 5668 + }, + { + "epoch": 0.6736779560308972, + "grad_norm": 0.40775779793806083, + "learning_rate": 1.591130998873849e-05, + "loss": 0.8599, + "num_tokens": 23694496389.0, + "step": 5669 + }, + { + "epoch": 0.6737967914438503, + "grad_norm": 0.4162136057887949, + "learning_rate": 1.590985854569345e-05, + "loss": 0.8219, + "num_tokens": 23698655961.0, + "step": 5670 + }, + { + "epoch": 0.6739156268568033, + "grad_norm": 0.395912364877623, + "learning_rate": 1.590840692081975e-05, + "loss": 0.8619, + "num_tokens": 23702844391.0, + "step": 5671 + }, + { + "epoch": 0.6740344622697564, + "grad_norm": 0.558545016197793, + "learning_rate": 1.5906955114171146e-05, + "loss": 0.8666, + "num_tokens": 23707033605.0, + "step": 5672 + }, + { + "epoch": 0.6741532976827095, + "grad_norm": 0.3448507843926339, + "learning_rate": 1.59055031258014e-05, + "loss": 0.8427, + "num_tokens": 23711222871.0, + "step": 5673 + }, + { + "epoch": 0.6742721330956625, + "grad_norm": 0.5012238583740415, + "learning_rate": 1.5904050955764296e-05, + "loss": 0.9096, + "num_tokens": 23715411999.0, + "step": 5674 + }, + { + "epoch": 0.6743909685086156, + "grad_norm": 0.46765022645989307, + "learning_rate": 1.5902598604113595e-05, + "loss": 0.866, + "num_tokens": 23719600299.0, + "step": 5675 + }, + { + "epoch": 0.6745098039215687, + "grad_norm": 0.4788828717648864, + "learning_rate": 1.5901146070903094e-05, + "loss": 0.8659, + "num_tokens": 23723773014.0, + "step": 5676 + }, + { + "epoch": 0.6746286393345217, + "grad_norm": 0.40426964816220234, + "learning_rate": 1.5899693356186584e-05, + "loss": 0.86, + "num_tokens": 23727963020.0, + "step": 5677 + }, + { + "epoch": 0.6747474747474748, + "grad_norm": 0.38910196862744306, + "learning_rate": 1.589824046001786e-05, + "loss": 0.8705, + "num_tokens": 23732115302.0, + "step": 5678 + }, + { + "epoch": 0.6748663101604279, + "grad_norm": 0.4482583691069593, + "learning_rate": 1.5896787382450732e-05, + "loss": 0.8636, + "num_tokens": 23736290939.0, + "step": 5679 + }, + { + "epoch": 0.6749851455733809, + "grad_norm": 0.4389333468320351, + "learning_rate": 1.5895334123539013e-05, + "loss": 0.8445, + "num_tokens": 23740480282.0, + "step": 5680 + }, + { + "epoch": 0.675103980986334, + "grad_norm": 0.45034748144750386, + "learning_rate": 1.589388068333652e-05, + "loss": 0.8377, + "num_tokens": 23744671711.0, + "step": 5681 + }, + { + "epoch": 0.6752228163992869, + "grad_norm": 0.3907612801057874, + "learning_rate": 1.5892427061897076e-05, + "loss": 0.8405, + "num_tokens": 23748851615.0, + "step": 5682 + }, + { + "epoch": 0.67534165181224, + "grad_norm": 0.5458822983839822, + "learning_rate": 1.589097325927452e-05, + "loss": 0.8543, + "num_tokens": 23753009399.0, + "step": 5683 + }, + { + "epoch": 0.6754604872251931, + "grad_norm": 0.5142865113820118, + "learning_rate": 1.588951927552268e-05, + "loss": 0.8358, + "num_tokens": 23757196989.0, + "step": 5684 + }, + { + "epoch": 0.6755793226381461, + "grad_norm": 0.33624385258867295, + "learning_rate": 1.5888065110695414e-05, + "loss": 0.8387, + "num_tokens": 23761386298.0, + "step": 5685 + }, + { + "epoch": 0.6756981580510992, + "grad_norm": 0.4907707189807257, + "learning_rate": 1.588661076484657e-05, + "loss": 0.8592, + "num_tokens": 23765571668.0, + "step": 5686 + }, + { + "epoch": 0.6758169934640523, + "grad_norm": 0.41584469943931957, + "learning_rate": 1.588515623803001e-05, + "loss": 0.8628, + "num_tokens": 23769762980.0, + "step": 5687 + }, + { + "epoch": 0.6759358288770053, + "grad_norm": 0.44216156193574085, + "learning_rate": 1.588370153029959e-05, + "loss": 0.8551, + "num_tokens": 23773953517.0, + "step": 5688 + }, + { + "epoch": 0.6760546642899584, + "grad_norm": 0.4891279394213469, + "learning_rate": 1.5882246641709197e-05, + "loss": 0.8702, + "num_tokens": 23778108934.0, + "step": 5689 + }, + { + "epoch": 0.6761734997029115, + "grad_norm": 0.3989745344138273, + "learning_rate": 1.5880791572312703e-05, + "loss": 0.8384, + "num_tokens": 23782282834.0, + "step": 5690 + }, + { + "epoch": 0.6762923351158645, + "grad_norm": 0.43228472169534016, + "learning_rate": 1.587933632216399e-05, + "loss": 0.7998, + "num_tokens": 23786470680.0, + "step": 5691 + }, + { + "epoch": 0.6764111705288176, + "grad_norm": 0.45997211594408544, + "learning_rate": 1.5877880891316963e-05, + "loss": 0.8478, + "num_tokens": 23790660990.0, + "step": 5692 + }, + { + "epoch": 0.6765300059417706, + "grad_norm": 0.45116944951901183, + "learning_rate": 1.587642527982551e-05, + "loss": 0.8546, + "num_tokens": 23794846499.0, + "step": 5693 + }, + { + "epoch": 0.6766488413547237, + "grad_norm": 0.496034407581829, + "learning_rate": 1.5874969487743538e-05, + "loss": 0.8467, + "num_tokens": 23799002884.0, + "step": 5694 + }, + { + "epoch": 0.6767676767676768, + "grad_norm": 0.40537088321524595, + "learning_rate": 1.587351351512497e-05, + "loss": 0.8648, + "num_tokens": 23803168269.0, + "step": 5695 + }, + { + "epoch": 0.6768865121806298, + "grad_norm": 0.4388104242980003, + "learning_rate": 1.5872057362023714e-05, + "loss": 0.8318, + "num_tokens": 23807357346.0, + "step": 5696 + }, + { + "epoch": 0.6770053475935829, + "grad_norm": 0.44848151741819287, + "learning_rate": 1.5870601028493703e-05, + "loss": 0.821, + "num_tokens": 23811524746.0, + "step": 5697 + }, + { + "epoch": 0.677124183006536, + "grad_norm": 0.4120819838716074, + "learning_rate": 1.5869144514588864e-05, + "loss": 0.8432, + "num_tokens": 23815699316.0, + "step": 5698 + }, + { + "epoch": 0.677243018419489, + "grad_norm": 0.5182498417101656, + "learning_rate": 1.5867687820363144e-05, + "loss": 0.8399, + "num_tokens": 23819887941.0, + "step": 5699 + }, + { + "epoch": 0.6773618538324421, + "grad_norm": 0.36927694847451326, + "learning_rate": 1.586623094587049e-05, + "loss": 0.856, + "num_tokens": 23824047615.0, + "step": 5700 + }, + { + "epoch": 0.6774806892453952, + "grad_norm": 0.45694329233967296, + "learning_rate": 1.5864773891164845e-05, + "loss": 0.8553, + "num_tokens": 23828237591.0, + "step": 5701 + }, + { + "epoch": 0.6775995246583482, + "grad_norm": 0.4816248309769572, + "learning_rate": 1.5863316656300174e-05, + "loss": 0.8971, + "num_tokens": 23832426276.0, + "step": 5702 + }, + { + "epoch": 0.6777183600713013, + "grad_norm": 0.40353677872805765, + "learning_rate": 1.586185924133045e-05, + "loss": 0.8358, + "num_tokens": 23836608282.0, + "step": 5703 + }, + { + "epoch": 0.6778371954842544, + "grad_norm": 0.5653130536190412, + "learning_rate": 1.5860401646309633e-05, + "loss": 0.8175, + "num_tokens": 23840785524.0, + "step": 5704 + }, + { + "epoch": 0.6779560308972074, + "grad_norm": 0.41118373507726275, + "learning_rate": 1.5858943871291716e-05, + "loss": 0.8552, + "num_tokens": 23844973886.0, + "step": 5705 + }, + { + "epoch": 0.6780748663101605, + "grad_norm": 0.6213833453254706, + "learning_rate": 1.5857485916330675e-05, + "loss": 0.8706, + "num_tokens": 23849163798.0, + "step": 5706 + }, + { + "epoch": 0.6781937017231134, + "grad_norm": 0.44409752726598073, + "learning_rate": 1.5856027781480508e-05, + "loss": 0.8525, + "num_tokens": 23853353516.0, + "step": 5707 + }, + { + "epoch": 0.6783125371360665, + "grad_norm": 0.45243871065601565, + "learning_rate": 1.5854569466795213e-05, + "loss": 0.8969, + "num_tokens": 23857542124.0, + "step": 5708 + }, + { + "epoch": 0.6784313725490196, + "grad_norm": 0.4530704933127105, + "learning_rate": 1.5853110972328798e-05, + "loss": 0.8514, + "num_tokens": 23861701261.0, + "step": 5709 + }, + { + "epoch": 0.6785502079619726, + "grad_norm": 0.510663945394725, + "learning_rate": 1.5851652298135273e-05, + "loss": 0.8407, + "num_tokens": 23865863419.0, + "step": 5710 + }, + { + "epoch": 0.6786690433749257, + "grad_norm": 0.4541932219730557, + "learning_rate": 1.5850193444268663e-05, + "loss": 0.8435, + "num_tokens": 23870018808.0, + "step": 5711 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 0.482062418116654, + "learning_rate": 1.5848734410782986e-05, + "loss": 0.8505, + "num_tokens": 23874207324.0, + "step": 5712 + }, + { + "epoch": 0.6789067142008318, + "grad_norm": 0.5066736936352371, + "learning_rate": 1.5847275197732284e-05, + "loss": 0.841, + "num_tokens": 23878394853.0, + "step": 5713 + }, + { + "epoch": 0.6790255496137849, + "grad_norm": 0.4574422402578605, + "learning_rate": 1.584581580517059e-05, + "loss": 0.8921, + "num_tokens": 23882574818.0, + "step": 5714 + }, + { + "epoch": 0.679144385026738, + "grad_norm": 0.39033644180808436, + "learning_rate": 1.5844356233151955e-05, + "loss": 0.8669, + "num_tokens": 23886763668.0, + "step": 5715 + }, + { + "epoch": 0.679263220439691, + "grad_norm": 0.4432633284529637, + "learning_rate": 1.584289648173043e-05, + "loss": 0.873, + "num_tokens": 23890933926.0, + "step": 5716 + }, + { + "epoch": 0.6793820558526441, + "grad_norm": 0.4974404837212009, + "learning_rate": 1.584143655096007e-05, + "loss": 0.8472, + "num_tokens": 23895122218.0, + "step": 5717 + }, + { + "epoch": 0.6795008912655971, + "grad_norm": 0.4261201468250935, + "learning_rate": 1.583997644089495e-05, + "loss": 0.8236, + "num_tokens": 23899293608.0, + "step": 5718 + }, + { + "epoch": 0.6796197266785502, + "grad_norm": 0.40607106056569614, + "learning_rate": 1.5838516151589134e-05, + "loss": 0.7904, + "num_tokens": 23903483490.0, + "step": 5719 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 0.39011834068125056, + "learning_rate": 1.5837055683096708e-05, + "loss": 0.8575, + "num_tokens": 23907641098.0, + "step": 5720 + }, + { + "epoch": 0.6798573975044563, + "grad_norm": 0.4796034211917314, + "learning_rate": 1.5835595035471752e-05, + "loss": 0.8602, + "num_tokens": 23911819010.0, + "step": 5721 + }, + { + "epoch": 0.6799762329174094, + "grad_norm": 0.4124171571118334, + "learning_rate": 1.5834134208768367e-05, + "loss": 0.8312, + "num_tokens": 23915981085.0, + "step": 5722 + }, + { + "epoch": 0.6800950683303625, + "grad_norm": 0.404878673262479, + "learning_rate": 1.5832673203040646e-05, + "loss": 0.8696, + "num_tokens": 23920169868.0, + "step": 5723 + }, + { + "epoch": 0.6802139037433155, + "grad_norm": 0.4468830643115059, + "learning_rate": 1.5831212018342696e-05, + "loss": 0.863, + "num_tokens": 23924331440.0, + "step": 5724 + }, + { + "epoch": 0.6803327391562686, + "grad_norm": 0.43395770588020177, + "learning_rate": 1.582975065472863e-05, + "loss": 0.875, + "num_tokens": 23928504765.0, + "step": 5725 + }, + { + "epoch": 0.6804515745692217, + "grad_norm": 0.3887154928351824, + "learning_rate": 1.582828911225257e-05, + "loss": 0.9056, + "num_tokens": 23932694410.0, + "step": 5726 + }, + { + "epoch": 0.6805704099821747, + "grad_norm": 0.4287693984800258, + "learning_rate": 1.5826827390968636e-05, + "loss": 0.8734, + "num_tokens": 23936854247.0, + "step": 5727 + }, + { + "epoch": 0.6806892453951278, + "grad_norm": 0.5074637097355836, + "learning_rate": 1.5825365490930966e-05, + "loss": 0.8789, + "num_tokens": 23941041126.0, + "step": 5728 + }, + { + "epoch": 0.6808080808080809, + "grad_norm": 0.392362992583755, + "learning_rate": 1.5823903412193694e-05, + "loss": 0.8563, + "num_tokens": 23945229970.0, + "step": 5729 + }, + { + "epoch": 0.6809269162210339, + "grad_norm": 0.4104334535285678, + "learning_rate": 1.582244115481097e-05, + "loss": 0.8502, + "num_tokens": 23949338902.0, + "step": 5730 + }, + { + "epoch": 0.681045751633987, + "grad_norm": 0.48816186325673666, + "learning_rate": 1.5820978718836946e-05, + "loss": 0.8695, + "num_tokens": 23953526765.0, + "step": 5731 + }, + { + "epoch": 0.6811645870469399, + "grad_norm": 0.3980902919866026, + "learning_rate": 1.581951610432578e-05, + "loss": 0.8398, + "num_tokens": 23957686956.0, + "step": 5732 + }, + { + "epoch": 0.681283422459893, + "grad_norm": 0.39548597308909456, + "learning_rate": 1.5818053311331637e-05, + "loss": 0.8254, + "num_tokens": 23961841535.0, + "step": 5733 + }, + { + "epoch": 0.6814022578728461, + "grad_norm": 0.49243417002036943, + "learning_rate": 1.581659033990869e-05, + "loss": 0.8668, + "num_tokens": 23966002994.0, + "step": 5734 + }, + { + "epoch": 0.6815210932857991, + "grad_norm": 0.7077220685158009, + "learning_rate": 1.5815127190111118e-05, + "loss": 0.8379, + "num_tokens": 23970190679.0, + "step": 5735 + }, + { + "epoch": 0.6816399286987522, + "grad_norm": 0.4524463276764529, + "learning_rate": 1.5813663861993104e-05, + "loss": 0.8571, + "num_tokens": 23974351846.0, + "step": 5736 + }, + { + "epoch": 0.6817587641117053, + "grad_norm": 0.4866839937177841, + "learning_rate": 1.5812200355608844e-05, + "loss": 0.8803, + "num_tokens": 23978541147.0, + "step": 5737 + }, + { + "epoch": 0.6818775995246583, + "grad_norm": 0.3788704797710152, + "learning_rate": 1.581073667101253e-05, + "loss": 0.8376, + "num_tokens": 23982731645.0, + "step": 5738 + }, + { + "epoch": 0.6819964349376114, + "grad_norm": 0.5193483246518414, + "learning_rate": 1.580927280825838e-05, + "loss": 0.8461, + "num_tokens": 23986921635.0, + "step": 5739 + }, + { + "epoch": 0.6821152703505645, + "grad_norm": 0.3683373250621337, + "learning_rate": 1.580780876740059e-05, + "loss": 0.8505, + "num_tokens": 23991082636.0, + "step": 5740 + }, + { + "epoch": 0.6822341057635175, + "grad_norm": 0.3985204889500653, + "learning_rate": 1.5806344548493388e-05, + "loss": 0.8587, + "num_tokens": 23995271410.0, + "step": 5741 + }, + { + "epoch": 0.6823529411764706, + "grad_norm": 0.49048691810202194, + "learning_rate": 1.5804880151590997e-05, + "loss": 0.872, + "num_tokens": 23999428175.0, + "step": 5742 + }, + { + "epoch": 0.6824717765894236, + "grad_norm": 0.38638354577955436, + "learning_rate": 1.5803415576747647e-05, + "loss": 0.8444, + "num_tokens": 24003594671.0, + "step": 5743 + }, + { + "epoch": 0.6825906120023767, + "grad_norm": 0.489833866767782, + "learning_rate": 1.580195082401758e-05, + "loss": 0.8387, + "num_tokens": 24007783918.0, + "step": 5744 + }, + { + "epoch": 0.6827094474153298, + "grad_norm": 0.4249055575726868, + "learning_rate": 1.5800485893455036e-05, + "loss": 0.837, + "num_tokens": 24011949050.0, + "step": 5745 + }, + { + "epoch": 0.6828282828282828, + "grad_norm": 0.3854180081191421, + "learning_rate": 1.5799020785114267e-05, + "loss": 0.8534, + "num_tokens": 24016136123.0, + "step": 5746 + }, + { + "epoch": 0.6829471182412359, + "grad_norm": 0.4893485085235224, + "learning_rate": 1.5797555499049535e-05, + "loss": 0.9039, + "num_tokens": 24020326016.0, + "step": 5747 + }, + { + "epoch": 0.683065953654189, + "grad_norm": 0.4130245886225893, + "learning_rate": 1.5796090035315103e-05, + "loss": 0.83, + "num_tokens": 24024516207.0, + "step": 5748 + }, + { + "epoch": 0.683184789067142, + "grad_norm": 0.5010133984494365, + "learning_rate": 1.579462439396524e-05, + "loss": 0.8716, + "num_tokens": 24028691455.0, + "step": 5749 + }, + { + "epoch": 0.6833036244800951, + "grad_norm": 0.4009694011007976, + "learning_rate": 1.5793158575054224e-05, + "loss": 0.8475, + "num_tokens": 24032880212.0, + "step": 5750 + }, + { + "epoch": 0.6834224598930482, + "grad_norm": 0.47995666921303354, + "learning_rate": 1.579169257863634e-05, + "loss": 0.8599, + "num_tokens": 24037069234.0, + "step": 5751 + }, + { + "epoch": 0.6835412953060012, + "grad_norm": 0.3843829616521931, + "learning_rate": 1.579022640476588e-05, + "loss": 0.8291, + "num_tokens": 24041257853.0, + "step": 5752 + }, + { + "epoch": 0.6836601307189543, + "grad_norm": 0.3872940597452457, + "learning_rate": 1.5788760053497135e-05, + "loss": 0.819, + "num_tokens": 24045446071.0, + "step": 5753 + }, + { + "epoch": 0.6837789661319074, + "grad_norm": 0.3833936536929362, + "learning_rate": 1.578729352488442e-05, + "loss": 0.8373, + "num_tokens": 24049635384.0, + "step": 5754 + }, + { + "epoch": 0.6838978015448604, + "grad_norm": 0.4215574740317289, + "learning_rate": 1.5785826818982037e-05, + "loss": 0.848, + "num_tokens": 24053824987.0, + "step": 5755 + }, + { + "epoch": 0.6840166369578135, + "grad_norm": 2.1944113645635652, + "learning_rate": 1.5784359935844306e-05, + "loss": 0.8283, + "num_tokens": 24058012611.0, + "step": 5756 + }, + { + "epoch": 0.6841354723707664, + "grad_norm": 0.8079143130583228, + "learning_rate": 1.5782892875525552e-05, + "loss": 0.8485, + "num_tokens": 24062203439.0, + "step": 5757 + }, + { + "epoch": 0.6842543077837195, + "grad_norm": 0.5330213266460524, + "learning_rate": 1.57814256380801e-05, + "loss": 0.8547, + "num_tokens": 24066367891.0, + "step": 5758 + }, + { + "epoch": 0.6843731431966726, + "grad_norm": 0.4669536534255881, + "learning_rate": 1.577995822356229e-05, + "loss": 0.8116, + "num_tokens": 24070557993.0, + "step": 5759 + }, + { + "epoch": 0.6844919786096256, + "grad_norm": 0.6657690686320309, + "learning_rate": 1.5778490632026475e-05, + "loss": 0.8335, + "num_tokens": 24074734762.0, + "step": 5760 + }, + { + "epoch": 0.6846108140225787, + "grad_norm": 0.49151534954802883, + "learning_rate": 1.5777022863526984e-05, + "loss": 0.8371, + "num_tokens": 24078900265.0, + "step": 5761 + }, + { + "epoch": 0.6847296494355318, + "grad_norm": 0.7150687943780214, + "learning_rate": 1.577555491811819e-05, + "loss": 0.8855, + "num_tokens": 24083085304.0, + "step": 5762 + }, + { + "epoch": 0.6848484848484848, + "grad_norm": 0.4810121321770499, + "learning_rate": 1.577408679585445e-05, + "loss": 0.8603, + "num_tokens": 24087274377.0, + "step": 5763 + }, + { + "epoch": 0.6849673202614379, + "grad_norm": 0.8460335842213624, + "learning_rate": 1.5772618496790134e-05, + "loss": 0.83, + "num_tokens": 24091453794.0, + "step": 5764 + }, + { + "epoch": 0.685086155674391, + "grad_norm": 0.6791367397689859, + "learning_rate": 1.5771150020979623e-05, + "loss": 0.8698, + "num_tokens": 24095642028.0, + "step": 5765 + }, + { + "epoch": 0.685204991087344, + "grad_norm": 0.7879572410625736, + "learning_rate": 1.576968136847729e-05, + "loss": 0.8174, + "num_tokens": 24099824827.0, + "step": 5766 + }, + { + "epoch": 0.6853238265002971, + "grad_norm": 0.7117236741303314, + "learning_rate": 1.5768212539337528e-05, + "loss": 0.8415, + "num_tokens": 24103983736.0, + "step": 5767 + }, + { + "epoch": 0.6854426619132501, + "grad_norm": 0.6521309279677706, + "learning_rate": 1.5766743533614735e-05, + "loss": 0.8719, + "num_tokens": 24108164247.0, + "step": 5768 + }, + { + "epoch": 0.6855614973262032, + "grad_norm": 0.6213273041903814, + "learning_rate": 1.5765274351363312e-05, + "loss": 0.8551, + "num_tokens": 24112353284.0, + "step": 5769 + }, + { + "epoch": 0.6856803327391563, + "grad_norm": 0.6492788864173004, + "learning_rate": 1.576380499263767e-05, + "loss": 0.8211, + "num_tokens": 24116513698.0, + "step": 5770 + }, + { + "epoch": 0.6857991681521093, + "grad_norm": 0.591516277717336, + "learning_rate": 1.5762335457492224e-05, + "loss": 0.8653, + "num_tokens": 24120677072.0, + "step": 5771 + }, + { + "epoch": 0.6859180035650624, + "grad_norm": 0.6541724059322235, + "learning_rate": 1.576086574598139e-05, + "loss": 0.8962, + "num_tokens": 24124837715.0, + "step": 5772 + }, + { + "epoch": 0.6860368389780155, + "grad_norm": 0.5147258463165796, + "learning_rate": 1.5759395858159602e-05, + "loss": 0.8207, + "num_tokens": 24129027406.0, + "step": 5773 + }, + { + "epoch": 0.6861556743909685, + "grad_norm": 0.6396887938712803, + "learning_rate": 1.5757925794081295e-05, + "loss": 0.816, + "num_tokens": 24133217525.0, + "step": 5774 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.5080363120544016, + "learning_rate": 1.5756455553800906e-05, + "loss": 0.8634, + "num_tokens": 24137386559.0, + "step": 5775 + }, + { + "epoch": 0.6863933452168747, + "grad_norm": 0.7188706017684058, + "learning_rate": 1.575498513737289e-05, + "loss": 0.8278, + "num_tokens": 24141563207.0, + "step": 5776 + }, + { + "epoch": 0.6865121806298277, + "grad_norm": 0.5301469930577669, + "learning_rate": 1.5753514544851693e-05, + "loss": 0.8608, + "num_tokens": 24145741915.0, + "step": 5777 + }, + { + "epoch": 0.6866310160427808, + "grad_norm": 0.6901339621848249, + "learning_rate": 1.5752043776291783e-05, + "loss": 0.8645, + "num_tokens": 24149916682.0, + "step": 5778 + }, + { + "epoch": 0.6867498514557339, + "grad_norm": 0.5973952077091237, + "learning_rate": 1.5750572831747622e-05, + "loss": 0.8641, + "num_tokens": 24154092532.0, + "step": 5779 + }, + { + "epoch": 0.6868686868686869, + "grad_norm": 0.5988713877304701, + "learning_rate": 1.5749101711273692e-05, + "loss": 0.8939, + "num_tokens": 24158248500.0, + "step": 5780 + }, + { + "epoch": 0.68698752228164, + "grad_norm": 0.5413909145057335, + "learning_rate": 1.5747630414924466e-05, + "loss": 0.8286, + "num_tokens": 24162437262.0, + "step": 5781 + }, + { + "epoch": 0.6871063576945929, + "grad_norm": 0.5613219670368521, + "learning_rate": 1.5746158942754433e-05, + "loss": 0.8574, + "num_tokens": 24166627143.0, + "step": 5782 + }, + { + "epoch": 0.687225193107546, + "grad_norm": 0.45540517294138566, + "learning_rate": 1.5744687294818087e-05, + "loss": 0.8615, + "num_tokens": 24170815970.0, + "step": 5783 + }, + { + "epoch": 0.6873440285204991, + "grad_norm": 0.5882601179684228, + "learning_rate": 1.5743215471169927e-05, + "loss": 0.8436, + "num_tokens": 24174964033.0, + "step": 5784 + }, + { + "epoch": 0.6874628639334521, + "grad_norm": 0.4715261665625272, + "learning_rate": 1.5741743471864467e-05, + "loss": 0.8644, + "num_tokens": 24179154028.0, + "step": 5785 + }, + { + "epoch": 0.6875816993464052, + "grad_norm": 0.6014396338033887, + "learning_rate": 1.574027129695621e-05, + "loss": 0.841, + "num_tokens": 24183343965.0, + "step": 5786 + }, + { + "epoch": 0.6877005347593583, + "grad_norm": 0.5066389795783753, + "learning_rate": 1.5738798946499678e-05, + "loss": 0.8212, + "num_tokens": 24187534672.0, + "step": 5787 + }, + { + "epoch": 0.6878193701723113, + "grad_norm": 0.560456132659432, + "learning_rate": 1.57373264205494e-05, + "loss": 0.8397, + "num_tokens": 24191724657.0, + "step": 5788 + }, + { + "epoch": 0.6879382055852644, + "grad_norm": 0.5234622460577549, + "learning_rate": 1.5735853719159908e-05, + "loss": 0.8419, + "num_tokens": 24195856591.0, + "step": 5789 + }, + { + "epoch": 0.6880570409982175, + "grad_norm": 0.5730684138293769, + "learning_rate": 1.573438084238574e-05, + "loss": 0.8518, + "num_tokens": 24200045880.0, + "step": 5790 + }, + { + "epoch": 0.6881758764111705, + "grad_norm": 0.5148612833064165, + "learning_rate": 1.573290779028144e-05, + "loss": 0.8333, + "num_tokens": 24204235198.0, + "step": 5791 + }, + { + "epoch": 0.6882947118241236, + "grad_norm": 0.5558558649333483, + "learning_rate": 1.5731434562901565e-05, + "loss": 0.8629, + "num_tokens": 24208403008.0, + "step": 5792 + }, + { + "epoch": 0.6884135472370766, + "grad_norm": 0.46802921864630015, + "learning_rate": 1.5729961160300667e-05, + "loss": 0.8394, + "num_tokens": 24212579476.0, + "step": 5793 + }, + { + "epoch": 0.6885323826500297, + "grad_norm": 0.5505098098081336, + "learning_rate": 1.572848758253332e-05, + "loss": 0.8441, + "num_tokens": 24216742689.0, + "step": 5794 + }, + { + "epoch": 0.6886512180629828, + "grad_norm": 0.4879532921781894, + "learning_rate": 1.572701382965409e-05, + "loss": 0.8601, + "num_tokens": 24220932141.0, + "step": 5795 + }, + { + "epoch": 0.6887700534759358, + "grad_norm": 0.5271443853497468, + "learning_rate": 1.5725539901717554e-05, + "loss": 0.8335, + "num_tokens": 24225098191.0, + "step": 5796 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.47649439042232644, + "learning_rate": 1.57240657987783e-05, + "loss": 0.8498, + "num_tokens": 24229288104.0, + "step": 5797 + }, + { + "epoch": 0.689007724301842, + "grad_norm": 0.4845870340606779, + "learning_rate": 1.5722591520890913e-05, + "loss": 0.841, + "num_tokens": 24233477640.0, + "step": 5798 + }, + { + "epoch": 0.689126559714795, + "grad_norm": 0.4370547476214781, + "learning_rate": 1.5721117068109998e-05, + "loss": 0.863, + "num_tokens": 24237636458.0, + "step": 5799 + }, + { + "epoch": 0.6892453951277481, + "grad_norm": 0.5416545840442383, + "learning_rate": 1.571964244049015e-05, + "loss": 0.8309, + "num_tokens": 24241824948.0, + "step": 5800 + }, + { + "epoch": 0.6893642305407012, + "grad_norm": 0.4823091261312403, + "learning_rate": 1.571816763808599e-05, + "loss": 0.8305, + "num_tokens": 24246014087.0, + "step": 5801 + }, + { + "epoch": 0.6894830659536542, + "grad_norm": 0.4992132882353329, + "learning_rate": 1.5716692660952128e-05, + "loss": 0.8326, + "num_tokens": 24250202789.0, + "step": 5802 + }, + { + "epoch": 0.6896019013666073, + "grad_norm": 0.45297780562435364, + "learning_rate": 1.571521750914319e-05, + "loss": 0.8699, + "num_tokens": 24254392296.0, + "step": 5803 + }, + { + "epoch": 0.6897207367795604, + "grad_norm": 0.4893209631712941, + "learning_rate": 1.571374218271381e-05, + "loss": 0.8798, + "num_tokens": 24258553456.0, + "step": 5804 + }, + { + "epoch": 0.6898395721925134, + "grad_norm": 0.4811411517898507, + "learning_rate": 1.571226668171861e-05, + "loss": 0.8507, + "num_tokens": 24262721713.0, + "step": 5805 + }, + { + "epoch": 0.6899584076054665, + "grad_norm": 0.4387050805400215, + "learning_rate": 1.5710791006212245e-05, + "loss": 0.834, + "num_tokens": 24266905817.0, + "step": 5806 + }, + { + "epoch": 0.6900772430184194, + "grad_norm": 0.43940931415048495, + "learning_rate": 1.5709315156249366e-05, + "loss": 0.8421, + "num_tokens": 24271093733.0, + "step": 5807 + }, + { + "epoch": 0.6901960784313725, + "grad_norm": 0.4688141687032403, + "learning_rate": 1.570783913188462e-05, + "loss": 0.8922, + "num_tokens": 24275282420.0, + "step": 5808 + }, + { + "epoch": 0.6903149138443256, + "grad_norm": 0.3951942689616631, + "learning_rate": 1.5706362933172676e-05, + "loss": 0.8372, + "num_tokens": 24279468202.0, + "step": 5809 + }, + { + "epoch": 0.6904337492572786, + "grad_norm": 0.45655565349771876, + "learning_rate": 1.57048865601682e-05, + "loss": 0.8621, + "num_tokens": 24283655985.0, + "step": 5810 + }, + { + "epoch": 0.6905525846702317, + "grad_norm": 0.3745334249728676, + "learning_rate": 1.5703410012925865e-05, + "loss": 0.8497, + "num_tokens": 24287845043.0, + "step": 5811 + }, + { + "epoch": 0.6906714200831848, + "grad_norm": 0.4860754727548421, + "learning_rate": 1.5701933291500354e-05, + "loss": 0.8534, + "num_tokens": 24292035178.0, + "step": 5812 + }, + { + "epoch": 0.6907902554961378, + "grad_norm": 0.474483894215122, + "learning_rate": 1.570045639594636e-05, + "loss": 0.8209, + "num_tokens": 24296223532.0, + "step": 5813 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 0.5425904730321016, + "learning_rate": 1.5698979326318568e-05, + "loss": 0.8724, + "num_tokens": 24300411847.0, + "step": 5814 + }, + { + "epoch": 0.691027926322044, + "grad_norm": 0.44433602894836755, + "learning_rate": 1.569750208267169e-05, + "loss": 0.8298, + "num_tokens": 24304601820.0, + "step": 5815 + }, + { + "epoch": 0.691146761734997, + "grad_norm": 0.53826748064342, + "learning_rate": 1.569602466506042e-05, + "loss": 0.8626, + "num_tokens": 24308790712.0, + "step": 5816 + }, + { + "epoch": 0.6912655971479501, + "grad_norm": 0.4620839084536277, + "learning_rate": 1.5694547073539484e-05, + "loss": 0.8199, + "num_tokens": 24312981733.0, + "step": 5817 + }, + { + "epoch": 0.6913844325609031, + "grad_norm": 0.4998731521936376, + "learning_rate": 1.5693069308163592e-05, + "loss": 0.8474, + "num_tokens": 24317171593.0, + "step": 5818 + }, + { + "epoch": 0.6915032679738562, + "grad_norm": 0.4411006453968457, + "learning_rate": 1.569159136898748e-05, + "loss": 0.8707, + "num_tokens": 24321361166.0, + "step": 5819 + }, + { + "epoch": 0.6916221033868093, + "grad_norm": 0.39902811107612285, + "learning_rate": 1.5690113256065876e-05, + "loss": 0.845, + "num_tokens": 24325536001.0, + "step": 5820 + }, + { + "epoch": 0.6917409387997623, + "grad_norm": 0.4374809909158417, + "learning_rate": 1.5688634969453522e-05, + "loss": 0.8433, + "num_tokens": 24329726400.0, + "step": 5821 + }, + { + "epoch": 0.6918597742127154, + "grad_norm": 0.4412234379070581, + "learning_rate": 1.5687156509205163e-05, + "loss": 0.8567, + "num_tokens": 24333891532.0, + "step": 5822 + }, + { + "epoch": 0.6919786096256685, + "grad_norm": 0.4400766042242549, + "learning_rate": 1.568567787537555e-05, + "loss": 0.881, + "num_tokens": 24338080683.0, + "step": 5823 + }, + { + "epoch": 0.6920974450386215, + "grad_norm": 0.4853853843581565, + "learning_rate": 1.568419906801944e-05, + "loss": 0.8149, + "num_tokens": 24342268221.0, + "step": 5824 + }, + { + "epoch": 0.6922162804515746, + "grad_norm": 0.4250126623504466, + "learning_rate": 1.56827200871916e-05, + "loss": 0.862, + "num_tokens": 24346436639.0, + "step": 5825 + }, + { + "epoch": 0.6923351158645277, + "grad_norm": 0.4552391807752764, + "learning_rate": 1.5681240932946806e-05, + "loss": 0.839, + "num_tokens": 24350567261.0, + "step": 5826 + }, + { + "epoch": 0.6924539512774807, + "grad_norm": 0.441161804595721, + "learning_rate": 1.5679761605339833e-05, + "loss": 0.8536, + "num_tokens": 24354757034.0, + "step": 5827 + }, + { + "epoch": 0.6925727866904338, + "grad_norm": 0.41450625809858865, + "learning_rate": 1.5678282104425464e-05, + "loss": 0.8692, + "num_tokens": 24358946652.0, + "step": 5828 + }, + { + "epoch": 0.6926916221033869, + "grad_norm": 0.5528295714812455, + "learning_rate": 1.567680243025849e-05, + "loss": 0.855, + "num_tokens": 24363111058.0, + "step": 5829 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 0.4092910130650422, + "learning_rate": 1.567532258289371e-05, + "loss": 0.8702, + "num_tokens": 24367301620.0, + "step": 5830 + }, + { + "epoch": 0.692929292929293, + "grad_norm": 0.5426864275053116, + "learning_rate": 1.5673842562385927e-05, + "loss": 0.8187, + "num_tokens": 24371492517.0, + "step": 5831 + }, + { + "epoch": 0.6930481283422459, + "grad_norm": 0.4545154208017844, + "learning_rate": 1.567236236878995e-05, + "loss": 0.8204, + "num_tokens": 24375675517.0, + "step": 5832 + }, + { + "epoch": 0.693166963755199, + "grad_norm": 0.45643109675018945, + "learning_rate": 1.5670882002160605e-05, + "loss": 0.8607, + "num_tokens": 24379866475.0, + "step": 5833 + }, + { + "epoch": 0.6932857991681521, + "grad_norm": 0.49061927091520907, + "learning_rate": 1.5669401462552697e-05, + "loss": 0.8571, + "num_tokens": 24384055984.0, + "step": 5834 + }, + { + "epoch": 0.6934046345811051, + "grad_norm": 0.39583558766041776, + "learning_rate": 1.566792075002107e-05, + "loss": 0.8224, + "num_tokens": 24388215265.0, + "step": 5835 + }, + { + "epoch": 0.6935234699940582, + "grad_norm": 0.5036480459478365, + "learning_rate": 1.5666439864620553e-05, + "loss": 0.8343, + "num_tokens": 24392389010.0, + "step": 5836 + }, + { + "epoch": 0.6936423054070113, + "grad_norm": 0.43400695531348055, + "learning_rate": 1.5664958806405986e-05, + "loss": 0.9042, + "num_tokens": 24396577153.0, + "step": 5837 + }, + { + "epoch": 0.6937611408199643, + "grad_norm": 0.5704817842320103, + "learning_rate": 1.566347757543223e-05, + "loss": 0.8719, + "num_tokens": 24400735236.0, + "step": 5838 + }, + { + "epoch": 0.6938799762329174, + "grad_norm": 0.41493052677987835, + "learning_rate": 1.5661996171754127e-05, + "loss": 0.8772, + "num_tokens": 24404919015.0, + "step": 5839 + }, + { + "epoch": 0.6939988116458705, + "grad_norm": 0.48707795075772387, + "learning_rate": 1.5660514595426545e-05, + "loss": 0.8541, + "num_tokens": 24409108398.0, + "step": 5840 + }, + { + "epoch": 0.6941176470588235, + "grad_norm": 0.5285249125110915, + "learning_rate": 1.5659032846504346e-05, + "loss": 0.8533, + "num_tokens": 24413297323.0, + "step": 5841 + }, + { + "epoch": 0.6942364824717766, + "grad_norm": 0.41345771940415116, + "learning_rate": 1.5657550925042416e-05, + "loss": 0.877, + "num_tokens": 24417485528.0, + "step": 5842 + }, + { + "epoch": 0.6943553178847296, + "grad_norm": 0.5103065317994468, + "learning_rate": 1.5656068831095622e-05, + "loss": 0.858, + "num_tokens": 24421667935.0, + "step": 5843 + }, + { + "epoch": 0.6944741532976827, + "grad_norm": 0.4929243959893234, + "learning_rate": 1.5654586564718857e-05, + "loss": 0.8551, + "num_tokens": 24425828485.0, + "step": 5844 + }, + { + "epoch": 0.6945929887106358, + "grad_norm": 0.36021680185693206, + "learning_rate": 1.5653104125967015e-05, + "loss": 0.8678, + "num_tokens": 24430018328.0, + "step": 5845 + }, + { + "epoch": 0.6947118241235888, + "grad_norm": 0.5220454464542813, + "learning_rate": 1.5651621514894996e-05, + "loss": 0.8448, + "num_tokens": 24434208425.0, + "step": 5846 + }, + { + "epoch": 0.6948306595365419, + "grad_norm": 0.4075235617044028, + "learning_rate": 1.5650138731557703e-05, + "loss": 0.8788, + "num_tokens": 24438373269.0, + "step": 5847 + }, + { + "epoch": 0.694949494949495, + "grad_norm": 0.45281546183095744, + "learning_rate": 1.564865577601005e-05, + "loss": 0.8705, + "num_tokens": 24442547621.0, + "step": 5848 + }, + { + "epoch": 0.695068330362448, + "grad_norm": 0.43215462572152236, + "learning_rate": 1.5647172648306958e-05, + "loss": 0.8773, + "num_tokens": 24446718858.0, + "step": 5849 + }, + { + "epoch": 0.6951871657754011, + "grad_norm": 0.5030156682046232, + "learning_rate": 1.564568934850335e-05, + "loss": 0.8625, + "num_tokens": 24450867957.0, + "step": 5850 + }, + { + "epoch": 0.6953060011883542, + "grad_norm": 0.4862401723093462, + "learning_rate": 1.5644205876654156e-05, + "loss": 0.915, + "num_tokens": 24455058189.0, + "step": 5851 + }, + { + "epoch": 0.6954248366013072, + "grad_norm": 0.44839161509288633, + "learning_rate": 1.564272223281432e-05, + "loss": 0.8713, + "num_tokens": 24459214397.0, + "step": 5852 + }, + { + "epoch": 0.6955436720142603, + "grad_norm": 0.4108968348396722, + "learning_rate": 1.5641238417038775e-05, + "loss": 0.8568, + "num_tokens": 24463381890.0, + "step": 5853 + }, + { + "epoch": 0.6956625074272134, + "grad_norm": 0.4780328044460904, + "learning_rate": 1.5639754429382485e-05, + "loss": 0.8569, + "num_tokens": 24467572070.0, + "step": 5854 + }, + { + "epoch": 0.6957813428401664, + "grad_norm": 0.4162280605945242, + "learning_rate": 1.5638270269900398e-05, + "loss": 0.8863, + "num_tokens": 24471746335.0, + "step": 5855 + }, + { + "epoch": 0.6959001782531195, + "grad_norm": 0.5136022677998645, + "learning_rate": 1.5636785938647483e-05, + "loss": 0.873, + "num_tokens": 24475927238.0, + "step": 5856 + }, + { + "epoch": 0.6960190136660724, + "grad_norm": 0.425506314476128, + "learning_rate": 1.5635301435678707e-05, + "loss": 0.858, + "num_tokens": 24480115308.0, + "step": 5857 + }, + { + "epoch": 0.6961378490790255, + "grad_norm": 0.4922841442224809, + "learning_rate": 1.5633816761049047e-05, + "loss": 0.8106, + "num_tokens": 24484303155.0, + "step": 5858 + }, + { + "epoch": 0.6962566844919786, + "grad_norm": 0.41114070299065597, + "learning_rate": 1.5632331914813487e-05, + "loss": 0.8601, + "num_tokens": 24488492198.0, + "step": 5859 + }, + { + "epoch": 0.6963755199049316, + "grad_norm": 0.49662080766552974, + "learning_rate": 1.5630846897027006e-05, + "loss": 0.8886, + "num_tokens": 24492660948.0, + "step": 5860 + }, + { + "epoch": 0.6964943553178847, + "grad_norm": 0.48059270322592457, + "learning_rate": 1.562936170774461e-05, + "loss": 0.8146, + "num_tokens": 24496850357.0, + "step": 5861 + }, + { + "epoch": 0.6966131907308378, + "grad_norm": 0.45355316902967224, + "learning_rate": 1.5627876347021302e-05, + "loss": 0.8666, + "num_tokens": 24501037779.0, + "step": 5862 + }, + { + "epoch": 0.6967320261437908, + "grad_norm": 0.3784721811414314, + "learning_rate": 1.562639081491208e-05, + "loss": 0.8513, + "num_tokens": 24505207777.0, + "step": 5863 + }, + { + "epoch": 0.6968508615567439, + "grad_norm": 0.3946309599251241, + "learning_rate": 1.5624905111471972e-05, + "loss": 0.8511, + "num_tokens": 24509396564.0, + "step": 5864 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 0.3614176437364905, + "learning_rate": 1.5623419236755984e-05, + "loss": 0.8523, + "num_tokens": 24513559350.0, + "step": 5865 + }, + { + "epoch": 0.69708853238265, + "grad_norm": 0.40632643369967814, + "learning_rate": 1.562193319081915e-05, + "loss": 0.81, + "num_tokens": 24517748753.0, + "step": 5866 + }, + { + "epoch": 0.6972073677956031, + "grad_norm": 0.45589944461200377, + "learning_rate": 1.562044697371651e-05, + "loss": 0.8477, + "num_tokens": 24521938355.0, + "step": 5867 + }, + { + "epoch": 0.6973262032085561, + "grad_norm": 0.4421387956084309, + "learning_rate": 1.5618960585503086e-05, + "loss": 0.8623, + "num_tokens": 24526119922.0, + "step": 5868 + }, + { + "epoch": 0.6974450386215092, + "grad_norm": 0.3999596389808628, + "learning_rate": 1.5617474026233938e-05, + "loss": 0.8992, + "num_tokens": 24530279716.0, + "step": 5869 + }, + { + "epoch": 0.6975638740344623, + "grad_norm": 0.4146064540208755, + "learning_rate": 1.5615987295964115e-05, + "loss": 0.881, + "num_tokens": 24534467830.0, + "step": 5870 + }, + { + "epoch": 0.6976827094474153, + "grad_norm": 0.3678213581872252, + "learning_rate": 1.5614500394748678e-05, + "loss": 0.8101, + "num_tokens": 24538640371.0, + "step": 5871 + }, + { + "epoch": 0.6978015448603684, + "grad_norm": 0.49584451703292737, + "learning_rate": 1.5613013322642686e-05, + "loss": 0.8648, + "num_tokens": 24542812181.0, + "step": 5872 + }, + { + "epoch": 0.6979203802733215, + "grad_norm": 0.4245484358922259, + "learning_rate": 1.5611526079701215e-05, + "loss": 0.8247, + "num_tokens": 24547001313.0, + "step": 5873 + }, + { + "epoch": 0.6980392156862745, + "grad_norm": 0.3972495130188799, + "learning_rate": 1.5610038665979345e-05, + "loss": 0.8542, + "num_tokens": 24551191562.0, + "step": 5874 + }, + { + "epoch": 0.6981580510992276, + "grad_norm": 0.39822884820692656, + "learning_rate": 1.5608551081532153e-05, + "loss": 0.7965, + "num_tokens": 24555380838.0, + "step": 5875 + }, + { + "epoch": 0.6982768865121807, + "grad_norm": 0.4847999451438666, + "learning_rate": 1.560706332641473e-05, + "loss": 0.8998, + "num_tokens": 24559568060.0, + "step": 5876 + }, + { + "epoch": 0.6983957219251337, + "grad_norm": 0.40677946795770875, + "learning_rate": 1.5605575400682176e-05, + "loss": 0.8282, + "num_tokens": 24563758821.0, + "step": 5877 + }, + { + "epoch": 0.6985145573380868, + "grad_norm": 0.4400200317247284, + "learning_rate": 1.5604087304389597e-05, + "loss": 0.8466, + "num_tokens": 24567947734.0, + "step": 5878 + }, + { + "epoch": 0.6986333927510399, + "grad_norm": 0.4045304479232648, + "learning_rate": 1.5602599037592097e-05, + "loss": 0.8309, + "num_tokens": 24572137915.0, + "step": 5879 + }, + { + "epoch": 0.6987522281639929, + "grad_norm": 0.494887233100386, + "learning_rate": 1.5601110600344792e-05, + "loss": 0.8745, + "num_tokens": 24576317838.0, + "step": 5880 + }, + { + "epoch": 0.698871063576946, + "grad_norm": 0.44717231982970557, + "learning_rate": 1.5599621992702803e-05, + "loss": 0.822, + "num_tokens": 24580508133.0, + "step": 5881 + }, + { + "epoch": 0.6989898989898989, + "grad_norm": 0.39735571919141527, + "learning_rate": 1.5598133214721262e-05, + "loss": 0.8905, + "num_tokens": 24584694632.0, + "step": 5882 + }, + { + "epoch": 0.699108734402852, + "grad_norm": 0.3713638136177279, + "learning_rate": 1.55966442664553e-05, + "loss": 0.8811, + "num_tokens": 24588883376.0, + "step": 5883 + }, + { + "epoch": 0.6992275698158051, + "grad_norm": 0.4206120679173639, + "learning_rate": 1.5595155147960064e-05, + "loss": 0.8315, + "num_tokens": 24593073299.0, + "step": 5884 + }, + { + "epoch": 0.6993464052287581, + "grad_norm": 0.39689834358364895, + "learning_rate": 1.559366585929069e-05, + "loss": 0.8785, + "num_tokens": 24597262824.0, + "step": 5885 + }, + { + "epoch": 0.6994652406417112, + "grad_norm": 0.5006857970743498, + "learning_rate": 1.5592176400502342e-05, + "loss": 0.8833, + "num_tokens": 24601452171.0, + "step": 5886 + }, + { + "epoch": 0.6995840760546643, + "grad_norm": 0.4060522284087107, + "learning_rate": 1.559068677165017e-05, + "loss": 0.8738, + "num_tokens": 24605595166.0, + "step": 5887 + }, + { + "epoch": 0.6997029114676173, + "grad_norm": 0.40044759436893085, + "learning_rate": 1.5589196972789353e-05, + "loss": 0.872, + "num_tokens": 24609783835.0, + "step": 5888 + }, + { + "epoch": 0.6998217468805704, + "grad_norm": 0.4232582843111568, + "learning_rate": 1.5587707003975047e-05, + "loss": 0.838, + "num_tokens": 24613956291.0, + "step": 5889 + }, + { + "epoch": 0.6999405822935235, + "grad_norm": 0.38157154050571895, + "learning_rate": 1.558621686526245e-05, + "loss": 0.8348, + "num_tokens": 24618132006.0, + "step": 5890 + }, + { + "epoch": 0.7000594177064765, + "grad_norm": 0.3837534880611039, + "learning_rate": 1.5584726556706727e-05, + "loss": 0.8261, + "num_tokens": 24622319899.0, + "step": 5891 + }, + { + "epoch": 0.7001782531194296, + "grad_norm": 0.45756799338141696, + "learning_rate": 1.558323607836308e-05, + "loss": 0.8496, + "num_tokens": 24626494409.0, + "step": 5892 + }, + { + "epoch": 0.7002970885323827, + "grad_norm": 0.4531001708986816, + "learning_rate": 1.5581745430286706e-05, + "loss": 0.8494, + "num_tokens": 24630645645.0, + "step": 5893 + }, + { + "epoch": 0.7004159239453357, + "grad_norm": 0.41020144006449, + "learning_rate": 1.558025461253281e-05, + "loss": 0.8778, + "num_tokens": 24634834215.0, + "step": 5894 + }, + { + "epoch": 0.7005347593582888, + "grad_norm": 0.389789678982472, + "learning_rate": 1.5578763625156596e-05, + "loss": 0.8706, + "num_tokens": 24638998144.0, + "step": 5895 + }, + { + "epoch": 0.7006535947712418, + "grad_norm": 0.4131042784190885, + "learning_rate": 1.557727246821328e-05, + "loss": 0.8431, + "num_tokens": 24643156759.0, + "step": 5896 + }, + { + "epoch": 0.7007724301841949, + "grad_norm": 0.4014532359260821, + "learning_rate": 1.5575781141758098e-05, + "loss": 0.8512, + "num_tokens": 24647346405.0, + "step": 5897 + }, + { + "epoch": 0.700891265597148, + "grad_norm": 0.5322458336919281, + "learning_rate": 1.5574289645846262e-05, + "loss": 0.8401, + "num_tokens": 24651534925.0, + "step": 5898 + }, + { + "epoch": 0.701010101010101, + "grad_norm": 0.39603470831737514, + "learning_rate": 1.5572797980533017e-05, + "loss": 0.8659, + "num_tokens": 24655724835.0, + "step": 5899 + }, + { + "epoch": 0.7011289364230541, + "grad_norm": 0.48324752899420687, + "learning_rate": 1.5571306145873603e-05, + "loss": 0.8154, + "num_tokens": 24659914341.0, + "step": 5900 + }, + { + "epoch": 0.7012477718360072, + "grad_norm": 0.4708936188124442, + "learning_rate": 1.5569814141923263e-05, + "loss": 0.8481, + "num_tokens": 24664076418.0, + "step": 5901 + }, + { + "epoch": 0.7013666072489602, + "grad_norm": 0.3708156561855638, + "learning_rate": 1.5568321968737256e-05, + "loss": 0.8571, + "num_tokens": 24668264117.0, + "step": 5902 + }, + { + "epoch": 0.7014854426619133, + "grad_norm": 0.4478731932673351, + "learning_rate": 1.556682962637084e-05, + "loss": 0.8112, + "num_tokens": 24672439936.0, + "step": 5903 + }, + { + "epoch": 0.7016042780748664, + "grad_norm": 0.43255730681643806, + "learning_rate": 1.5565337114879283e-05, + "loss": 0.8656, + "num_tokens": 24676620379.0, + "step": 5904 + }, + { + "epoch": 0.7017231134878193, + "grad_norm": 0.4191740917527707, + "learning_rate": 1.556384443431786e-05, + "loss": 0.8432, + "num_tokens": 24680809604.0, + "step": 5905 + }, + { + "epoch": 0.7018419489007724, + "grad_norm": 0.4220164433531109, + "learning_rate": 1.5562351584741844e-05, + "loss": 0.8429, + "num_tokens": 24684997828.0, + "step": 5906 + }, + { + "epoch": 0.7019607843137254, + "grad_norm": 0.3994999933793254, + "learning_rate": 1.556085856620652e-05, + "loss": 0.8636, + "num_tokens": 24689187375.0, + "step": 5907 + }, + { + "epoch": 0.7020796197266785, + "grad_norm": 0.4416111884790555, + "learning_rate": 1.5559365378767187e-05, + "loss": 0.8421, + "num_tokens": 24693376601.0, + "step": 5908 + }, + { + "epoch": 0.7021984551396316, + "grad_norm": 0.45142363790028617, + "learning_rate": 1.5557872022479135e-05, + "loss": 0.7953, + "num_tokens": 24697565723.0, + "step": 5909 + }, + { + "epoch": 0.7023172905525846, + "grad_norm": 0.39715143914633194, + "learning_rate": 1.5556378497397678e-05, + "loss": 0.8446, + "num_tokens": 24701753369.0, + "step": 5910 + }, + { + "epoch": 0.7024361259655377, + "grad_norm": 0.43014814536927914, + "learning_rate": 1.5554884803578113e-05, + "loss": 0.8687, + "num_tokens": 24705940758.0, + "step": 5911 + }, + { + "epoch": 0.7025549613784908, + "grad_norm": 0.519155935212411, + "learning_rate": 1.5553390941075765e-05, + "loss": 0.8319, + "num_tokens": 24710129236.0, + "step": 5912 + }, + { + "epoch": 0.7026737967914438, + "grad_norm": 0.4080629725880991, + "learning_rate": 1.5551896909945955e-05, + "loss": 0.8311, + "num_tokens": 24714305907.0, + "step": 5913 + }, + { + "epoch": 0.7027926322043969, + "grad_norm": 0.4830855840773753, + "learning_rate": 1.5550402710244016e-05, + "loss": 0.8383, + "num_tokens": 24718468548.0, + "step": 5914 + }, + { + "epoch": 0.70291146761735, + "grad_norm": 0.4063779665483079, + "learning_rate": 1.5548908342025275e-05, + "loss": 0.8293, + "num_tokens": 24722622449.0, + "step": 5915 + }, + { + "epoch": 0.703030303030303, + "grad_norm": 0.439276767002327, + "learning_rate": 1.554741380534508e-05, + "loss": 0.8213, + "num_tokens": 24726812177.0, + "step": 5916 + }, + { + "epoch": 0.7031491384432561, + "grad_norm": 0.4015842049919585, + "learning_rate": 1.5545919100258772e-05, + "loss": 0.8413, + "num_tokens": 24730991472.0, + "step": 5917 + }, + { + "epoch": 0.7032679738562092, + "grad_norm": 0.46318056596276147, + "learning_rate": 1.5544424226821714e-05, + "loss": 0.85, + "num_tokens": 24735183394.0, + "step": 5918 + }, + { + "epoch": 0.7033868092691622, + "grad_norm": 0.43774785423259044, + "learning_rate": 1.5542929185089265e-05, + "loss": 0.8709, + "num_tokens": 24739371682.0, + "step": 5919 + }, + { + "epoch": 0.7035056446821153, + "grad_norm": 0.4576282591040294, + "learning_rate": 1.554143397511678e-05, + "loss": 0.817, + "num_tokens": 24743560627.0, + "step": 5920 + }, + { + "epoch": 0.7036244800950683, + "grad_norm": 0.4223388531015703, + "learning_rate": 1.5539938596959643e-05, + "loss": 0.8075, + "num_tokens": 24747751912.0, + "step": 5921 + }, + { + "epoch": 0.7037433155080214, + "grad_norm": 0.4120057677644367, + "learning_rate": 1.5538443050673236e-05, + "loss": 0.8745, + "num_tokens": 24751942122.0, + "step": 5922 + }, + { + "epoch": 0.7038621509209745, + "grad_norm": 0.4797003973969608, + "learning_rate": 1.553694733631293e-05, + "loss": 0.8328, + "num_tokens": 24756130275.0, + "step": 5923 + }, + { + "epoch": 0.7039809863339275, + "grad_norm": 0.3275801760335419, + "learning_rate": 1.553545145393413e-05, + "loss": 0.8404, + "num_tokens": 24760315919.0, + "step": 5924 + }, + { + "epoch": 0.7040998217468806, + "grad_norm": 0.448768263645779, + "learning_rate": 1.5533955403592223e-05, + "loss": 0.8234, + "num_tokens": 24764505395.0, + "step": 5925 + }, + { + "epoch": 0.7042186571598337, + "grad_norm": 0.32512389352046567, + "learning_rate": 1.5532459185342622e-05, + "loss": 0.8163, + "num_tokens": 24768674710.0, + "step": 5926 + }, + { + "epoch": 0.7043374925727867, + "grad_norm": 0.4451506328563834, + "learning_rate": 1.5530962799240733e-05, + "loss": 0.8291, + "num_tokens": 24772847405.0, + "step": 5927 + }, + { + "epoch": 0.7044563279857398, + "grad_norm": 0.4559901685820583, + "learning_rate": 1.5529466245341966e-05, + "loss": 0.85, + "num_tokens": 24777038273.0, + "step": 5928 + }, + { + "epoch": 0.7045751633986929, + "grad_norm": 0.4568699903083013, + "learning_rate": 1.5527969523701755e-05, + "loss": 0.7837, + "num_tokens": 24781227621.0, + "step": 5929 + }, + { + "epoch": 0.7046939988116458, + "grad_norm": 0.3725397813927379, + "learning_rate": 1.552647263437552e-05, + "loss": 0.8819, + "num_tokens": 24785388713.0, + "step": 5930 + }, + { + "epoch": 0.704812834224599, + "grad_norm": 0.47939255294155664, + "learning_rate": 1.55249755774187e-05, + "loss": 0.7974, + "num_tokens": 24789548778.0, + "step": 5931 + }, + { + "epoch": 0.7049316696375519, + "grad_norm": 0.43418908305113946, + "learning_rate": 1.5523478352886738e-05, + "loss": 0.8272, + "num_tokens": 24793717659.0, + "step": 5932 + }, + { + "epoch": 0.705050505050505, + "grad_norm": 0.4794412437851157, + "learning_rate": 1.5521980960835073e-05, + "loss": 0.8481, + "num_tokens": 24797907336.0, + "step": 5933 + }, + { + "epoch": 0.7051693404634581, + "grad_norm": 0.4129741237962291, + "learning_rate": 1.5520483401319167e-05, + "loss": 0.8388, + "num_tokens": 24802096415.0, + "step": 5934 + }, + { + "epoch": 0.7052881758764111, + "grad_norm": 0.39076278263588826, + "learning_rate": 1.551898567439448e-05, + "loss": 0.8722, + "num_tokens": 24806284785.0, + "step": 5935 + }, + { + "epoch": 0.7054070112893642, + "grad_norm": 0.4151715026646246, + "learning_rate": 1.5517487780116466e-05, + "loss": 0.8723, + "num_tokens": 24810460682.0, + "step": 5936 + }, + { + "epoch": 0.7055258467023173, + "grad_norm": 0.365650797916002, + "learning_rate": 1.5515989718540612e-05, + "loss": 0.8323, + "num_tokens": 24814649327.0, + "step": 5937 + }, + { + "epoch": 0.7056446821152703, + "grad_norm": 0.4537109646139703, + "learning_rate": 1.551449148972239e-05, + "loss": 0.8539, + "num_tokens": 24818821289.0, + "step": 5938 + }, + { + "epoch": 0.7057635175282234, + "grad_norm": 0.3986025039104856, + "learning_rate": 1.551299309371728e-05, + "loss": 0.7998, + "num_tokens": 24823012406.0, + "step": 5939 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.4151133719897864, + "learning_rate": 1.551149453058078e-05, + "loss": 0.861, + "num_tokens": 24827200228.0, + "step": 5940 + }, + { + "epoch": 0.7060011883541295, + "grad_norm": 0.33460845347614016, + "learning_rate": 1.5509995800368386e-05, + "loss": 0.8557, + "num_tokens": 24831388695.0, + "step": 5941 + }, + { + "epoch": 0.7061200237670826, + "grad_norm": 0.40226161212240374, + "learning_rate": 1.5508496903135593e-05, + "loss": 0.8335, + "num_tokens": 24835555328.0, + "step": 5942 + }, + { + "epoch": 0.7062388591800357, + "grad_norm": 0.42399069808443524, + "learning_rate": 1.550699783893792e-05, + "loss": 0.878, + "num_tokens": 24839733654.0, + "step": 5943 + }, + { + "epoch": 0.7063576945929887, + "grad_norm": 0.3903644324703883, + "learning_rate": 1.5505498607830877e-05, + "loss": 0.8255, + "num_tokens": 24843895305.0, + "step": 5944 + }, + { + "epoch": 0.7064765300059418, + "grad_norm": 0.379137524368056, + "learning_rate": 1.550399920986999e-05, + "loss": 0.8566, + "num_tokens": 24848084601.0, + "step": 5945 + }, + { + "epoch": 0.7065953654188948, + "grad_norm": 0.4665501173953848, + "learning_rate": 1.550249964511078e-05, + "loss": 0.8429, + "num_tokens": 24852274117.0, + "step": 5946 + }, + { + "epoch": 0.7067142008318479, + "grad_norm": 0.40570921766771695, + "learning_rate": 1.550099991360878e-05, + "loss": 0.8872, + "num_tokens": 24856462417.0, + "step": 5947 + }, + { + "epoch": 0.706833036244801, + "grad_norm": 0.4301358184140167, + "learning_rate": 1.549950001541955e-05, + "loss": 0.8612, + "num_tokens": 24860651282.0, + "step": 5948 + }, + { + "epoch": 0.706951871657754, + "grad_norm": 0.4833270474410124, + "learning_rate": 1.549799995059861e-05, + "loss": 0.8833, + "num_tokens": 24864839501.0, + "step": 5949 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.4171371347258462, + "learning_rate": 1.5496499719201527e-05, + "loss": 0.8554, + "num_tokens": 24869016839.0, + "step": 5950 + }, + { + "epoch": 0.7071895424836602, + "grad_norm": 0.4730980069487751, + "learning_rate": 1.5494999321283853e-05, + "loss": 0.8327, + "num_tokens": 24873206971.0, + "step": 5951 + }, + { + "epoch": 0.7073083778966132, + "grad_norm": 0.4108986550658088, + "learning_rate": 1.549349875690116e-05, + "loss": 0.8825, + "num_tokens": 24877396153.0, + "step": 5952 + }, + { + "epoch": 0.7074272133095663, + "grad_norm": 0.4068361201561672, + "learning_rate": 1.5491998026109013e-05, + "loss": 0.8835, + "num_tokens": 24881572958.0, + "step": 5953 + }, + { + "epoch": 0.7075460487225194, + "grad_norm": 0.4317745386862713, + "learning_rate": 1.549049712896299e-05, + "loss": 0.8877, + "num_tokens": 24885750266.0, + "step": 5954 + }, + { + "epoch": 0.7076648841354723, + "grad_norm": 0.48490587941970803, + "learning_rate": 1.548899606551868e-05, + "loss": 0.8088, + "num_tokens": 24889940310.0, + "step": 5955 + }, + { + "epoch": 0.7077837195484254, + "grad_norm": 0.46061359731516305, + "learning_rate": 1.5487494835831666e-05, + "loss": 0.8694, + "num_tokens": 24894128726.0, + "step": 5956 + }, + { + "epoch": 0.7079025549613784, + "grad_norm": 0.47103359930622063, + "learning_rate": 1.5485993439957544e-05, + "loss": 0.8569, + "num_tokens": 24898317103.0, + "step": 5957 + }, + { + "epoch": 0.7080213903743315, + "grad_norm": 0.3443959593258592, + "learning_rate": 1.548449187795192e-05, + "loss": 0.862, + "num_tokens": 24902506556.0, + "step": 5958 + }, + { + "epoch": 0.7081402257872846, + "grad_norm": 0.44562226664778015, + "learning_rate": 1.5482990149870398e-05, + "loss": 0.8467, + "num_tokens": 24906673880.0, + "step": 5959 + }, + { + "epoch": 0.7082590612002376, + "grad_norm": 0.4051915530511827, + "learning_rate": 1.5481488255768593e-05, + "loss": 0.8339, + "num_tokens": 24910858998.0, + "step": 5960 + }, + { + "epoch": 0.7083778966131907, + "grad_norm": 0.4948738542097837, + "learning_rate": 1.5479986195702127e-05, + "loss": 0.8209, + "num_tokens": 24915047776.0, + "step": 5961 + }, + { + "epoch": 0.7084967320261438, + "grad_norm": 0.4186670993437878, + "learning_rate": 1.5478483969726623e-05, + "loss": 0.8182, + "num_tokens": 24919237598.0, + "step": 5962 + }, + { + "epoch": 0.7086155674390968, + "grad_norm": 0.4942129561605208, + "learning_rate": 1.5476981577897717e-05, + "loss": 0.8672, + "num_tokens": 24923425956.0, + "step": 5963 + }, + { + "epoch": 0.7087344028520499, + "grad_norm": 0.36434785339080944, + "learning_rate": 1.5475479020271047e-05, + "loss": 0.8514, + "num_tokens": 24927616968.0, + "step": 5964 + }, + { + "epoch": 0.708853238265003, + "grad_norm": 0.5225894687982676, + "learning_rate": 1.5473976296902252e-05, + "loss": 0.8681, + "num_tokens": 24931806550.0, + "step": 5965 + }, + { + "epoch": 0.708972073677956, + "grad_norm": 0.5104986301863136, + "learning_rate": 1.5472473407846995e-05, + "loss": 0.8487, + "num_tokens": 24935997980.0, + "step": 5966 + }, + { + "epoch": 0.7090909090909091, + "grad_norm": 0.3783648573442322, + "learning_rate": 1.5470970353160924e-05, + "loss": 0.8498, + "num_tokens": 24940157005.0, + "step": 5967 + }, + { + "epoch": 0.7092097445038622, + "grad_norm": 0.4868205791643289, + "learning_rate": 1.54694671328997e-05, + "loss": 0.8596, + "num_tokens": 24944345386.0, + "step": 5968 + }, + { + "epoch": 0.7093285799168152, + "grad_norm": 0.4108109445041535, + "learning_rate": 1.5467963747118998e-05, + "loss": 0.823, + "num_tokens": 24948536341.0, + "step": 5969 + }, + { + "epoch": 0.7094474153297683, + "grad_norm": 0.4321888928211275, + "learning_rate": 1.5466460195874494e-05, + "loss": 0.8235, + "num_tokens": 24952725604.0, + "step": 5970 + }, + { + "epoch": 0.7095662507427213, + "grad_norm": 0.3840349887884872, + "learning_rate": 1.5464956479221868e-05, + "loss": 0.8782, + "num_tokens": 24956912869.0, + "step": 5971 + }, + { + "epoch": 0.7096850861556744, + "grad_norm": 0.41424988309560634, + "learning_rate": 1.5463452597216806e-05, + "loss": 0.8521, + "num_tokens": 24961092100.0, + "step": 5972 + }, + { + "epoch": 0.7098039215686275, + "grad_norm": 0.39045851039046425, + "learning_rate": 1.5461948549915002e-05, + "loss": 0.8774, + "num_tokens": 24965255973.0, + "step": 5973 + }, + { + "epoch": 0.7099227569815805, + "grad_norm": 0.4782538560345269, + "learning_rate": 1.5460444337372158e-05, + "loss": 0.8575, + "num_tokens": 24969418794.0, + "step": 5974 + }, + { + "epoch": 0.7100415923945336, + "grad_norm": 0.3873468882756596, + "learning_rate": 1.5458939959643977e-05, + "loss": 0.8194, + "num_tokens": 24973586532.0, + "step": 5975 + }, + { + "epoch": 0.7101604278074867, + "grad_norm": 0.4609644655842268, + "learning_rate": 1.5457435416786175e-05, + "loss": 0.8645, + "num_tokens": 24977769359.0, + "step": 5976 + }, + { + "epoch": 0.7102792632204397, + "grad_norm": 0.4108462407173202, + "learning_rate": 1.545593070885447e-05, + "loss": 0.8429, + "num_tokens": 24981958626.0, + "step": 5977 + }, + { + "epoch": 0.7103980986333928, + "grad_norm": 0.4143315831422185, + "learning_rate": 1.545442583590458e-05, + "loss": 0.896, + "num_tokens": 24986117828.0, + "step": 5978 + }, + { + "epoch": 0.7105169340463459, + "grad_norm": 0.4448261040250632, + "learning_rate": 1.5452920797992247e-05, + "loss": 0.817, + "num_tokens": 24990298906.0, + "step": 5979 + }, + { + "epoch": 0.7106357694592988, + "grad_norm": 0.43780388075676774, + "learning_rate": 1.54514155951732e-05, + "loss": 0.8669, + "num_tokens": 24994488286.0, + "step": 5980 + }, + { + "epoch": 0.710754604872252, + "grad_norm": 0.42881166616300437, + "learning_rate": 1.544991022750318e-05, + "loss": 0.8683, + "num_tokens": 24998669302.0, + "step": 5981 + }, + { + "epoch": 0.7108734402852049, + "grad_norm": 0.4557783878868525, + "learning_rate": 1.544840469503794e-05, + "loss": 0.8781, + "num_tokens": 25002837266.0, + "step": 5982 + }, + { + "epoch": 0.710992275698158, + "grad_norm": 0.3540349558863282, + "learning_rate": 1.5446898997833232e-05, + "loss": 0.8876, + "num_tokens": 25007022829.0, + "step": 5983 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.4235266885942098, + "learning_rate": 1.5445393135944824e-05, + "loss": 0.8632, + "num_tokens": 25011212592.0, + "step": 5984 + }, + { + "epoch": 0.7112299465240641, + "grad_norm": 0.379561953350964, + "learning_rate": 1.5443887109428472e-05, + "loss": 0.8328, + "num_tokens": 25015371161.0, + "step": 5985 + }, + { + "epoch": 0.7113487819370172, + "grad_norm": 0.4880281888475838, + "learning_rate": 1.5442380918339957e-05, + "loss": 0.853, + "num_tokens": 25019561543.0, + "step": 5986 + }, + { + "epoch": 0.7114676173499703, + "grad_norm": 0.38493978688580854, + "learning_rate": 1.5440874562735058e-05, + "loss": 0.8001, + "num_tokens": 25023711930.0, + "step": 5987 + }, + { + "epoch": 0.7115864527629233, + "grad_norm": 0.447759288807128, + "learning_rate": 1.5439368042669558e-05, + "loss": 0.8399, + "num_tokens": 25027901822.0, + "step": 5988 + }, + { + "epoch": 0.7117052881758764, + "grad_norm": 0.4377811096184991, + "learning_rate": 1.5437861358199252e-05, + "loss": 0.8997, + "num_tokens": 25032090400.0, + "step": 5989 + }, + { + "epoch": 0.7118241235888295, + "grad_norm": 0.3943802395837436, + "learning_rate": 1.543635450937993e-05, + "loss": 0.8691, + "num_tokens": 25036255093.0, + "step": 5990 + }, + { + "epoch": 0.7119429590017825, + "grad_norm": 0.45807909363992694, + "learning_rate": 1.5434847496267404e-05, + "loss": 0.8675, + "num_tokens": 25040443236.0, + "step": 5991 + }, + { + "epoch": 0.7120617944147356, + "grad_norm": 0.4579857770184694, + "learning_rate": 1.5433340318917482e-05, + "loss": 0.8417, + "num_tokens": 25044632752.0, + "step": 5992 + }, + { + "epoch": 0.7121806298276887, + "grad_norm": 0.45767092408733645, + "learning_rate": 1.5431832977385976e-05, + "loss": 0.8205, + "num_tokens": 25048821536.0, + "step": 5993 + }, + { + "epoch": 0.7122994652406417, + "grad_norm": 0.4370169661405082, + "learning_rate": 1.543032547172871e-05, + "loss": 0.8471, + "num_tokens": 25053010491.0, + "step": 5994 + }, + { + "epoch": 0.7124183006535948, + "grad_norm": 0.4122789008571289, + "learning_rate": 1.5428817802001515e-05, + "loss": 0.8064, + "num_tokens": 25057200199.0, + "step": 5995 + }, + { + "epoch": 0.7125371360665478, + "grad_norm": 0.42757791180166793, + "learning_rate": 1.542730996826022e-05, + "loss": 0.8295, + "num_tokens": 25061335398.0, + "step": 5996 + }, + { + "epoch": 0.7126559714795009, + "grad_norm": 0.5868231557046525, + "learning_rate": 1.542580197056067e-05, + "loss": 0.8238, + "num_tokens": 25065525238.0, + "step": 5997 + }, + { + "epoch": 0.712774806892454, + "grad_norm": 0.38949405540593757, + "learning_rate": 1.542429380895871e-05, + "loss": 0.8501, + "num_tokens": 25069714418.0, + "step": 5998 + }, + { + "epoch": 0.712893642305407, + "grad_norm": 0.4059270729714632, + "learning_rate": 1.542278548351019e-05, + "loss": 0.8665, + "num_tokens": 25073902475.0, + "step": 5999 + }, + { + "epoch": 0.7130124777183601, + "grad_norm": 0.49548167066245585, + "learning_rate": 1.542127699427097e-05, + "loss": 0.8716, + "num_tokens": 25078076974.0, + "step": 6000 + }, + { + "epoch": 0.7131313131313132, + "grad_norm": 0.43632027964823433, + "learning_rate": 1.541976834129691e-05, + "loss": 0.8351, + "num_tokens": 25082266750.0, + "step": 6001 + }, + { + "epoch": 0.7132501485442662, + "grad_norm": 0.4480023765672559, + "learning_rate": 1.541825952464389e-05, + "loss": 0.8259, + "num_tokens": 25086448630.0, + "step": 6002 + }, + { + "epoch": 0.7133689839572193, + "grad_norm": 0.4770685254958695, + "learning_rate": 1.5416750544367778e-05, + "loss": 0.8767, + "num_tokens": 25090638045.0, + "step": 6003 + }, + { + "epoch": 0.7134878193701724, + "grad_norm": 0.38250606791647773, + "learning_rate": 1.5415241400524462e-05, + "loss": 0.8468, + "num_tokens": 25094826264.0, + "step": 6004 + }, + { + "epoch": 0.7136066547831253, + "grad_norm": 0.4581704274010005, + "learning_rate": 1.5413732093169828e-05, + "loss": 0.8339, + "num_tokens": 25098993382.0, + "step": 6005 + }, + { + "epoch": 0.7137254901960784, + "grad_norm": 0.4442206058085038, + "learning_rate": 1.541222262235977e-05, + "loss": 0.8541, + "num_tokens": 25103170145.0, + "step": 6006 + }, + { + "epoch": 0.7138443256090314, + "grad_norm": 0.42177411074979343, + "learning_rate": 1.5410712988150192e-05, + "loss": 0.8617, + "num_tokens": 25107347307.0, + "step": 6007 + }, + { + "epoch": 0.7139631610219845, + "grad_norm": 0.3969684310718043, + "learning_rate": 1.5409203190597002e-05, + "loss": 0.8574, + "num_tokens": 25111535798.0, + "step": 6008 + }, + { + "epoch": 0.7140819964349376, + "grad_norm": 0.4559903111607905, + "learning_rate": 1.5407693229756104e-05, + "loss": 0.8631, + "num_tokens": 25115655521.0, + "step": 6009 + }, + { + "epoch": 0.7142008318478906, + "grad_norm": 0.37814991971288275, + "learning_rate": 1.540618310568343e-05, + "loss": 0.8667, + "num_tokens": 25119845578.0, + "step": 6010 + }, + { + "epoch": 0.7143196672608437, + "grad_norm": 0.42730183468607796, + "learning_rate": 1.540467281843489e-05, + "loss": 0.9065, + "num_tokens": 25124014336.0, + "step": 6011 + }, + { + "epoch": 0.7144385026737968, + "grad_norm": 0.4174553360190742, + "learning_rate": 1.5403162368066425e-05, + "loss": 0.8518, + "num_tokens": 25128203193.0, + "step": 6012 + }, + { + "epoch": 0.7145573380867498, + "grad_norm": 0.4597193010852284, + "learning_rate": 1.5401651754633977e-05, + "loss": 0.8546, + "num_tokens": 25132391215.0, + "step": 6013 + }, + { + "epoch": 0.7146761734997029, + "grad_norm": 0.34512953078300634, + "learning_rate": 1.5400140978193473e-05, + "loss": 0.8707, + "num_tokens": 25136554997.0, + "step": 6014 + }, + { + "epoch": 0.714795008912656, + "grad_norm": 0.4590183165766187, + "learning_rate": 1.5398630038800876e-05, + "loss": 0.8529, + "num_tokens": 25140744666.0, + "step": 6015 + }, + { + "epoch": 0.714913844325609, + "grad_norm": 0.5218487060184513, + "learning_rate": 1.5397118936512137e-05, + "loss": 0.8666, + "num_tokens": 25144874016.0, + "step": 6016 + }, + { + "epoch": 0.7150326797385621, + "grad_norm": 0.3942029122393042, + "learning_rate": 1.5395607671383215e-05, + "loss": 0.8471, + "num_tokens": 25149064211.0, + "step": 6017 + }, + { + "epoch": 0.7151515151515152, + "grad_norm": 0.4566657282992354, + "learning_rate": 1.539409624347008e-05, + "loss": 0.8395, + "num_tokens": 25153231587.0, + "step": 6018 + }, + { + "epoch": 0.7152703505644682, + "grad_norm": 0.449424782852102, + "learning_rate": 1.5392584652828704e-05, + "loss": 0.8645, + "num_tokens": 25157422125.0, + "step": 6019 + }, + { + "epoch": 0.7153891859774213, + "grad_norm": 0.40536440271530566, + "learning_rate": 1.5391072899515067e-05, + "loss": 0.8629, + "num_tokens": 25161611763.0, + "step": 6020 + }, + { + "epoch": 0.7155080213903743, + "grad_norm": 0.47977137924629215, + "learning_rate": 1.5389560983585155e-05, + "loss": 0.8441, + "num_tokens": 25165801216.0, + "step": 6021 + }, + { + "epoch": 0.7156268568033274, + "grad_norm": 0.45965965462777497, + "learning_rate": 1.5388048905094956e-05, + "loss": 0.8778, + "num_tokens": 25169989748.0, + "step": 6022 + }, + { + "epoch": 0.7157456922162805, + "grad_norm": 0.355842367376534, + "learning_rate": 1.5386536664100473e-05, + "loss": 0.8688, + "num_tokens": 25174167267.0, + "step": 6023 + }, + { + "epoch": 0.7158645276292335, + "grad_norm": 0.4618762008915873, + "learning_rate": 1.53850242606577e-05, + "loss": 0.8731, + "num_tokens": 25178306280.0, + "step": 6024 + }, + { + "epoch": 0.7159833630421866, + "grad_norm": 0.4636091996381473, + "learning_rate": 1.5383511694822654e-05, + "loss": 0.886, + "num_tokens": 25182494775.0, + "step": 6025 + }, + { + "epoch": 0.7161021984551397, + "grad_norm": 0.3803949758685805, + "learning_rate": 1.5381998966651354e-05, + "loss": 0.8482, + "num_tokens": 25186683132.0, + "step": 6026 + }, + { + "epoch": 0.7162210338680927, + "grad_norm": 0.43457602630311026, + "learning_rate": 1.5380486076199814e-05, + "loss": 0.8361, + "num_tokens": 25190872515.0, + "step": 6027 + }, + { + "epoch": 0.7163398692810458, + "grad_norm": 0.42066838167663034, + "learning_rate": 1.5378973023524062e-05, + "loss": 0.8414, + "num_tokens": 25195062892.0, + "step": 6028 + }, + { + "epoch": 0.7164587046939989, + "grad_norm": 0.3479276120955065, + "learning_rate": 1.5377459808680137e-05, + "loss": 0.8817, + "num_tokens": 25199254146.0, + "step": 6029 + }, + { + "epoch": 0.7165775401069518, + "grad_norm": 0.5283450845165333, + "learning_rate": 1.5375946431724073e-05, + "loss": 0.8494, + "num_tokens": 25203439107.0, + "step": 6030 + }, + { + "epoch": 0.716696375519905, + "grad_norm": 0.39487897147567363, + "learning_rate": 1.5374432892711913e-05, + "loss": 0.821, + "num_tokens": 25207607115.0, + "step": 6031 + }, + { + "epoch": 0.7168152109328579, + "grad_norm": 0.39357458521686095, + "learning_rate": 1.537291919169972e-05, + "loss": 0.8623, + "num_tokens": 25211772056.0, + "step": 6032 + }, + { + "epoch": 0.716934046345811, + "grad_norm": 0.5574683057115607, + "learning_rate": 1.5371405328743536e-05, + "loss": 0.8402, + "num_tokens": 25215939529.0, + "step": 6033 + }, + { + "epoch": 0.7170528817587641, + "grad_norm": 0.3784412204159229, + "learning_rate": 1.5369891303899442e-05, + "loss": 0.877, + "num_tokens": 25220127806.0, + "step": 6034 + }, + { + "epoch": 0.7171717171717171, + "grad_norm": 0.5146384745381433, + "learning_rate": 1.536837711722349e-05, + "loss": 0.8836, + "num_tokens": 25224272601.0, + "step": 6035 + }, + { + "epoch": 0.7172905525846702, + "grad_norm": 0.4538567172644651, + "learning_rate": 1.5366862768771768e-05, + "loss": 0.8458, + "num_tokens": 25228461264.0, + "step": 6036 + }, + { + "epoch": 0.7174093879976233, + "grad_norm": 0.4049119246052225, + "learning_rate": 1.536534825860035e-05, + "loss": 0.8513, + "num_tokens": 25232615878.0, + "step": 6037 + }, + { + "epoch": 0.7175282234105763, + "grad_norm": 0.43158566651382285, + "learning_rate": 1.5363833586765327e-05, + "loss": 0.8573, + "num_tokens": 25236806170.0, + "step": 6038 + }, + { + "epoch": 0.7176470588235294, + "grad_norm": 0.40047580665830107, + "learning_rate": 1.5362318753322795e-05, + "loss": 0.8522, + "num_tokens": 25240978660.0, + "step": 6039 + }, + { + "epoch": 0.7177658942364825, + "grad_norm": 0.40438206422365824, + "learning_rate": 1.5360803758328847e-05, + "loss": 0.8708, + "num_tokens": 25245168407.0, + "step": 6040 + }, + { + "epoch": 0.7178847296494355, + "grad_norm": 0.41707453466693234, + "learning_rate": 1.535928860183959e-05, + "loss": 0.8491, + "num_tokens": 25249346044.0, + "step": 6041 + }, + { + "epoch": 0.7180035650623886, + "grad_norm": 0.38929065752770786, + "learning_rate": 1.5357773283911143e-05, + "loss": 0.8558, + "num_tokens": 25253536969.0, + "step": 6042 + }, + { + "epoch": 0.7181224004753417, + "grad_norm": 0.3947557632189901, + "learning_rate": 1.5356257804599612e-05, + "loss": 0.901, + "num_tokens": 25257725584.0, + "step": 6043 + }, + { + "epoch": 0.7182412358882947, + "grad_norm": 0.4028499011639366, + "learning_rate": 1.535474216396113e-05, + "loss": 0.8611, + "num_tokens": 25261914718.0, + "step": 6044 + }, + { + "epoch": 0.7183600713012478, + "grad_norm": 0.41230392494240037, + "learning_rate": 1.5353226362051818e-05, + "loss": 0.8583, + "num_tokens": 25266104398.0, + "step": 6045 + }, + { + "epoch": 0.7184789067142008, + "grad_norm": 0.41429272006909623, + "learning_rate": 1.5351710398927813e-05, + "loss": 0.8619, + "num_tokens": 25270293799.0, + "step": 6046 + }, + { + "epoch": 0.7185977421271539, + "grad_norm": 0.4257242626181569, + "learning_rate": 1.5350194274645264e-05, + "loss": 0.8437, + "num_tokens": 25274464630.0, + "step": 6047 + }, + { + "epoch": 0.718716577540107, + "grad_norm": 0.38035372805246187, + "learning_rate": 1.534867798926031e-05, + "loss": 0.865, + "num_tokens": 25278653683.0, + "step": 6048 + }, + { + "epoch": 0.71883541295306, + "grad_norm": 0.43806599416298786, + "learning_rate": 1.534716154282911e-05, + "loss": 0.8914, + "num_tokens": 25282842811.0, + "step": 6049 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 0.44908739870452474, + "learning_rate": 1.5345644935407816e-05, + "loss": 0.8685, + "num_tokens": 25287031989.0, + "step": 6050 + }, + { + "epoch": 0.7190730837789662, + "grad_norm": 0.45033564687507577, + "learning_rate": 1.5344128167052598e-05, + "loss": 0.8233, + "num_tokens": 25291220223.0, + "step": 6051 + }, + { + "epoch": 0.7191919191919192, + "grad_norm": 0.4183208975780465, + "learning_rate": 1.534261123781963e-05, + "loss": 0.8607, + "num_tokens": 25295410432.0, + "step": 6052 + }, + { + "epoch": 0.7193107546048723, + "grad_norm": 0.3877756485767912, + "learning_rate": 1.5341094147765082e-05, + "loss": 0.8512, + "num_tokens": 25299598640.0, + "step": 6053 + }, + { + "epoch": 0.7194295900178254, + "grad_norm": 0.42922186179076055, + "learning_rate": 1.5339576896945135e-05, + "loss": 0.8633, + "num_tokens": 25303787973.0, + "step": 6054 + }, + { + "epoch": 0.7195484254307783, + "grad_norm": 0.3918016123784684, + "learning_rate": 1.5338059485415994e-05, + "loss": 0.8278, + "num_tokens": 25307977224.0, + "step": 6055 + }, + { + "epoch": 0.7196672608437314, + "grad_norm": 0.4256634725356925, + "learning_rate": 1.5336541913233834e-05, + "loss": 0.8371, + "num_tokens": 25312141444.0, + "step": 6056 + }, + { + "epoch": 0.7197860962566844, + "grad_norm": 0.4950282571021606, + "learning_rate": 1.5335024180454867e-05, + "loss": 0.8854, + "num_tokens": 25316329225.0, + "step": 6057 + }, + { + "epoch": 0.7199049316696375, + "grad_norm": 0.43374216558113016, + "learning_rate": 1.5333506287135297e-05, + "loss": 0.8596, + "num_tokens": 25320517597.0, + "step": 6058 + }, + { + "epoch": 0.7200237670825906, + "grad_norm": 0.465054135989733, + "learning_rate": 1.533198823333134e-05, + "loss": 0.8386, + "num_tokens": 25324707959.0, + "step": 6059 + }, + { + "epoch": 0.7201426024955436, + "grad_norm": 0.34841599068687773, + "learning_rate": 1.533047001909921e-05, + "loss": 0.8707, + "num_tokens": 25328898142.0, + "step": 6060 + }, + { + "epoch": 0.7202614379084967, + "grad_norm": 0.5691330412745991, + "learning_rate": 1.5328951644495133e-05, + "loss": 0.8588, + "num_tokens": 25333087009.0, + "step": 6061 + }, + { + "epoch": 0.7203802733214498, + "grad_norm": 0.43756257241623825, + "learning_rate": 1.5327433109575342e-05, + "loss": 0.8522, + "num_tokens": 25337267611.0, + "step": 6062 + }, + { + "epoch": 0.7204991087344028, + "grad_norm": 0.3786730381967395, + "learning_rate": 1.532591441439607e-05, + "loss": 0.8601, + "num_tokens": 25341439560.0, + "step": 6063 + }, + { + "epoch": 0.7206179441473559, + "grad_norm": 0.4680728803059874, + "learning_rate": 1.5324395559013564e-05, + "loss": 0.9165, + "num_tokens": 25345627214.0, + "step": 6064 + }, + { + "epoch": 0.720736779560309, + "grad_norm": 0.44769535321941156, + "learning_rate": 1.5322876543484067e-05, + "loss": 0.8556, + "num_tokens": 25349789874.0, + "step": 6065 + }, + { + "epoch": 0.720855614973262, + "grad_norm": 0.4764248954149781, + "learning_rate": 1.5321357367863835e-05, + "loss": 0.8331, + "num_tokens": 25353953631.0, + "step": 6066 + }, + { + "epoch": 0.7209744503862151, + "grad_norm": 0.5472716371849483, + "learning_rate": 1.531983803220913e-05, + "loss": 0.8474, + "num_tokens": 25358143572.0, + "step": 6067 + }, + { + "epoch": 0.7210932857991682, + "grad_norm": 0.37823291439237516, + "learning_rate": 1.5318318536576217e-05, + "loss": 0.8196, + "num_tokens": 25362333244.0, + "step": 6068 + }, + { + "epoch": 0.7212121212121212, + "grad_norm": 0.6257475808966309, + "learning_rate": 1.5316798881021367e-05, + "loss": 0.855, + "num_tokens": 25366496510.0, + "step": 6069 + }, + { + "epoch": 0.7213309566250743, + "grad_norm": 0.4054870257205317, + "learning_rate": 1.531527906560086e-05, + "loss": 0.8574, + "num_tokens": 25370684850.0, + "step": 6070 + }, + { + "epoch": 0.7214497920380273, + "grad_norm": 0.5787723278849503, + "learning_rate": 1.5313759090370975e-05, + "loss": 0.8657, + "num_tokens": 25374872316.0, + "step": 6071 + }, + { + "epoch": 0.7215686274509804, + "grad_norm": 0.4584934861544245, + "learning_rate": 1.531223895538801e-05, + "loss": 0.8536, + "num_tokens": 25379047907.0, + "step": 6072 + }, + { + "epoch": 0.7216874628639335, + "grad_norm": 0.5558667583263092, + "learning_rate": 1.5310718660708255e-05, + "loss": 0.8254, + "num_tokens": 25383218682.0, + "step": 6073 + }, + { + "epoch": 0.7218062982768865, + "grad_norm": 0.4575710409515203, + "learning_rate": 1.530919820638801e-05, + "loss": 0.8418, + "num_tokens": 25387393944.0, + "step": 6074 + }, + { + "epoch": 0.7219251336898396, + "grad_norm": 0.5195146995547698, + "learning_rate": 1.530767759248359e-05, + "loss": 0.865, + "num_tokens": 25391582317.0, + "step": 6075 + }, + { + "epoch": 0.7220439691027927, + "grad_norm": 0.5439732993419522, + "learning_rate": 1.5306156819051304e-05, + "loss": 0.8534, + "num_tokens": 25395743799.0, + "step": 6076 + }, + { + "epoch": 0.7221628045157457, + "grad_norm": 0.4440156997309242, + "learning_rate": 1.5304635886147473e-05, + "loss": 0.8406, + "num_tokens": 25399926590.0, + "step": 6077 + }, + { + "epoch": 0.7222816399286988, + "grad_norm": 0.5151690456493963, + "learning_rate": 1.5303114793828417e-05, + "loss": 0.8371, + "num_tokens": 25404069388.0, + "step": 6078 + }, + { + "epoch": 0.7224004753416519, + "grad_norm": 0.4464850996283529, + "learning_rate": 1.5301593542150472e-05, + "loss": 0.8444, + "num_tokens": 25408258862.0, + "step": 6079 + }, + { + "epoch": 0.7225193107546048, + "grad_norm": 0.5322691179383965, + "learning_rate": 1.5300072131169973e-05, + "loss": 0.8091, + "num_tokens": 25412448551.0, + "step": 6080 + }, + { + "epoch": 0.722638146167558, + "grad_norm": 0.4134867304831373, + "learning_rate": 1.529855056094327e-05, + "loss": 0.8282, + "num_tokens": 25416637703.0, + "step": 6081 + }, + { + "epoch": 0.7227569815805109, + "grad_norm": 0.4384456651728874, + "learning_rate": 1.52970288315267e-05, + "loss": 0.8374, + "num_tokens": 25420828056.0, + "step": 6082 + }, + { + "epoch": 0.722875816993464, + "grad_norm": 0.4137759790214347, + "learning_rate": 1.5295506942976626e-05, + "loss": 0.893, + "num_tokens": 25425016082.0, + "step": 6083 + }, + { + "epoch": 0.7229946524064171, + "grad_norm": 0.512321811137885, + "learning_rate": 1.5293984895349407e-05, + "loss": 0.8786, + "num_tokens": 25429181751.0, + "step": 6084 + }, + { + "epoch": 0.7231134878193701, + "grad_norm": 0.3919281926963124, + "learning_rate": 1.5292462688701408e-05, + "loss": 0.8724, + "num_tokens": 25433372138.0, + "step": 6085 + }, + { + "epoch": 0.7232323232323232, + "grad_norm": 0.4993121500066138, + "learning_rate": 1.5290940323089004e-05, + "loss": 0.8358, + "num_tokens": 25437520378.0, + "step": 6086 + }, + { + "epoch": 0.7233511586452763, + "grad_norm": 0.4183239653135771, + "learning_rate": 1.5289417798568568e-05, + "loss": 0.8253, + "num_tokens": 25441710398.0, + "step": 6087 + }, + { + "epoch": 0.7234699940582293, + "grad_norm": 0.472333244749735, + "learning_rate": 1.528789511519649e-05, + "loss": 0.8428, + "num_tokens": 25445893082.0, + "step": 6088 + }, + { + "epoch": 0.7235888294711824, + "grad_norm": 0.44455958012835267, + "learning_rate": 1.5286372273029158e-05, + "loss": 0.8554, + "num_tokens": 25450073163.0, + "step": 6089 + }, + { + "epoch": 0.7237076648841355, + "grad_norm": 0.48235008317458483, + "learning_rate": 1.528484927212297e-05, + "loss": 0.8418, + "num_tokens": 25454236961.0, + "step": 6090 + }, + { + "epoch": 0.7238265002970885, + "grad_norm": 0.4990424151497763, + "learning_rate": 1.528332611253432e-05, + "loss": 0.8509, + "num_tokens": 25458427823.0, + "step": 6091 + }, + { + "epoch": 0.7239453357100416, + "grad_norm": 0.38064346128588816, + "learning_rate": 1.5281802794319633e-05, + "loss": 0.8883, + "num_tokens": 25462617666.0, + "step": 6092 + }, + { + "epoch": 0.7240641711229947, + "grad_norm": 0.442274207777763, + "learning_rate": 1.5280279317535304e-05, + "loss": 0.8666, + "num_tokens": 25466806375.0, + "step": 6093 + }, + { + "epoch": 0.7241830065359477, + "grad_norm": 0.4128422936526019, + "learning_rate": 1.5278755682237758e-05, + "loss": 0.8525, + "num_tokens": 25470924592.0, + "step": 6094 + }, + { + "epoch": 0.7243018419489008, + "grad_norm": 0.4434793900229196, + "learning_rate": 1.5277231888483424e-05, + "loss": 0.8521, + "num_tokens": 25475114707.0, + "step": 6095 + }, + { + "epoch": 0.7244206773618538, + "grad_norm": 0.36018289612072424, + "learning_rate": 1.5275707936328732e-05, + "loss": 0.8307, + "num_tokens": 25479303007.0, + "step": 6096 + }, + { + "epoch": 0.7245395127748069, + "grad_norm": 0.45973419723487136, + "learning_rate": 1.5274183825830124e-05, + "loss": 0.8565, + "num_tokens": 25483492793.0, + "step": 6097 + }, + { + "epoch": 0.72465834818776, + "grad_norm": 0.42381361087023434, + "learning_rate": 1.527265955704403e-05, + "loss": 0.8718, + "num_tokens": 25487682202.0, + "step": 6098 + }, + { + "epoch": 0.724777183600713, + "grad_norm": 0.4070476891261142, + "learning_rate": 1.5271135130026912e-05, + "loss": 0.8222, + "num_tokens": 25491869724.0, + "step": 6099 + }, + { + "epoch": 0.7248960190136661, + "grad_norm": 0.48011361994322677, + "learning_rate": 1.5269610544835215e-05, + "loss": 0.8101, + "num_tokens": 25496047269.0, + "step": 6100 + }, + { + "epoch": 0.7250148544266192, + "grad_norm": 0.37468396944173066, + "learning_rate": 1.5268085801525404e-05, + "loss": 0.8478, + "num_tokens": 25500210596.0, + "step": 6101 + }, + { + "epoch": 0.7251336898395722, + "grad_norm": 0.4398820753195104, + "learning_rate": 1.5266560900153945e-05, + "loss": 0.879, + "num_tokens": 25504387026.0, + "step": 6102 + }, + { + "epoch": 0.7252525252525253, + "grad_norm": 0.3855421519596729, + "learning_rate": 1.526503584077731e-05, + "loss": 0.8812, + "num_tokens": 25508573815.0, + "step": 6103 + }, + { + "epoch": 0.7253713606654784, + "grad_norm": 0.4878000208964175, + "learning_rate": 1.526351062345198e-05, + "loss": 0.8558, + "num_tokens": 25512738005.0, + "step": 6104 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 0.3930875617909119, + "learning_rate": 1.5261985248234436e-05, + "loss": 0.8043, + "num_tokens": 25516863279.0, + "step": 6105 + }, + { + "epoch": 0.7256090314913844, + "grad_norm": 0.39855394968448904, + "learning_rate": 1.5260459715181165e-05, + "loss": 0.8697, + "num_tokens": 25521023789.0, + "step": 6106 + }, + { + "epoch": 0.7257278669043374, + "grad_norm": 0.4689147714667717, + "learning_rate": 1.525893402434867e-05, + "loss": 0.8031, + "num_tokens": 25525188073.0, + "step": 6107 + }, + { + "epoch": 0.7258467023172905, + "grad_norm": 0.38258561100132715, + "learning_rate": 1.5257408175793444e-05, + "loss": 0.8606, + "num_tokens": 25529375902.0, + "step": 6108 + }, + { + "epoch": 0.7259655377302436, + "grad_norm": 0.46815088660390547, + "learning_rate": 1.5255882169572001e-05, + "loss": 0.8549, + "num_tokens": 25533522125.0, + "step": 6109 + }, + { + "epoch": 0.7260843731431966, + "grad_norm": 0.43852186403117943, + "learning_rate": 1.5254356005740853e-05, + "loss": 0.8422, + "num_tokens": 25537711705.0, + "step": 6110 + }, + { + "epoch": 0.7262032085561497, + "grad_norm": 0.3890599585710891, + "learning_rate": 1.5252829684356516e-05, + "loss": 0.8585, + "num_tokens": 25541900928.0, + "step": 6111 + }, + { + "epoch": 0.7263220439691028, + "grad_norm": 0.42243421706662787, + "learning_rate": 1.5251303205475518e-05, + "loss": 0.8484, + "num_tokens": 25546060587.0, + "step": 6112 + }, + { + "epoch": 0.7264408793820558, + "grad_norm": 0.40156330178897337, + "learning_rate": 1.5249776569154387e-05, + "loss": 0.8506, + "num_tokens": 25550249567.0, + "step": 6113 + }, + { + "epoch": 0.7265597147950089, + "grad_norm": 0.4361198328925136, + "learning_rate": 1.5248249775449657e-05, + "loss": 0.7886, + "num_tokens": 25554426245.0, + "step": 6114 + }, + { + "epoch": 0.726678550207962, + "grad_norm": 0.41485261567705256, + "learning_rate": 1.5246722824417882e-05, + "loss": 0.8395, + "num_tokens": 25558615790.0, + "step": 6115 + }, + { + "epoch": 0.726797385620915, + "grad_norm": 0.4323301772841447, + "learning_rate": 1.5245195716115597e-05, + "loss": 0.8515, + "num_tokens": 25562803680.0, + "step": 6116 + }, + { + "epoch": 0.7269162210338681, + "grad_norm": 0.34260072357810173, + "learning_rate": 1.5243668450599363e-05, + "loss": 0.8284, + "num_tokens": 25566994424.0, + "step": 6117 + }, + { + "epoch": 0.7270350564468212, + "grad_norm": 0.4280166118383571, + "learning_rate": 1.524214102792574e-05, + "loss": 0.8435, + "num_tokens": 25571182117.0, + "step": 6118 + }, + { + "epoch": 0.7271538918597742, + "grad_norm": 0.47018417142692415, + "learning_rate": 1.5240613448151287e-05, + "loss": 0.8907, + "num_tokens": 25575372367.0, + "step": 6119 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.43652496496443444, + "learning_rate": 1.5239085711332584e-05, + "loss": 0.866, + "num_tokens": 25579560638.0, + "step": 6120 + }, + { + "epoch": 0.7273915626856803, + "grad_norm": 0.3974459347872877, + "learning_rate": 1.5237557817526202e-05, + "loss": 0.8727, + "num_tokens": 25583731423.0, + "step": 6121 + }, + { + "epoch": 0.7275103980986334, + "grad_norm": 0.4485939915006867, + "learning_rate": 1.523602976678873e-05, + "loss": 0.875, + "num_tokens": 25587899282.0, + "step": 6122 + }, + { + "epoch": 0.7276292335115865, + "grad_norm": 0.3499128765435133, + "learning_rate": 1.5234501559176751e-05, + "loss": 0.8891, + "num_tokens": 25592088416.0, + "step": 6123 + }, + { + "epoch": 0.7277480689245395, + "grad_norm": 0.42210519455880247, + "learning_rate": 1.5232973194746858e-05, + "loss": 0.867, + "num_tokens": 25596277996.0, + "step": 6124 + }, + { + "epoch": 0.7278669043374926, + "grad_norm": 0.4051079587327184, + "learning_rate": 1.5231444673555661e-05, + "loss": 0.8421, + "num_tokens": 25600447195.0, + "step": 6125 + }, + { + "epoch": 0.7279857397504457, + "grad_norm": 0.3622593138155036, + "learning_rate": 1.5229915995659761e-05, + "loss": 0.8067, + "num_tokens": 25604636590.0, + "step": 6126 + }, + { + "epoch": 0.7281045751633987, + "grad_norm": 0.37111834427710766, + "learning_rate": 1.5228387161115767e-05, + "loss": 0.8525, + "num_tokens": 25608801495.0, + "step": 6127 + }, + { + "epoch": 0.7282234105763518, + "grad_norm": 0.4325046456063645, + "learning_rate": 1.5226858169980303e-05, + "loss": 0.875, + "num_tokens": 25612990884.0, + "step": 6128 + }, + { + "epoch": 0.7283422459893049, + "grad_norm": 0.4510799161788699, + "learning_rate": 1.5225329022309986e-05, + "loss": 0.8386, + "num_tokens": 25617180379.0, + "step": 6129 + }, + { + "epoch": 0.7284610814022578, + "grad_norm": 0.42950792377787606, + "learning_rate": 1.5223799718161454e-05, + "loss": 0.869, + "num_tokens": 25621367640.0, + "step": 6130 + }, + { + "epoch": 0.728579916815211, + "grad_norm": 0.42336779662500157, + "learning_rate": 1.5222270257591333e-05, + "loss": 0.8336, + "num_tokens": 25625556519.0, + "step": 6131 + }, + { + "epoch": 0.728698752228164, + "grad_norm": 0.42759869789361576, + "learning_rate": 1.522074064065627e-05, + "loss": 0.8524, + "num_tokens": 25629746226.0, + "step": 6132 + }, + { + "epoch": 0.728817587641117, + "grad_norm": 0.4060641647641769, + "learning_rate": 1.5219210867412913e-05, + "loss": 0.8535, + "num_tokens": 25633935683.0, + "step": 6133 + }, + { + "epoch": 0.7289364230540701, + "grad_norm": 0.3891202939523258, + "learning_rate": 1.5217680937917907e-05, + "loss": 0.8475, + "num_tokens": 25638110301.0, + "step": 6134 + }, + { + "epoch": 0.7290552584670231, + "grad_norm": 0.5102766373637305, + "learning_rate": 1.521615085222792e-05, + "loss": 0.8273, + "num_tokens": 25642298838.0, + "step": 6135 + }, + { + "epoch": 0.7291740938799762, + "grad_norm": 0.40265974407320787, + "learning_rate": 1.5214620610399609e-05, + "loss": 0.8276, + "num_tokens": 25646480185.0, + "step": 6136 + }, + { + "epoch": 0.7292929292929293, + "grad_norm": 0.5102656298995658, + "learning_rate": 1.5213090212489647e-05, + "loss": 0.8101, + "num_tokens": 25650669342.0, + "step": 6137 + }, + { + "epoch": 0.7294117647058823, + "grad_norm": 0.44482963149690136, + "learning_rate": 1.5211559658554713e-05, + "loss": 0.8633, + "num_tokens": 25654858219.0, + "step": 6138 + }, + { + "epoch": 0.7295306001188354, + "grad_norm": 0.39665743303592627, + "learning_rate": 1.5210028948651481e-05, + "loss": 0.8219, + "num_tokens": 25659047333.0, + "step": 6139 + }, + { + "epoch": 0.7296494355317885, + "grad_norm": 0.4650430847215476, + "learning_rate": 1.5208498082836644e-05, + "loss": 0.8565, + "num_tokens": 25663236321.0, + "step": 6140 + }, + { + "epoch": 0.7297682709447415, + "grad_norm": 0.4082528882004637, + "learning_rate": 1.5206967061166895e-05, + "loss": 0.8293, + "num_tokens": 25667396355.0, + "step": 6141 + }, + { + "epoch": 0.7298871063576946, + "grad_norm": 0.4780468675955533, + "learning_rate": 1.5205435883698931e-05, + "loss": 0.8195, + "num_tokens": 25671586345.0, + "step": 6142 + }, + { + "epoch": 0.7300059417706477, + "grad_norm": 0.4058064228488224, + "learning_rate": 1.5203904550489456e-05, + "loss": 0.8657, + "num_tokens": 25675776197.0, + "step": 6143 + }, + { + "epoch": 0.7301247771836007, + "grad_norm": 0.3997468468929483, + "learning_rate": 1.5202373061595184e-05, + "loss": 0.7942, + "num_tokens": 25679955474.0, + "step": 6144 + }, + { + "epoch": 0.7302436125965538, + "grad_norm": 0.44532128833019186, + "learning_rate": 1.5200841417072827e-05, + "loss": 0.8237, + "num_tokens": 25684145840.0, + "step": 6145 + }, + { + "epoch": 0.7303624480095068, + "grad_norm": 0.4372663320186755, + "learning_rate": 1.519930961697911e-05, + "loss": 0.8514, + "num_tokens": 25688334104.0, + "step": 6146 + }, + { + "epoch": 0.7304812834224599, + "grad_norm": 0.46290710935402507, + "learning_rate": 1.5197777661370758e-05, + "loss": 0.8107, + "num_tokens": 25692517561.0, + "step": 6147 + }, + { + "epoch": 0.730600118835413, + "grad_norm": 0.39071620256790535, + "learning_rate": 1.5196245550304509e-05, + "loss": 0.8338, + "num_tokens": 25696706725.0, + "step": 6148 + }, + { + "epoch": 0.730718954248366, + "grad_norm": 0.3784250354021564, + "learning_rate": 1.5194713283837098e-05, + "loss": 0.8669, + "num_tokens": 25700896690.0, + "step": 6149 + }, + { + "epoch": 0.7308377896613191, + "grad_norm": 0.4155549413241824, + "learning_rate": 1.519318086202527e-05, + "loss": 0.8536, + "num_tokens": 25705087202.0, + "step": 6150 + }, + { + "epoch": 0.7309566250742722, + "grad_norm": 0.447472894708594, + "learning_rate": 1.5191648284925781e-05, + "loss": 0.8207, + "num_tokens": 25709232668.0, + "step": 6151 + }, + { + "epoch": 0.7310754604872252, + "grad_norm": 0.4862444177644558, + "learning_rate": 1.5190115552595379e-05, + "loss": 0.8675, + "num_tokens": 25713399765.0, + "step": 6152 + }, + { + "epoch": 0.7311942959001783, + "grad_norm": 0.3900716530404355, + "learning_rate": 1.5188582665090834e-05, + "loss": 0.8688, + "num_tokens": 25717588568.0, + "step": 6153 + }, + { + "epoch": 0.7313131313131314, + "grad_norm": 0.3985328286604372, + "learning_rate": 1.518704962246891e-05, + "loss": 0.8723, + "num_tokens": 25721748665.0, + "step": 6154 + }, + { + "epoch": 0.7314319667260843, + "grad_norm": 0.4085349701997171, + "learning_rate": 1.5185516424786382e-05, + "loss": 0.8593, + "num_tokens": 25725938250.0, + "step": 6155 + }, + { + "epoch": 0.7315508021390374, + "grad_norm": 0.47428955395578987, + "learning_rate": 1.518398307210003e-05, + "loss": 0.8427, + "num_tokens": 25730128534.0, + "step": 6156 + }, + { + "epoch": 0.7316696375519905, + "grad_norm": 0.45427819614886916, + "learning_rate": 1.5182449564466637e-05, + "loss": 0.8606, + "num_tokens": 25734316612.0, + "step": 6157 + }, + { + "epoch": 0.7317884729649435, + "grad_norm": 0.422346541832794, + "learning_rate": 1.5180915901942996e-05, + "loss": 0.858, + "num_tokens": 25738505899.0, + "step": 6158 + }, + { + "epoch": 0.7319073083778966, + "grad_norm": 0.3445399757591619, + "learning_rate": 1.5179382084585903e-05, + "loss": 0.87, + "num_tokens": 25742680676.0, + "step": 6159 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.38299242265585914, + "learning_rate": 1.517784811245216e-05, + "loss": 0.8282, + "num_tokens": 25746849438.0, + "step": 6160 + }, + { + "epoch": 0.7321449792038027, + "grad_norm": 0.40048129436524843, + "learning_rate": 1.5176313985598579e-05, + "loss": 0.8813, + "num_tokens": 25751039522.0, + "step": 6161 + }, + { + "epoch": 0.7322638146167558, + "grad_norm": 0.38958472210427375, + "learning_rate": 1.5174779704081968e-05, + "loss": 0.8419, + "num_tokens": 25755229084.0, + "step": 6162 + }, + { + "epoch": 0.7323826500297088, + "grad_norm": 0.3584484936563141, + "learning_rate": 1.5173245267959146e-05, + "loss": 0.8633, + "num_tokens": 25759417622.0, + "step": 6163 + }, + { + "epoch": 0.7325014854426619, + "grad_norm": 0.4053576911266291, + "learning_rate": 1.5171710677286948e-05, + "loss": 0.7872, + "num_tokens": 25763574682.0, + "step": 6164 + }, + { + "epoch": 0.732620320855615, + "grad_norm": 0.38172642554203184, + "learning_rate": 1.5170175932122195e-05, + "loss": 0.8499, + "num_tokens": 25767735228.0, + "step": 6165 + }, + { + "epoch": 0.732739156268568, + "grad_norm": 0.3867097142043311, + "learning_rate": 1.516864103252173e-05, + "loss": 0.8502, + "num_tokens": 25771923428.0, + "step": 6166 + }, + { + "epoch": 0.7328579916815211, + "grad_norm": 0.45275440255408905, + "learning_rate": 1.5167105978542391e-05, + "loss": 0.8431, + "num_tokens": 25776084593.0, + "step": 6167 + }, + { + "epoch": 0.7329768270944742, + "grad_norm": 0.375079729724938, + "learning_rate": 1.5165570770241027e-05, + "loss": 0.8799, + "num_tokens": 25780256106.0, + "step": 6168 + }, + { + "epoch": 0.7330956625074272, + "grad_norm": 0.4438672930094432, + "learning_rate": 1.5164035407674496e-05, + "loss": 0.8179, + "num_tokens": 25784445816.0, + "step": 6169 + }, + { + "epoch": 0.7332144979203803, + "grad_norm": 0.44932515581053506, + "learning_rate": 1.5162499890899655e-05, + "loss": 0.8513, + "num_tokens": 25788615951.0, + "step": 6170 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.483871228126475, + "learning_rate": 1.5160964219973366e-05, + "loss": 0.8283, + "num_tokens": 25792790608.0, + "step": 6171 + }, + { + "epoch": 0.7334521687462864, + "grad_norm": 0.3763361027722767, + "learning_rate": 1.5159428394952508e-05, + "loss": 0.8618, + "num_tokens": 25796952768.0, + "step": 6172 + }, + { + "epoch": 0.7335710041592395, + "grad_norm": 0.4149545744724164, + "learning_rate": 1.515789241589395e-05, + "loss": 0.8535, + "num_tokens": 25801087461.0, + "step": 6173 + }, + { + "epoch": 0.7336898395721925, + "grad_norm": 0.40909465596708566, + "learning_rate": 1.5156356282854576e-05, + "loss": 0.8451, + "num_tokens": 25805276030.0, + "step": 6174 + }, + { + "epoch": 0.7338086749851456, + "grad_norm": 0.4475019649260435, + "learning_rate": 1.5154819995891281e-05, + "loss": 0.8761, + "num_tokens": 25809464380.0, + "step": 6175 + }, + { + "epoch": 0.7339275103980987, + "grad_norm": 0.36893899240770156, + "learning_rate": 1.515328355506095e-05, + "loss": 0.8717, + "num_tokens": 25813653072.0, + "step": 6176 + }, + { + "epoch": 0.7340463458110517, + "grad_norm": 0.4614333030706096, + "learning_rate": 1.5151746960420488e-05, + "loss": 0.8512, + "num_tokens": 25817840939.0, + "step": 6177 + }, + { + "epoch": 0.7341651812240048, + "grad_norm": 0.46038690374029034, + "learning_rate": 1.5150210212026797e-05, + "loss": 0.8564, + "num_tokens": 25822029818.0, + "step": 6178 + }, + { + "epoch": 0.7342840166369579, + "grad_norm": 0.3977348595846228, + "learning_rate": 1.5148673309936793e-05, + "loss": 0.8084, + "num_tokens": 25826219173.0, + "step": 6179 + }, + { + "epoch": 0.7344028520499108, + "grad_norm": 0.5142855607903251, + "learning_rate": 1.5147136254207389e-05, + "loss": 0.855, + "num_tokens": 25830383567.0, + "step": 6180 + }, + { + "epoch": 0.734521687462864, + "grad_norm": 0.41839098167943206, + "learning_rate": 1.5145599044895503e-05, + "loss": 0.8466, + "num_tokens": 25834573615.0, + "step": 6181 + }, + { + "epoch": 0.734640522875817, + "grad_norm": 0.4167514429498368, + "learning_rate": 1.5144061682058071e-05, + "loss": 0.7952, + "num_tokens": 25838762758.0, + "step": 6182 + }, + { + "epoch": 0.73475935828877, + "grad_norm": 0.40359632123224026, + "learning_rate": 1.5142524165752024e-05, + "loss": 0.8461, + "num_tokens": 25842925829.0, + "step": 6183 + }, + { + "epoch": 0.7348781937017231, + "grad_norm": 0.40978250326501964, + "learning_rate": 1.5140986496034298e-05, + "loss": 0.8547, + "num_tokens": 25847115496.0, + "step": 6184 + }, + { + "epoch": 0.7349970291146761, + "grad_norm": 0.40528266182559397, + "learning_rate": 1.5139448672961845e-05, + "loss": 0.8645, + "num_tokens": 25851268798.0, + "step": 6185 + }, + { + "epoch": 0.7351158645276292, + "grad_norm": 0.4569007182161227, + "learning_rate": 1.5137910696591609e-05, + "loss": 0.8535, + "num_tokens": 25855459108.0, + "step": 6186 + }, + { + "epoch": 0.7352346999405823, + "grad_norm": 0.40266559295676785, + "learning_rate": 1.5136372566980548e-05, + "loss": 0.8173, + "num_tokens": 25859648171.0, + "step": 6187 + }, + { + "epoch": 0.7353535353535353, + "grad_norm": 0.38106373379896963, + "learning_rate": 1.513483428418563e-05, + "loss": 0.8268, + "num_tokens": 25863836010.0, + "step": 6188 + }, + { + "epoch": 0.7354723707664884, + "grad_norm": 0.4599726279070336, + "learning_rate": 1.5133295848263816e-05, + "loss": 0.8896, + "num_tokens": 25868025782.0, + "step": 6189 + }, + { + "epoch": 0.7355912061794415, + "grad_norm": 0.44981669965850735, + "learning_rate": 1.5131757259272081e-05, + "loss": 0.8564, + "num_tokens": 25872215139.0, + "step": 6190 + }, + { + "epoch": 0.7357100415923945, + "grad_norm": 0.38210382042004654, + "learning_rate": 1.5130218517267405e-05, + "loss": 0.8038, + "num_tokens": 25876339943.0, + "step": 6191 + }, + { + "epoch": 0.7358288770053476, + "grad_norm": 0.4189916831174223, + "learning_rate": 1.5128679622306774e-05, + "loss": 0.8314, + "num_tokens": 25880504877.0, + "step": 6192 + }, + { + "epoch": 0.7359477124183007, + "grad_norm": 0.3580359211427371, + "learning_rate": 1.512714057444718e-05, + "loss": 0.8337, + "num_tokens": 25884682474.0, + "step": 6193 + }, + { + "epoch": 0.7360665478312537, + "grad_norm": 0.35904560480110326, + "learning_rate": 1.5125601373745608e-05, + "loss": 0.8772, + "num_tokens": 25888871622.0, + "step": 6194 + }, + { + "epoch": 0.7361853832442068, + "grad_norm": 0.3669193389036534, + "learning_rate": 1.5124062020259074e-05, + "loss": 0.8373, + "num_tokens": 25893059892.0, + "step": 6195 + }, + { + "epoch": 0.7363042186571598, + "grad_norm": 0.386289121205572, + "learning_rate": 1.512252251404458e-05, + "loss": 0.8542, + "num_tokens": 25897249604.0, + "step": 6196 + }, + { + "epoch": 0.7364230540701129, + "grad_norm": 0.40261791860208684, + "learning_rate": 1.5120982855159135e-05, + "loss": 0.8194, + "num_tokens": 25901439561.0, + "step": 6197 + }, + { + "epoch": 0.736541889483066, + "grad_norm": 0.4605273846989427, + "learning_rate": 1.5119443043659767e-05, + "loss": 0.8565, + "num_tokens": 25905627144.0, + "step": 6198 + }, + { + "epoch": 0.736660724896019, + "grad_norm": 0.3918474848659101, + "learning_rate": 1.511790307960349e-05, + "loss": 0.8553, + "num_tokens": 25909793801.0, + "step": 6199 + }, + { + "epoch": 0.7367795603089721, + "grad_norm": 0.4773404222679931, + "learning_rate": 1.511636296304734e-05, + "loss": 0.8478, + "num_tokens": 25913981483.0, + "step": 6200 + }, + { + "epoch": 0.7368983957219252, + "grad_norm": 0.41984502292061743, + "learning_rate": 1.5114822694048353e-05, + "loss": 0.8287, + "num_tokens": 25918170061.0, + "step": 6201 + }, + { + "epoch": 0.7370172311348782, + "grad_norm": 0.4545508564014787, + "learning_rate": 1.511328227266357e-05, + "loss": 0.8394, + "num_tokens": 25922359405.0, + "step": 6202 + }, + { + "epoch": 0.7371360665478313, + "grad_norm": 0.3662862102186932, + "learning_rate": 1.5111741698950033e-05, + "loss": 0.8183, + "num_tokens": 25926550353.0, + "step": 6203 + }, + { + "epoch": 0.7372549019607844, + "grad_norm": 0.455648757057555, + "learning_rate": 1.5110200972964801e-05, + "loss": 0.8309, + "num_tokens": 25930738221.0, + "step": 6204 + }, + { + "epoch": 0.7373737373737373, + "grad_norm": 0.4900152275608157, + "learning_rate": 1.510866009476493e-05, + "loss": 0.8722, + "num_tokens": 25934927140.0, + "step": 6205 + }, + { + "epoch": 0.7374925727866904, + "grad_norm": 0.38760798390541196, + "learning_rate": 1.5107119064407486e-05, + "loss": 0.8133, + "num_tokens": 25939077950.0, + "step": 6206 + }, + { + "epoch": 0.7376114081996435, + "grad_norm": 0.4469886901812821, + "learning_rate": 1.5105577881949533e-05, + "loss": 0.8424, + "num_tokens": 25943244524.0, + "step": 6207 + }, + { + "epoch": 0.7377302436125965, + "grad_norm": 0.38336892994516303, + "learning_rate": 1.5104036547448153e-05, + "loss": 0.8574, + "num_tokens": 25947433592.0, + "step": 6208 + }, + { + "epoch": 0.7378490790255496, + "grad_norm": 0.4416054033779062, + "learning_rate": 1.5102495060960422e-05, + "loss": 0.8464, + "num_tokens": 25951621816.0, + "step": 6209 + }, + { + "epoch": 0.7379679144385026, + "grad_norm": 0.4359368679886778, + "learning_rate": 1.5100953422543429e-05, + "loss": 0.8553, + "num_tokens": 25955810570.0, + "step": 6210 + }, + { + "epoch": 0.7380867498514557, + "grad_norm": 0.4137508891336207, + "learning_rate": 1.5099411632254265e-05, + "loss": 0.8523, + "num_tokens": 25959999067.0, + "step": 6211 + }, + { + "epoch": 0.7382055852644088, + "grad_norm": 0.35177049203555794, + "learning_rate": 1.5097869690150026e-05, + "loss": 0.8219, + "num_tokens": 25964159824.0, + "step": 6212 + }, + { + "epoch": 0.7383244206773618, + "grad_norm": 0.3755807030306009, + "learning_rate": 1.5096327596287817e-05, + "loss": 0.8514, + "num_tokens": 25968348499.0, + "step": 6213 + }, + { + "epoch": 0.7384432560903149, + "grad_norm": 0.3419773093131672, + "learning_rate": 1.5094785350724751e-05, + "loss": 0.8747, + "num_tokens": 25972536867.0, + "step": 6214 + }, + { + "epoch": 0.738562091503268, + "grad_norm": 0.39833777391604436, + "learning_rate": 1.5093242953517938e-05, + "loss": 0.8993, + "num_tokens": 25976724888.0, + "step": 6215 + }, + { + "epoch": 0.738680926916221, + "grad_norm": 0.46287862582465866, + "learning_rate": 1.5091700404724494e-05, + "loss": 0.868, + "num_tokens": 25980914066.0, + "step": 6216 + }, + { + "epoch": 0.7387997623291741, + "grad_norm": 0.36338799497712987, + "learning_rate": 1.5090157704401556e-05, + "loss": 0.8422, + "num_tokens": 25985100929.0, + "step": 6217 + }, + { + "epoch": 0.7389185977421272, + "grad_norm": 0.3744345045833504, + "learning_rate": 1.5088614852606243e-05, + "loss": 0.8566, + "num_tokens": 25989268374.0, + "step": 6218 + }, + { + "epoch": 0.7390374331550802, + "grad_norm": 0.3453853039574081, + "learning_rate": 1.50870718493957e-05, + "loss": 0.8664, + "num_tokens": 25993438684.0, + "step": 6219 + }, + { + "epoch": 0.7391562685680333, + "grad_norm": 0.42751841779709576, + "learning_rate": 1.508552869482707e-05, + "loss": 0.8377, + "num_tokens": 25997627952.0, + "step": 6220 + }, + { + "epoch": 0.7392751039809863, + "grad_norm": 0.4451578442181954, + "learning_rate": 1.5083985388957496e-05, + "loss": 0.843, + "num_tokens": 26001817835.0, + "step": 6221 + }, + { + "epoch": 0.7393939393939394, + "grad_norm": 0.37968952035360426, + "learning_rate": 1.5082441931844136e-05, + "loss": 0.837, + "num_tokens": 26006005986.0, + "step": 6222 + }, + { + "epoch": 0.7395127748068925, + "grad_norm": 0.5772341719940622, + "learning_rate": 1.5080898323544149e-05, + "loss": 0.841, + "num_tokens": 26010164782.0, + "step": 6223 + }, + { + "epoch": 0.7396316102198455, + "grad_norm": 0.38498303430908987, + "learning_rate": 1.5079354564114696e-05, + "loss": 0.8343, + "num_tokens": 26014353120.0, + "step": 6224 + }, + { + "epoch": 0.7397504456327986, + "grad_norm": 0.5225102990992366, + "learning_rate": 1.5077810653612954e-05, + "loss": 0.8312, + "num_tokens": 26018505380.0, + "step": 6225 + }, + { + "epoch": 0.7398692810457517, + "grad_norm": 0.4101654164446481, + "learning_rate": 1.5076266592096097e-05, + "loss": 0.8409, + "num_tokens": 26022667252.0, + "step": 6226 + }, + { + "epoch": 0.7399881164587047, + "grad_norm": 0.42983358152689916, + "learning_rate": 1.5074722379621305e-05, + "loss": 0.8501, + "num_tokens": 26026856931.0, + "step": 6227 + }, + { + "epoch": 0.7401069518716578, + "grad_norm": 0.4354249026498161, + "learning_rate": 1.5073178016245766e-05, + "loss": 0.841, + "num_tokens": 26031046748.0, + "step": 6228 + }, + { + "epoch": 0.7402257872846109, + "grad_norm": 0.5072416707378067, + "learning_rate": 1.507163350202667e-05, + "loss": 0.8412, + "num_tokens": 26035236994.0, + "step": 6229 + }, + { + "epoch": 0.7403446226975638, + "grad_norm": 0.32087678522973273, + "learning_rate": 1.5070088837021227e-05, + "loss": 0.8343, + "num_tokens": 26039372457.0, + "step": 6230 + }, + { + "epoch": 0.740463458110517, + "grad_norm": 0.5969866927945925, + "learning_rate": 1.5068544021286626e-05, + "loss": 0.8296, + "num_tokens": 26043546308.0, + "step": 6231 + }, + { + "epoch": 0.74058229352347, + "grad_norm": 0.422118014224548, + "learning_rate": 1.5066999054880085e-05, + "loss": 0.8639, + "num_tokens": 26047721046.0, + "step": 6232 + }, + { + "epoch": 0.740701128936423, + "grad_norm": 0.4899064941141253, + "learning_rate": 1.5065453937858818e-05, + "loss": 0.8192, + "num_tokens": 26051889850.0, + "step": 6233 + }, + { + "epoch": 0.7408199643493761, + "grad_norm": 0.5068569683289654, + "learning_rate": 1.5063908670280048e-05, + "loss": 0.868, + "num_tokens": 26056079752.0, + "step": 6234 + }, + { + "epoch": 0.7409387997623291, + "grad_norm": 0.4347033226927081, + "learning_rate": 1.5062363252201e-05, + "loss": 0.8509, + "num_tokens": 26060267810.0, + "step": 6235 + }, + { + "epoch": 0.7410576351752822, + "grad_norm": 0.49168485304553394, + "learning_rate": 1.5060817683678903e-05, + "loss": 0.8549, + "num_tokens": 26064455903.0, + "step": 6236 + }, + { + "epoch": 0.7411764705882353, + "grad_norm": 0.38031738796924486, + "learning_rate": 1.5059271964770999e-05, + "loss": 0.8397, + "num_tokens": 26068621448.0, + "step": 6237 + }, + { + "epoch": 0.7412953060011883, + "grad_norm": 0.46059262993883776, + "learning_rate": 1.5057726095534528e-05, + "loss": 0.8246, + "num_tokens": 26072810097.0, + "step": 6238 + }, + { + "epoch": 0.7414141414141414, + "grad_norm": 0.4400029409998685, + "learning_rate": 1.505618007602674e-05, + "loss": 0.8231, + "num_tokens": 26077000971.0, + "step": 6239 + }, + { + "epoch": 0.7415329768270945, + "grad_norm": 0.41674340265373455, + "learning_rate": 1.5054633906304893e-05, + "loss": 0.8221, + "num_tokens": 26081167181.0, + "step": 6240 + }, + { + "epoch": 0.7416518122400475, + "grad_norm": 0.5373799523444356, + "learning_rate": 1.505308758642624e-05, + "loss": 0.862, + "num_tokens": 26085347665.0, + "step": 6241 + }, + { + "epoch": 0.7417706476530006, + "grad_norm": 0.3568499099439976, + "learning_rate": 1.5051541116448052e-05, + "loss": 0.8731, + "num_tokens": 26089536369.0, + "step": 6242 + }, + { + "epoch": 0.7418894830659537, + "grad_norm": 0.45929120378133526, + "learning_rate": 1.5049994496427598e-05, + "loss": 0.8296, + "num_tokens": 26093700310.0, + "step": 6243 + }, + { + "epoch": 0.7420083184789067, + "grad_norm": 0.48969155131532616, + "learning_rate": 1.504844772642215e-05, + "loss": 0.829, + "num_tokens": 26097864740.0, + "step": 6244 + }, + { + "epoch": 0.7421271538918598, + "grad_norm": 0.446314431809873, + "learning_rate": 1.5046900806488997e-05, + "loss": 0.8603, + "num_tokens": 26102054679.0, + "step": 6245 + }, + { + "epoch": 0.7422459893048128, + "grad_norm": 0.45405775986456826, + "learning_rate": 1.5045353736685427e-05, + "loss": 0.8801, + "num_tokens": 26106243775.0, + "step": 6246 + }, + { + "epoch": 0.7423648247177659, + "grad_norm": 0.37348481561512226, + "learning_rate": 1.5043806517068728e-05, + "loss": 0.8515, + "num_tokens": 26110418361.0, + "step": 6247 + }, + { + "epoch": 0.742483660130719, + "grad_norm": 0.38939270614759736, + "learning_rate": 1.5042259147696201e-05, + "loss": 0.8527, + "num_tokens": 26114588200.0, + "step": 6248 + }, + { + "epoch": 0.742602495543672, + "grad_norm": 0.39380383359095084, + "learning_rate": 1.5040711628625153e-05, + "loss": 0.8488, + "num_tokens": 26118756076.0, + "step": 6249 + }, + { + "epoch": 0.7427213309566251, + "grad_norm": 0.409483100274146, + "learning_rate": 1.503916395991289e-05, + "loss": 0.8811, + "num_tokens": 26122925132.0, + "step": 6250 + }, + { + "epoch": 0.7428401663695782, + "grad_norm": 0.40741757716767196, + "learning_rate": 1.5037616141616726e-05, + "loss": 0.8767, + "num_tokens": 26127114602.0, + "step": 6251 + }, + { + "epoch": 0.7429590017825312, + "grad_norm": 0.4374069804338262, + "learning_rate": 1.5036068173793987e-05, + "loss": 0.8456, + "num_tokens": 26131298407.0, + "step": 6252 + }, + { + "epoch": 0.7430778371954843, + "grad_norm": 0.4968777011664683, + "learning_rate": 1.5034520056502e-05, + "loss": 0.8659, + "num_tokens": 26135488421.0, + "step": 6253 + }, + { + "epoch": 0.7431966726084374, + "grad_norm": 0.35847729648332427, + "learning_rate": 1.503297178979809e-05, + "loss": 0.8393, + "num_tokens": 26139678797.0, + "step": 6254 + }, + { + "epoch": 0.7433155080213903, + "grad_norm": 0.401613933880225, + "learning_rate": 1.50314233737396e-05, + "loss": 0.8751, + "num_tokens": 26143868357.0, + "step": 6255 + }, + { + "epoch": 0.7434343434343434, + "grad_norm": 0.4447679101314857, + "learning_rate": 1.5029874808383873e-05, + "loss": 0.8213, + "num_tokens": 26148041562.0, + "step": 6256 + }, + { + "epoch": 0.7435531788472965, + "grad_norm": 0.4017935743094179, + "learning_rate": 1.5028326093788254e-05, + "loss": 0.816, + "num_tokens": 26152196996.0, + "step": 6257 + }, + { + "epoch": 0.7436720142602495, + "grad_norm": 0.48070085232000903, + "learning_rate": 1.5026777230010105e-05, + "loss": 0.8603, + "num_tokens": 26156385202.0, + "step": 6258 + }, + { + "epoch": 0.7437908496732026, + "grad_norm": 0.3913679233859262, + "learning_rate": 1.5025228217106775e-05, + "loss": 0.8627, + "num_tokens": 26160554100.0, + "step": 6259 + }, + { + "epoch": 0.7439096850861556, + "grad_norm": 0.41638423547861103, + "learning_rate": 1.5023679055135635e-05, + "loss": 0.8409, + "num_tokens": 26164685583.0, + "step": 6260 + }, + { + "epoch": 0.7440285204991087, + "grad_norm": 0.4649334260472028, + "learning_rate": 1.502212974415406e-05, + "loss": 0.857, + "num_tokens": 26168876360.0, + "step": 6261 + }, + { + "epoch": 0.7441473559120618, + "grad_norm": 0.4652426946761826, + "learning_rate": 1.5020580284219417e-05, + "loss": 0.8567, + "num_tokens": 26173065117.0, + "step": 6262 + }, + { + "epoch": 0.7442661913250148, + "grad_norm": 0.3939392430910968, + "learning_rate": 1.5019030675389093e-05, + "loss": 0.8613, + "num_tokens": 26177256370.0, + "step": 6263 + }, + { + "epoch": 0.7443850267379679, + "grad_norm": 0.35290718737285204, + "learning_rate": 1.5017480917720474e-05, + "loss": 0.8367, + "num_tokens": 26181446551.0, + "step": 6264 + }, + { + "epoch": 0.744503862150921, + "grad_norm": 0.47406410924924686, + "learning_rate": 1.5015931011270954e-05, + "loss": 0.8326, + "num_tokens": 26185629545.0, + "step": 6265 + }, + { + "epoch": 0.744622697563874, + "grad_norm": 0.3497472928433021, + "learning_rate": 1.5014380956097931e-05, + "loss": 0.9023, + "num_tokens": 26189819295.0, + "step": 6266 + }, + { + "epoch": 0.7447415329768271, + "grad_norm": 0.3972896786253354, + "learning_rate": 1.5012830752258807e-05, + "loss": 0.8722, + "num_tokens": 26194009345.0, + "step": 6267 + }, + { + "epoch": 0.7448603683897802, + "grad_norm": 0.3539303921461141, + "learning_rate": 1.5011280399810993e-05, + "loss": 0.8383, + "num_tokens": 26198199527.0, + "step": 6268 + }, + { + "epoch": 0.7449792038027332, + "grad_norm": 0.470555767779036, + "learning_rate": 1.5009729898811904e-05, + "loss": 0.8771, + "num_tokens": 26202387949.0, + "step": 6269 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.4253521714413721, + "learning_rate": 1.500817924931896e-05, + "loss": 0.8314, + "num_tokens": 26206556859.0, + "step": 6270 + }, + { + "epoch": 0.7452168746286393, + "grad_norm": 0.40866320808639384, + "learning_rate": 1.5006628451389586e-05, + "loss": 0.8004, + "num_tokens": 26210745000.0, + "step": 6271 + }, + { + "epoch": 0.7453357100415924, + "grad_norm": 0.4155555348920881, + "learning_rate": 1.5005077505081213e-05, + "loss": 0.7849, + "num_tokens": 26214906566.0, + "step": 6272 + }, + { + "epoch": 0.7454545454545455, + "grad_norm": 0.3878189130524315, + "learning_rate": 1.5003526410451277e-05, + "loss": 0.8153, + "num_tokens": 26219037945.0, + "step": 6273 + }, + { + "epoch": 0.7455733808674985, + "grad_norm": 0.3779687805977761, + "learning_rate": 1.5001975167557225e-05, + "loss": 0.8767, + "num_tokens": 26223226087.0, + "step": 6274 + }, + { + "epoch": 0.7456922162804516, + "grad_norm": 0.4652352362382831, + "learning_rate": 1.5000423776456501e-05, + "loss": 0.8529, + "num_tokens": 26227416758.0, + "step": 6275 + }, + { + "epoch": 0.7458110516934047, + "grad_norm": 0.38597583848493844, + "learning_rate": 1.4998872237206557e-05, + "loss": 0.8356, + "num_tokens": 26231550181.0, + "step": 6276 + }, + { + "epoch": 0.7459298871063577, + "grad_norm": 0.4783982991618834, + "learning_rate": 1.4997320549864856e-05, + "loss": 0.8715, + "num_tokens": 26235739877.0, + "step": 6277 + }, + { + "epoch": 0.7460487225193108, + "grad_norm": 0.45864149495454065, + "learning_rate": 1.4995768714488856e-05, + "loss": 0.862, + "num_tokens": 26239910615.0, + "step": 6278 + }, + { + "epoch": 0.7461675579322639, + "grad_norm": 0.3730579432185562, + "learning_rate": 1.4994216731136033e-05, + "loss": 0.8455, + "num_tokens": 26244097486.0, + "step": 6279 + }, + { + "epoch": 0.7462863933452168, + "grad_norm": 0.36416429315524523, + "learning_rate": 1.4992664599863857e-05, + "loss": 0.804, + "num_tokens": 26248284968.0, + "step": 6280 + }, + { + "epoch": 0.74640522875817, + "grad_norm": 0.4405199093289368, + "learning_rate": 1.4991112320729816e-05, + "loss": 0.8485, + "num_tokens": 26252475115.0, + "step": 6281 + }, + { + "epoch": 0.746524064171123, + "grad_norm": 0.38549841840774174, + "learning_rate": 1.498955989379139e-05, + "loss": 0.8195, + "num_tokens": 26256639931.0, + "step": 6282 + }, + { + "epoch": 0.746642899584076, + "grad_norm": 0.4095063701352908, + "learning_rate": 1.498800731910607e-05, + "loss": 0.8503, + "num_tokens": 26260827730.0, + "step": 6283 + }, + { + "epoch": 0.7467617349970291, + "grad_norm": 0.43667335119111494, + "learning_rate": 1.4986454596731354e-05, + "loss": 0.8445, + "num_tokens": 26265017059.0, + "step": 6284 + }, + { + "epoch": 0.7468805704099821, + "grad_norm": 0.4704445713785284, + "learning_rate": 1.4984901726724752e-05, + "loss": 0.7984, + "num_tokens": 26269184416.0, + "step": 6285 + }, + { + "epoch": 0.7469994058229352, + "grad_norm": 0.3779816459909547, + "learning_rate": 1.498334870914376e-05, + "loss": 0.8614, + "num_tokens": 26273372848.0, + "step": 6286 + }, + { + "epoch": 0.7471182412358883, + "grad_norm": 0.371584684417775, + "learning_rate": 1.4981795544045903e-05, + "loss": 0.8545, + "num_tokens": 26277560074.0, + "step": 6287 + }, + { + "epoch": 0.7472370766488413, + "grad_norm": 0.37178742806543535, + "learning_rate": 1.498024223148869e-05, + "loss": 0.8431, + "num_tokens": 26281748926.0, + "step": 6288 + }, + { + "epoch": 0.7473559120617944, + "grad_norm": 0.3439214316652807, + "learning_rate": 1.4978688771529648e-05, + "loss": 0.8263, + "num_tokens": 26285938896.0, + "step": 6289 + }, + { + "epoch": 0.7474747474747475, + "grad_norm": 0.4306452693371339, + "learning_rate": 1.497713516422631e-05, + "loss": 0.863, + "num_tokens": 26290120785.0, + "step": 6290 + }, + { + "epoch": 0.7475935828877005, + "grad_norm": 0.3550984693230134, + "learning_rate": 1.4975581409636213e-05, + "loss": 0.8325, + "num_tokens": 26294262812.0, + "step": 6291 + }, + { + "epoch": 0.7477124183006536, + "grad_norm": 0.3988412156138144, + "learning_rate": 1.497402750781689e-05, + "loss": 0.8854, + "num_tokens": 26298421453.0, + "step": 6292 + }, + { + "epoch": 0.7478312537136067, + "grad_norm": 0.3697086177673317, + "learning_rate": 1.4972473458825896e-05, + "loss": 0.8245, + "num_tokens": 26302606834.0, + "step": 6293 + }, + { + "epoch": 0.7479500891265597, + "grad_norm": 0.36768503234627486, + "learning_rate": 1.4970919262720775e-05, + "loss": 0.854, + "num_tokens": 26306796258.0, + "step": 6294 + }, + { + "epoch": 0.7480689245395128, + "grad_norm": 0.40589932631184783, + "learning_rate": 1.496936491955909e-05, + "loss": 0.8813, + "num_tokens": 26310985402.0, + "step": 6295 + }, + { + "epoch": 0.7481877599524658, + "grad_norm": 0.40817058060124656, + "learning_rate": 1.4967810429398402e-05, + "loss": 0.8357, + "num_tokens": 26315170149.0, + "step": 6296 + }, + { + "epoch": 0.7483065953654189, + "grad_norm": 0.3824184679473018, + "learning_rate": 1.4966255792296275e-05, + "loss": 0.8722, + "num_tokens": 26319359903.0, + "step": 6297 + }, + { + "epoch": 0.748425430778372, + "grad_norm": 0.34999814695103865, + "learning_rate": 1.496470100831029e-05, + "loss": 0.8131, + "num_tokens": 26323523628.0, + "step": 6298 + }, + { + "epoch": 0.748544266191325, + "grad_norm": 0.40986311367728945, + "learning_rate": 1.4963146077498019e-05, + "loss": 0.8449, + "num_tokens": 26327713480.0, + "step": 6299 + }, + { + "epoch": 0.7486631016042781, + "grad_norm": 0.4284832613734374, + "learning_rate": 1.4961590999917051e-05, + "loss": 0.8731, + "num_tokens": 26331902498.0, + "step": 6300 + }, + { + "epoch": 0.7487819370172312, + "grad_norm": 0.4345980834990612, + "learning_rate": 1.4960035775624971e-05, + "loss": 0.847, + "num_tokens": 26336091714.0, + "step": 6301 + }, + { + "epoch": 0.7489007724301842, + "grad_norm": 0.4518699205501761, + "learning_rate": 1.4958480404679377e-05, + "loss": 0.8572, + "num_tokens": 26340278482.0, + "step": 6302 + }, + { + "epoch": 0.7490196078431373, + "grad_norm": 0.3154233000983645, + "learning_rate": 1.4956924887137872e-05, + "loss": 0.8443, + "num_tokens": 26344436162.0, + "step": 6303 + }, + { + "epoch": 0.7491384432560904, + "grad_norm": 0.5247129569261888, + "learning_rate": 1.495536922305806e-05, + "loss": 0.8127, + "num_tokens": 26348613297.0, + "step": 6304 + }, + { + "epoch": 0.7492572786690433, + "grad_norm": 0.39542503691616343, + "learning_rate": 1.4953813412497549e-05, + "loss": 0.8653, + "num_tokens": 26352801358.0, + "step": 6305 + }, + { + "epoch": 0.7493761140819964, + "grad_norm": 0.42041692705465816, + "learning_rate": 1.4952257455513961e-05, + "loss": 0.8171, + "num_tokens": 26356988636.0, + "step": 6306 + }, + { + "epoch": 0.7494949494949495, + "grad_norm": 0.4561735893798781, + "learning_rate": 1.4950701352164919e-05, + "loss": 0.8653, + "num_tokens": 26361162012.0, + "step": 6307 + }, + { + "epoch": 0.7496137849079025, + "grad_norm": 0.418107384553874, + "learning_rate": 1.4949145102508043e-05, + "loss": 0.8529, + "num_tokens": 26365314403.0, + "step": 6308 + }, + { + "epoch": 0.7497326203208556, + "grad_norm": 0.38326002288705957, + "learning_rate": 1.4947588706600974e-05, + "loss": 0.8054, + "num_tokens": 26369476424.0, + "step": 6309 + }, + { + "epoch": 0.7498514557338086, + "grad_norm": 0.45435842099629054, + "learning_rate": 1.494603216450135e-05, + "loss": 0.857, + "num_tokens": 26373634251.0, + "step": 6310 + }, + { + "epoch": 0.7499702911467617, + "grad_norm": 0.37879385512918673, + "learning_rate": 1.4944475476266811e-05, + "loss": 0.8473, + "num_tokens": 26377823566.0, + "step": 6311 + }, + { + "epoch": 0.7500891265597148, + "grad_norm": 0.3736565788097585, + "learning_rate": 1.4942918641955012e-05, + "loss": 0.8434, + "num_tokens": 26382013389.0, + "step": 6312 + }, + { + "epoch": 0.7502079619726678, + "grad_norm": 0.4219356255223545, + "learning_rate": 1.49413616616236e-05, + "loss": 0.8486, + "num_tokens": 26386202182.0, + "step": 6313 + }, + { + "epoch": 0.7503267973856209, + "grad_norm": 0.47391085938545185, + "learning_rate": 1.493980453533024e-05, + "loss": 0.8341, + "num_tokens": 26390391368.0, + "step": 6314 + }, + { + "epoch": 0.750445632798574, + "grad_norm": 0.44104478314010026, + "learning_rate": 1.4938247263132597e-05, + "loss": 0.8476, + "num_tokens": 26394580834.0, + "step": 6315 + }, + { + "epoch": 0.750564468211527, + "grad_norm": 0.3809150749118277, + "learning_rate": 1.4936689845088346e-05, + "loss": 0.8366, + "num_tokens": 26398770741.0, + "step": 6316 + }, + { + "epoch": 0.7506833036244801, + "grad_norm": 0.3909027901257456, + "learning_rate": 1.4935132281255159e-05, + "loss": 0.8462, + "num_tokens": 26402961413.0, + "step": 6317 + }, + { + "epoch": 0.7508021390374332, + "grad_norm": 0.4411668770377427, + "learning_rate": 1.4933574571690716e-05, + "loss": 0.8147, + "num_tokens": 26407141103.0, + "step": 6318 + }, + { + "epoch": 0.7509209744503862, + "grad_norm": 0.39650320769614794, + "learning_rate": 1.4932016716452706e-05, + "loss": 0.8588, + "num_tokens": 26411328870.0, + "step": 6319 + }, + { + "epoch": 0.7510398098633393, + "grad_norm": 0.44150469945871984, + "learning_rate": 1.4930458715598825e-05, + "loss": 0.822, + "num_tokens": 26415518350.0, + "step": 6320 + }, + { + "epoch": 0.7511586452762923, + "grad_norm": 0.4323475078263544, + "learning_rate": 1.4928900569186771e-05, + "loss": 0.866, + "num_tokens": 26419682812.0, + "step": 6321 + }, + { + "epoch": 0.7512774806892454, + "grad_norm": 0.38047499665659074, + "learning_rate": 1.492734227727424e-05, + "loss": 0.8847, + "num_tokens": 26423859508.0, + "step": 6322 + }, + { + "epoch": 0.7513963161021985, + "grad_norm": 0.38889228369934997, + "learning_rate": 1.4925783839918948e-05, + "loss": 0.8447, + "num_tokens": 26428031066.0, + "step": 6323 + }, + { + "epoch": 0.7515151515151515, + "grad_norm": 0.3964781496641262, + "learning_rate": 1.4924225257178607e-05, + "loss": 0.8487, + "num_tokens": 26432219353.0, + "step": 6324 + }, + { + "epoch": 0.7516339869281046, + "grad_norm": 0.4497208284110039, + "learning_rate": 1.4922666529110937e-05, + "loss": 0.8497, + "num_tokens": 26436407836.0, + "step": 6325 + }, + { + "epoch": 0.7517528223410577, + "grad_norm": 0.3600270615912766, + "learning_rate": 1.4921107655773666e-05, + "loss": 0.8468, + "num_tokens": 26440597353.0, + "step": 6326 + }, + { + "epoch": 0.7518716577540107, + "grad_norm": 0.3776787837348374, + "learning_rate": 1.4919548637224512e-05, + "loss": 0.8208, + "num_tokens": 26444786655.0, + "step": 6327 + }, + { + "epoch": 0.7519904931669638, + "grad_norm": 0.39257895699780004, + "learning_rate": 1.4917989473521225e-05, + "loss": 0.8282, + "num_tokens": 26448953862.0, + "step": 6328 + }, + { + "epoch": 0.7521093285799169, + "grad_norm": 0.4269871663449839, + "learning_rate": 1.4916430164721542e-05, + "loss": 0.8576, + "num_tokens": 26453116444.0, + "step": 6329 + }, + { + "epoch": 0.7522281639928698, + "grad_norm": 0.524727356180469, + "learning_rate": 1.4914870710883205e-05, + "loss": 0.8263, + "num_tokens": 26457305109.0, + "step": 6330 + }, + { + "epoch": 0.752346999405823, + "grad_norm": 0.36326729346479925, + "learning_rate": 1.491331111206397e-05, + "loss": 0.8221, + "num_tokens": 26461495381.0, + "step": 6331 + }, + { + "epoch": 0.752465834818776, + "grad_norm": 0.6006072079689017, + "learning_rate": 1.4911751368321597e-05, + "loss": 0.8037, + "num_tokens": 26465685551.0, + "step": 6332 + }, + { + "epoch": 0.752584670231729, + "grad_norm": 0.4210045478129358, + "learning_rate": 1.491019147971384e-05, + "loss": 0.8483, + "num_tokens": 26469859655.0, + "step": 6333 + }, + { + "epoch": 0.7527035056446821, + "grad_norm": 0.5073433848980756, + "learning_rate": 1.4908631446298473e-05, + "loss": 0.8199, + "num_tokens": 26474013103.0, + "step": 6334 + }, + { + "epoch": 0.7528223410576351, + "grad_norm": 0.4630387747514807, + "learning_rate": 1.4907071268133273e-05, + "loss": 0.8405, + "num_tokens": 26478173221.0, + "step": 6335 + }, + { + "epoch": 0.7529411764705882, + "grad_norm": 0.44423186457325026, + "learning_rate": 1.490551094527601e-05, + "loss": 0.858, + "num_tokens": 26482361109.0, + "step": 6336 + }, + { + "epoch": 0.7530600118835413, + "grad_norm": 0.4791891511477553, + "learning_rate": 1.490395047778447e-05, + "loss": 0.8278, + "num_tokens": 26486550463.0, + "step": 6337 + }, + { + "epoch": 0.7531788472964943, + "grad_norm": 0.4770822627135803, + "learning_rate": 1.4902389865716444e-05, + "loss": 0.821, + "num_tokens": 26490692310.0, + "step": 6338 + }, + { + "epoch": 0.7532976827094474, + "grad_norm": 0.43819413767598037, + "learning_rate": 1.4900829109129728e-05, + "loss": 0.9003, + "num_tokens": 26494878988.0, + "step": 6339 + }, + { + "epoch": 0.7534165181224005, + "grad_norm": 0.37878350869409105, + "learning_rate": 1.4899268208082125e-05, + "loss": 0.844, + "num_tokens": 26499031257.0, + "step": 6340 + }, + { + "epoch": 0.7535353535353535, + "grad_norm": 0.5349199658593072, + "learning_rate": 1.4897707162631433e-05, + "loss": 0.8563, + "num_tokens": 26503220148.0, + "step": 6341 + }, + { + "epoch": 0.7536541889483066, + "grad_norm": 0.43578740744530287, + "learning_rate": 1.4896145972835467e-05, + "loss": 0.8414, + "num_tokens": 26507409765.0, + "step": 6342 + }, + { + "epoch": 0.7537730243612597, + "grad_norm": 0.42106504786191284, + "learning_rate": 1.489458463875204e-05, + "loss": 0.8782, + "num_tokens": 26511596770.0, + "step": 6343 + }, + { + "epoch": 0.7538918597742127, + "grad_norm": 0.3805601172635056, + "learning_rate": 1.489302316043898e-05, + "loss": 0.8353, + "num_tokens": 26515785042.0, + "step": 6344 + }, + { + "epoch": 0.7540106951871658, + "grad_norm": 0.4471058864873158, + "learning_rate": 1.4891461537954109e-05, + "loss": 0.8904, + "num_tokens": 26519973780.0, + "step": 6345 + }, + { + "epoch": 0.7541295306001188, + "grad_norm": 0.42827542794339024, + "learning_rate": 1.4889899771355256e-05, + "loss": 0.8266, + "num_tokens": 26524164015.0, + "step": 6346 + }, + { + "epoch": 0.7542483660130719, + "grad_norm": 0.4439957473127288, + "learning_rate": 1.4888337860700268e-05, + "loss": 0.858, + "num_tokens": 26528353907.0, + "step": 6347 + }, + { + "epoch": 0.754367201426025, + "grad_norm": 0.38075974332953916, + "learning_rate": 1.4886775806046977e-05, + "loss": 0.8677, + "num_tokens": 26532530111.0, + "step": 6348 + }, + { + "epoch": 0.754486036838978, + "grad_norm": 0.35167381335178, + "learning_rate": 1.4885213607453239e-05, + "loss": 0.8306, + "num_tokens": 26536717657.0, + "step": 6349 + }, + { + "epoch": 0.7546048722519311, + "grad_norm": 0.4224219515250358, + "learning_rate": 1.4883651264976906e-05, + "loss": 0.8802, + "num_tokens": 26540888059.0, + "step": 6350 + }, + { + "epoch": 0.7547237076648842, + "grad_norm": 0.4896775819116644, + "learning_rate": 1.4882088778675833e-05, + "loss": 0.842, + "num_tokens": 26545062847.0, + "step": 6351 + }, + { + "epoch": 0.7548425430778372, + "grad_norm": 0.34423208035646163, + "learning_rate": 1.4880526148607887e-05, + "loss": 0.8121, + "num_tokens": 26549251199.0, + "step": 6352 + }, + { + "epoch": 0.7549613784907903, + "grad_norm": 0.41102903088631926, + "learning_rate": 1.4878963374830939e-05, + "loss": 0.8587, + "num_tokens": 26553439719.0, + "step": 6353 + }, + { + "epoch": 0.7550802139037434, + "grad_norm": 0.42143114980453683, + "learning_rate": 1.4877400457402861e-05, + "loss": 0.8337, + "num_tokens": 26557629673.0, + "step": 6354 + }, + { + "epoch": 0.7551990493166963, + "grad_norm": 0.41644689030777227, + "learning_rate": 1.4875837396381541e-05, + "loss": 0.8211, + "num_tokens": 26561818926.0, + "step": 6355 + }, + { + "epoch": 0.7553178847296494, + "grad_norm": 0.3534371791716715, + "learning_rate": 1.4874274191824848e-05, + "loss": 0.8678, + "num_tokens": 26566006987.0, + "step": 6356 + }, + { + "epoch": 0.7554367201426025, + "grad_norm": 0.48643235049094846, + "learning_rate": 1.4872710843790684e-05, + "loss": 0.8525, + "num_tokens": 26570196495.0, + "step": 6357 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.42862211850594856, + "learning_rate": 1.487114735233695e-05, + "loss": 0.8259, + "num_tokens": 26574387128.0, + "step": 6358 + }, + { + "epoch": 0.7556743909685086, + "grad_norm": 0.4038683732329919, + "learning_rate": 1.4869583717521534e-05, + "loss": 0.864, + "num_tokens": 26578576468.0, + "step": 6359 + }, + { + "epoch": 0.7557932263814616, + "grad_norm": 0.43919452492590144, + "learning_rate": 1.4868019939402354e-05, + "loss": 0.8645, + "num_tokens": 26582765195.0, + "step": 6360 + }, + { + "epoch": 0.7559120617944147, + "grad_norm": 0.45098089629146954, + "learning_rate": 1.4866456018037316e-05, + "loss": 0.8354, + "num_tokens": 26586940072.0, + "step": 6361 + }, + { + "epoch": 0.7560308972073678, + "grad_norm": 0.44846361287468506, + "learning_rate": 1.4864891953484336e-05, + "loss": 0.8482, + "num_tokens": 26591100425.0, + "step": 6362 + }, + { + "epoch": 0.7561497326203208, + "grad_norm": 0.4433857113698739, + "learning_rate": 1.4863327745801344e-05, + "loss": 0.8184, + "num_tokens": 26595289552.0, + "step": 6363 + }, + { + "epoch": 0.7562685680332739, + "grad_norm": 0.35750868457257573, + "learning_rate": 1.486176339504626e-05, + "loss": 0.8265, + "num_tokens": 26599479004.0, + "step": 6364 + }, + { + "epoch": 0.756387403446227, + "grad_norm": 0.45475640901067316, + "learning_rate": 1.486019890127702e-05, + "loss": 0.8431, + "num_tokens": 26603668291.0, + "step": 6365 + }, + { + "epoch": 0.75650623885918, + "grad_norm": 0.3860686537930509, + "learning_rate": 1.4858634264551568e-05, + "loss": 0.844, + "num_tokens": 26607856392.0, + "step": 6366 + }, + { + "epoch": 0.7566250742721331, + "grad_norm": 0.5437392899915389, + "learning_rate": 1.485706948492784e-05, + "loss": 0.8651, + "num_tokens": 26612045153.0, + "step": 6367 + }, + { + "epoch": 0.7567439096850862, + "grad_norm": 0.3532636207963436, + "learning_rate": 1.4855504562463787e-05, + "loss": 0.8751, + "num_tokens": 26616200169.0, + "step": 6368 + }, + { + "epoch": 0.7568627450980392, + "grad_norm": 0.4614976603646318, + "learning_rate": 1.4853939497217366e-05, + "loss": 0.8716, + "num_tokens": 26620387141.0, + "step": 6369 + }, + { + "epoch": 0.7569815805109923, + "grad_norm": 0.4256479637431022, + "learning_rate": 1.4852374289246535e-05, + "loss": 0.8202, + "num_tokens": 26624577923.0, + "step": 6370 + }, + { + "epoch": 0.7571004159239453, + "grad_norm": 0.4477462913296172, + "learning_rate": 1.4850808938609257e-05, + "loss": 0.8098, + "num_tokens": 26628768413.0, + "step": 6371 + }, + { + "epoch": 0.7572192513368984, + "grad_norm": 0.419076626870723, + "learning_rate": 1.4849243445363504e-05, + "loss": 0.8373, + "num_tokens": 26632930974.0, + "step": 6372 + }, + { + "epoch": 0.7573380867498515, + "grad_norm": 0.3698863549611442, + "learning_rate": 1.4847677809567254e-05, + "loss": 0.8747, + "num_tokens": 26637109488.0, + "step": 6373 + }, + { + "epoch": 0.7574569221628045, + "grad_norm": 0.4228311398155363, + "learning_rate": 1.4846112031278483e-05, + "loss": 0.8458, + "num_tokens": 26641298632.0, + "step": 6374 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.3543845215692461, + "learning_rate": 1.4844546110555178e-05, + "loss": 0.8129, + "num_tokens": 26645488099.0, + "step": 6375 + }, + { + "epoch": 0.7576945929887107, + "grad_norm": 0.45600832212284237, + "learning_rate": 1.4842980047455338e-05, + "loss": 0.898, + "num_tokens": 26649611930.0, + "step": 6376 + }, + { + "epoch": 0.7578134284016637, + "grad_norm": 0.3711723222254995, + "learning_rate": 1.4841413842036948e-05, + "loss": 0.8562, + "num_tokens": 26653784408.0, + "step": 6377 + }, + { + "epoch": 0.7579322638146168, + "grad_norm": 0.3813617000590831, + "learning_rate": 1.4839847494358017e-05, + "loss": 0.8826, + "num_tokens": 26657972075.0, + "step": 6378 + }, + { + "epoch": 0.7580510992275699, + "grad_norm": 0.4401955262203052, + "learning_rate": 1.4838281004476552e-05, + "loss": 0.8497, + "num_tokens": 26662129408.0, + "step": 6379 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 0.3936030557221315, + "learning_rate": 1.483671437245056e-05, + "loss": 0.822, + "num_tokens": 26666314528.0, + "step": 6380 + }, + { + "epoch": 0.758288770053476, + "grad_norm": 0.46170429408820385, + "learning_rate": 1.4835147598338065e-05, + "loss": 0.8381, + "num_tokens": 26670503981.0, + "step": 6381 + }, + { + "epoch": 0.758407605466429, + "grad_norm": 0.42863970205987545, + "learning_rate": 1.4833580682197087e-05, + "loss": 0.8034, + "num_tokens": 26674692205.0, + "step": 6382 + }, + { + "epoch": 0.758526440879382, + "grad_norm": 0.37174644273092367, + "learning_rate": 1.4832013624085654e-05, + "loss": 0.8192, + "num_tokens": 26678881619.0, + "step": 6383 + }, + { + "epoch": 0.7586452762923351, + "grad_norm": 0.39876921491166856, + "learning_rate": 1.4830446424061802e-05, + "loss": 0.8204, + "num_tokens": 26683071983.0, + "step": 6384 + }, + { + "epoch": 0.7587641117052881, + "grad_norm": 0.4941315291382736, + "learning_rate": 1.4828879082183567e-05, + "loss": 0.8232, + "num_tokens": 26687261017.0, + "step": 6385 + }, + { + "epoch": 0.7588829471182412, + "grad_norm": 0.44177135887161745, + "learning_rate": 1.4827311598508996e-05, + "loss": 0.8643, + "num_tokens": 26691450952.0, + "step": 6386 + }, + { + "epoch": 0.7590017825311943, + "grad_norm": 0.36192054964847725, + "learning_rate": 1.4825743973096131e-05, + "loss": 0.8508, + "num_tokens": 26695627841.0, + "step": 6387 + }, + { + "epoch": 0.7591206179441473, + "grad_norm": 0.4320989096531791, + "learning_rate": 1.4824176206003033e-05, + "loss": 0.8231, + "num_tokens": 26699818277.0, + "step": 6388 + }, + { + "epoch": 0.7592394533571004, + "grad_norm": 0.5103508154452464, + "learning_rate": 1.4822608297287764e-05, + "loss": 0.8396, + "num_tokens": 26704007588.0, + "step": 6389 + }, + { + "epoch": 0.7593582887700535, + "grad_norm": 0.4059664503405553, + "learning_rate": 1.4821040247008382e-05, + "loss": 0.829, + "num_tokens": 26708197673.0, + "step": 6390 + }, + { + "epoch": 0.7594771241830065, + "grad_norm": 0.3940001510566987, + "learning_rate": 1.4819472055222963e-05, + "loss": 0.8186, + "num_tokens": 26712358261.0, + "step": 6391 + }, + { + "epoch": 0.7595959595959596, + "grad_norm": 0.4747803829806767, + "learning_rate": 1.4817903721989577e-05, + "loss": 0.9126, + "num_tokens": 26716547031.0, + "step": 6392 + }, + { + "epoch": 0.7597147950089127, + "grad_norm": 0.35986969872065977, + "learning_rate": 1.4816335247366309e-05, + "loss": 0.8356, + "num_tokens": 26720712003.0, + "step": 6393 + }, + { + "epoch": 0.7598336304218657, + "grad_norm": 0.4207441104880656, + "learning_rate": 1.4814766631411244e-05, + "loss": 0.8578, + "num_tokens": 26724901358.0, + "step": 6394 + }, + { + "epoch": 0.7599524658348188, + "grad_norm": 0.40563254037952745, + "learning_rate": 1.4813197874182472e-05, + "loss": 0.8479, + "num_tokens": 26729063932.0, + "step": 6395 + }, + { + "epoch": 0.7600713012477719, + "grad_norm": 0.44520552352381576, + "learning_rate": 1.4811628975738088e-05, + "loss": 0.871, + "num_tokens": 26733250828.0, + "step": 6396 + }, + { + "epoch": 0.7601901366607249, + "grad_norm": 0.4076623635223797, + "learning_rate": 1.4810059936136199e-05, + "loss": 0.8561, + "num_tokens": 26737440145.0, + "step": 6397 + }, + { + "epoch": 0.760308972073678, + "grad_norm": 0.40772130276510665, + "learning_rate": 1.4808490755434906e-05, + "loss": 0.8471, + "num_tokens": 26741627967.0, + "step": 6398 + }, + { + "epoch": 0.760427807486631, + "grad_norm": 0.45021614818655087, + "learning_rate": 1.4806921433692326e-05, + "loss": 0.8519, + "num_tokens": 26745816858.0, + "step": 6399 + }, + { + "epoch": 0.7605466428995841, + "grad_norm": 0.4180448750092575, + "learning_rate": 1.4805351970966571e-05, + "loss": 0.8524, + "num_tokens": 26750006528.0, + "step": 6400 + }, + { + "epoch": 0.7606654783125372, + "grad_norm": 0.5016583729325992, + "learning_rate": 1.480378236731577e-05, + "loss": 0.8647, + "num_tokens": 26754164854.0, + "step": 6401 + }, + { + "epoch": 0.7607843137254902, + "grad_norm": 0.3691780533516331, + "learning_rate": 1.4802212622798042e-05, + "loss": 0.8277, + "num_tokens": 26758352811.0, + "step": 6402 + }, + { + "epoch": 0.7609031491384433, + "grad_norm": 0.39896397385841254, + "learning_rate": 1.4800642737471524e-05, + "loss": 0.8348, + "num_tokens": 26762527259.0, + "step": 6403 + }, + { + "epoch": 0.7610219845513964, + "grad_norm": 0.418696513556785, + "learning_rate": 1.4799072711394362e-05, + "loss": 0.8413, + "num_tokens": 26766712081.0, + "step": 6404 + }, + { + "epoch": 0.7611408199643493, + "grad_norm": 0.4957619712818804, + "learning_rate": 1.4797502544624691e-05, + "loss": 0.84, + "num_tokens": 26770899582.0, + "step": 6405 + }, + { + "epoch": 0.7612596553773024, + "grad_norm": 0.37207185391466446, + "learning_rate": 1.4795932237220658e-05, + "loss": 0.8275, + "num_tokens": 26775088364.0, + "step": 6406 + }, + { + "epoch": 0.7613784907902555, + "grad_norm": 0.42440123615223163, + "learning_rate": 1.4794361789240421e-05, + "loss": 0.8317, + "num_tokens": 26779240885.0, + "step": 6407 + }, + { + "epoch": 0.7614973262032085, + "grad_norm": 0.4684816134900474, + "learning_rate": 1.4792791200742138e-05, + "loss": 0.8119, + "num_tokens": 26783410279.0, + "step": 6408 + }, + { + "epoch": 0.7616161616161616, + "grad_norm": 0.4197567882697852, + "learning_rate": 1.4791220471783973e-05, + "loss": 0.8784, + "num_tokens": 26787556274.0, + "step": 6409 + }, + { + "epoch": 0.7617349970291146, + "grad_norm": 0.4625346272776139, + "learning_rate": 1.4789649602424097e-05, + "loss": 0.8237, + "num_tokens": 26791745311.0, + "step": 6410 + }, + { + "epoch": 0.7618538324420677, + "grad_norm": 0.486436704551806, + "learning_rate": 1.4788078592720681e-05, + "loss": 0.7941, + "num_tokens": 26795934939.0, + "step": 6411 + }, + { + "epoch": 0.7619726678550208, + "grad_norm": 0.35037509858913446, + "learning_rate": 1.4786507442731906e-05, + "loss": 0.8322, + "num_tokens": 26800123779.0, + "step": 6412 + }, + { + "epoch": 0.7620915032679738, + "grad_norm": 0.5216288111940823, + "learning_rate": 1.4784936152515961e-05, + "loss": 0.8217, + "num_tokens": 26804312677.0, + "step": 6413 + }, + { + "epoch": 0.7622103386809269, + "grad_norm": 0.3499526075518159, + "learning_rate": 1.478336472213103e-05, + "loss": 0.8447, + "num_tokens": 26808446456.0, + "step": 6414 + }, + { + "epoch": 0.76232917409388, + "grad_norm": 0.4189687364097932, + "learning_rate": 1.4781793151635317e-05, + "loss": 0.8601, + "num_tokens": 26812571882.0, + "step": 6415 + }, + { + "epoch": 0.762448009506833, + "grad_norm": 0.5324815507574441, + "learning_rate": 1.478022144108701e-05, + "loss": 0.8297, + "num_tokens": 26816760848.0, + "step": 6416 + }, + { + "epoch": 0.7625668449197861, + "grad_norm": 0.3807025884231604, + "learning_rate": 1.4778649590544327e-05, + "loss": 0.8749, + "num_tokens": 26820946216.0, + "step": 6417 + }, + { + "epoch": 0.7626856803327392, + "grad_norm": 0.45821509002504557, + "learning_rate": 1.4777077600065472e-05, + "loss": 0.8399, + "num_tokens": 26825136516.0, + "step": 6418 + }, + { + "epoch": 0.7628045157456922, + "grad_norm": 0.42816797944938223, + "learning_rate": 1.4775505469708661e-05, + "loss": 0.854, + "num_tokens": 26829325222.0, + "step": 6419 + }, + { + "epoch": 0.7629233511586453, + "grad_norm": 0.432089603562448, + "learning_rate": 1.477393319953212e-05, + "loss": 0.861, + "num_tokens": 26833483083.0, + "step": 6420 + }, + { + "epoch": 0.7630421865715984, + "grad_norm": 0.45171794630922574, + "learning_rate": 1.4772360789594073e-05, + "loss": 0.8832, + "num_tokens": 26837672317.0, + "step": 6421 + }, + { + "epoch": 0.7631610219845514, + "grad_norm": 0.3790025010985003, + "learning_rate": 1.4770788239952746e-05, + "loss": 0.8076, + "num_tokens": 26841862457.0, + "step": 6422 + }, + { + "epoch": 0.7632798573975045, + "grad_norm": 0.4567111962966669, + "learning_rate": 1.4769215550666387e-05, + "loss": 0.8215, + "num_tokens": 26846026196.0, + "step": 6423 + }, + { + "epoch": 0.7633986928104575, + "grad_norm": 0.37684963221924234, + "learning_rate": 1.476764272179323e-05, + "loss": 0.8497, + "num_tokens": 26850216244.0, + "step": 6424 + }, + { + "epoch": 0.7635175282234106, + "grad_norm": 0.44089008525727, + "learning_rate": 1.4766069753391523e-05, + "loss": 0.8819, + "num_tokens": 26854385964.0, + "step": 6425 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 0.46168860053456084, + "learning_rate": 1.4764496645519521e-05, + "loss": 0.802, + "num_tokens": 26858555392.0, + "step": 6426 + }, + { + "epoch": 0.7637551990493167, + "grad_norm": 0.404666837971493, + "learning_rate": 1.4762923398235483e-05, + "loss": 0.8633, + "num_tokens": 26862743333.0, + "step": 6427 + }, + { + "epoch": 0.7638740344622698, + "grad_norm": 0.5181681720014696, + "learning_rate": 1.4761350011597665e-05, + "loss": 0.8644, + "num_tokens": 26866931812.0, + "step": 6428 + }, + { + "epoch": 0.7639928698752229, + "grad_norm": 0.3550405458693615, + "learning_rate": 1.4759776485664336e-05, + "loss": 0.8258, + "num_tokens": 26871120434.0, + "step": 6429 + }, + { + "epoch": 0.7641117052881758, + "grad_norm": 0.4974161657807691, + "learning_rate": 1.475820282049378e-05, + "loss": 0.8352, + "num_tokens": 26875308977.0, + "step": 6430 + }, + { + "epoch": 0.764230540701129, + "grad_norm": 0.39611682409367366, + "learning_rate": 1.4756629016144261e-05, + "loss": 0.8434, + "num_tokens": 26879488395.0, + "step": 6431 + }, + { + "epoch": 0.764349376114082, + "grad_norm": 0.4944030965515475, + "learning_rate": 1.4755055072674067e-05, + "loss": 0.8521, + "num_tokens": 26883645431.0, + "step": 6432 + }, + { + "epoch": 0.764468211527035, + "grad_norm": 0.4864529077893427, + "learning_rate": 1.475348099014149e-05, + "loss": 0.8554, + "num_tokens": 26887835682.0, + "step": 6433 + }, + { + "epoch": 0.7645870469399881, + "grad_norm": 0.3785887681410762, + "learning_rate": 1.4751906768604821e-05, + "loss": 0.8341, + "num_tokens": 26892016589.0, + "step": 6434 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.5500548577628042, + "learning_rate": 1.4750332408122359e-05, + "loss": 0.8549, + "num_tokens": 26896181689.0, + "step": 6435 + }, + { + "epoch": 0.7648247177658942, + "grad_norm": 0.39709088579744783, + "learning_rate": 1.4748757908752409e-05, + "loss": 0.8157, + "num_tokens": 26900370683.0, + "step": 6436 + }, + { + "epoch": 0.7649435531788473, + "grad_norm": 0.538084282557398, + "learning_rate": 1.4747183270553277e-05, + "loss": 0.8593, + "num_tokens": 26904556132.0, + "step": 6437 + }, + { + "epoch": 0.7650623885918003, + "grad_norm": 0.443171622846117, + "learning_rate": 1.4745608493583278e-05, + "loss": 0.837, + "num_tokens": 26908723630.0, + "step": 6438 + }, + { + "epoch": 0.7651812240047534, + "grad_norm": 0.41917887195350956, + "learning_rate": 1.4744033577900734e-05, + "loss": 0.834, + "num_tokens": 26912913033.0, + "step": 6439 + }, + { + "epoch": 0.7653000594177065, + "grad_norm": 0.45076873094720543, + "learning_rate": 1.4742458523563967e-05, + "loss": 0.8442, + "num_tokens": 26917102050.0, + "step": 6440 + }, + { + "epoch": 0.7654188948306595, + "grad_norm": 0.48065834859373063, + "learning_rate": 1.4740883330631308e-05, + "loss": 0.8593, + "num_tokens": 26921268625.0, + "step": 6441 + }, + { + "epoch": 0.7655377302436126, + "grad_norm": 0.3732769402622096, + "learning_rate": 1.473930799916109e-05, + "loss": 0.8575, + "num_tokens": 26925430874.0, + "step": 6442 + }, + { + "epoch": 0.7656565656565657, + "grad_norm": 0.5045932780716006, + "learning_rate": 1.4737732529211652e-05, + "loss": 0.8448, + "num_tokens": 26929619200.0, + "step": 6443 + }, + { + "epoch": 0.7657754010695187, + "grad_norm": 0.42383049050710775, + "learning_rate": 1.4736156920841346e-05, + "loss": 0.8211, + "num_tokens": 26933809291.0, + "step": 6444 + }, + { + "epoch": 0.7658942364824718, + "grad_norm": 0.43115800684369066, + "learning_rate": 1.4734581174108513e-05, + "loss": 0.843, + "num_tokens": 26937998743.0, + "step": 6445 + }, + { + "epoch": 0.7660130718954249, + "grad_norm": 0.396759177748673, + "learning_rate": 1.4733005289071516e-05, + "loss": 0.8327, + "num_tokens": 26942178044.0, + "step": 6446 + }, + { + "epoch": 0.7661319073083779, + "grad_norm": 0.45634924166631774, + "learning_rate": 1.4731429265788709e-05, + "loss": 0.8173, + "num_tokens": 26946367156.0, + "step": 6447 + }, + { + "epoch": 0.766250742721331, + "grad_norm": 0.44614904797826793, + "learning_rate": 1.472985310431846e-05, + "loss": 0.8167, + "num_tokens": 26950555509.0, + "step": 6448 + }, + { + "epoch": 0.766369578134284, + "grad_norm": 0.4608541284824987, + "learning_rate": 1.4728276804719137e-05, + "loss": 0.8421, + "num_tokens": 26954692618.0, + "step": 6449 + }, + { + "epoch": 0.7664884135472371, + "grad_norm": 0.4418544480735715, + "learning_rate": 1.4726700367049123e-05, + "loss": 0.876, + "num_tokens": 26958882012.0, + "step": 6450 + }, + { + "epoch": 0.7666072489601902, + "grad_norm": 0.41854831767155626, + "learning_rate": 1.4725123791366791e-05, + "loss": 0.8446, + "num_tokens": 26963069766.0, + "step": 6451 + }, + { + "epoch": 0.7667260843731432, + "grad_norm": 0.3885007650799635, + "learning_rate": 1.472354707773053e-05, + "loss": 0.8562, + "num_tokens": 26967260335.0, + "step": 6452 + }, + { + "epoch": 0.7668449197860963, + "grad_norm": 0.43550633516713727, + "learning_rate": 1.4721970226198734e-05, + "loss": 0.8234, + "num_tokens": 26971448749.0, + "step": 6453 + }, + { + "epoch": 0.7669637551990494, + "grad_norm": 0.3927805280711586, + "learning_rate": 1.4720393236829793e-05, + "loss": 0.8656, + "num_tokens": 26975614147.0, + "step": 6454 + }, + { + "epoch": 0.7670825906120023, + "grad_norm": 0.47422009202518123, + "learning_rate": 1.4718816109682113e-05, + "loss": 0.8528, + "num_tokens": 26979804400.0, + "step": 6455 + }, + { + "epoch": 0.7672014260249554, + "grad_norm": 0.376337090502688, + "learning_rate": 1.47172388448141e-05, + "loss": 0.8826, + "num_tokens": 26983989190.0, + "step": 6456 + }, + { + "epoch": 0.7673202614379085, + "grad_norm": 0.38514572548061987, + "learning_rate": 1.4715661442284164e-05, + "loss": 0.8335, + "num_tokens": 26988178163.0, + "step": 6457 + }, + { + "epoch": 0.7674390968508615, + "grad_norm": 0.3775208415847839, + "learning_rate": 1.471408390215072e-05, + "loss": 0.8196, + "num_tokens": 26992343184.0, + "step": 6458 + }, + { + "epoch": 0.7675579322638146, + "grad_norm": 0.42054687047882705, + "learning_rate": 1.4712506224472194e-05, + "loss": 0.8465, + "num_tokens": 26996531153.0, + "step": 6459 + }, + { + "epoch": 0.7676767676767676, + "grad_norm": 0.35611998294240266, + "learning_rate": 1.4710928409307008e-05, + "loss": 0.8393, + "num_tokens": 27000720210.0, + "step": 6460 + }, + { + "epoch": 0.7677956030897207, + "grad_norm": 0.3647403758165619, + "learning_rate": 1.4709350456713594e-05, + "loss": 0.8721, + "num_tokens": 27004908658.0, + "step": 6461 + }, + { + "epoch": 0.7679144385026738, + "grad_norm": 0.385544932899041, + "learning_rate": 1.4707772366750397e-05, + "loss": 0.8568, + "num_tokens": 27009094335.0, + "step": 6462 + }, + { + "epoch": 0.7680332739156268, + "grad_norm": 0.37641058424372337, + "learning_rate": 1.4706194139475849e-05, + "loss": 0.878, + "num_tokens": 27013283574.0, + "step": 6463 + }, + { + "epoch": 0.7681521093285799, + "grad_norm": 0.35972924439019144, + "learning_rate": 1.4704615774948401e-05, + "loss": 0.8281, + "num_tokens": 27017472627.0, + "step": 6464 + }, + { + "epoch": 0.768270944741533, + "grad_norm": 0.41079296879406524, + "learning_rate": 1.4703037273226506e-05, + "loss": 0.846, + "num_tokens": 27021660270.0, + "step": 6465 + }, + { + "epoch": 0.768389780154486, + "grad_norm": 0.43306936956266856, + "learning_rate": 1.4701458634368622e-05, + "loss": 0.842, + "num_tokens": 27025849712.0, + "step": 6466 + }, + { + "epoch": 0.7685086155674391, + "grad_norm": 0.36638276356988886, + "learning_rate": 1.4699879858433207e-05, + "loss": 0.9065, + "num_tokens": 27030038318.0, + "step": 6467 + }, + { + "epoch": 0.7686274509803922, + "grad_norm": 0.40172403209180735, + "learning_rate": 1.4698300945478731e-05, + "loss": 0.8605, + "num_tokens": 27034199742.0, + "step": 6468 + }, + { + "epoch": 0.7687462863933452, + "grad_norm": 0.442751257738247, + "learning_rate": 1.469672189556367e-05, + "loss": 0.8185, + "num_tokens": 27038387692.0, + "step": 6469 + }, + { + "epoch": 0.7688651218062983, + "grad_norm": 0.3441283817534529, + "learning_rate": 1.4695142708746495e-05, + "loss": 0.831, + "num_tokens": 27042575764.0, + "step": 6470 + }, + { + "epoch": 0.7689839572192514, + "grad_norm": 0.38387457649497086, + "learning_rate": 1.4693563385085696e-05, + "loss": 0.8377, + "num_tokens": 27046747571.0, + "step": 6471 + }, + { + "epoch": 0.7691027926322044, + "grad_norm": 0.40132355417425447, + "learning_rate": 1.4691983924639752e-05, + "loss": 0.8495, + "num_tokens": 27050936562.0, + "step": 6472 + }, + { + "epoch": 0.7692216280451575, + "grad_norm": 0.4308863158135536, + "learning_rate": 1.4690404327467163e-05, + "loss": 0.8709, + "num_tokens": 27055123655.0, + "step": 6473 + }, + { + "epoch": 0.7693404634581105, + "grad_norm": 0.41041622442046005, + "learning_rate": 1.4688824593626424e-05, + "loss": 0.8427, + "num_tokens": 27059310368.0, + "step": 6474 + }, + { + "epoch": 0.7694592988710636, + "grad_norm": 0.35915085155399923, + "learning_rate": 1.468724472317604e-05, + "loss": 0.8229, + "num_tokens": 27063461913.0, + "step": 6475 + }, + { + "epoch": 0.7695781342840167, + "grad_norm": 0.4126590656065649, + "learning_rate": 1.4685664716174514e-05, + "loss": 0.8249, + "num_tokens": 27067650362.0, + "step": 6476 + }, + { + "epoch": 0.7696969696969697, + "grad_norm": 0.3873691454158119, + "learning_rate": 1.4684084572680362e-05, + "loss": 0.8236, + "num_tokens": 27071825242.0, + "step": 6477 + }, + { + "epoch": 0.7698158051099228, + "grad_norm": 0.3835540496554763, + "learning_rate": 1.4682504292752103e-05, + "loss": 0.8573, + "num_tokens": 27076015007.0, + "step": 6478 + }, + { + "epoch": 0.7699346405228759, + "grad_norm": 0.39229483844698737, + "learning_rate": 1.468092387644826e-05, + "loss": 0.8429, + "num_tokens": 27080180287.0, + "step": 6479 + }, + { + "epoch": 0.7700534759358288, + "grad_norm": 0.37613977992799935, + "learning_rate": 1.467934332382736e-05, + "loss": 0.8363, + "num_tokens": 27084368701.0, + "step": 6480 + }, + { + "epoch": 0.770172311348782, + "grad_norm": 0.35657323625245385, + "learning_rate": 1.4677762634947937e-05, + "loss": 0.8524, + "num_tokens": 27088514821.0, + "step": 6481 + }, + { + "epoch": 0.770291146761735, + "grad_norm": 0.4563327689146122, + "learning_rate": 1.4676181809868526e-05, + "loss": 0.8627, + "num_tokens": 27092683911.0, + "step": 6482 + }, + { + "epoch": 0.770409982174688, + "grad_norm": 0.523563870727551, + "learning_rate": 1.4674600848647675e-05, + "loss": 0.8592, + "num_tokens": 27096873030.0, + "step": 6483 + }, + { + "epoch": 0.7705288175876411, + "grad_norm": 0.3779659181653852, + "learning_rate": 1.467301975134393e-05, + "loss": 0.7846, + "num_tokens": 27101036259.0, + "step": 6484 + }, + { + "epoch": 0.7706476530005941, + "grad_norm": 0.4011539461088373, + "learning_rate": 1.4671438518015845e-05, + "loss": 0.8169, + "num_tokens": 27105225183.0, + "step": 6485 + }, + { + "epoch": 0.7707664884135472, + "grad_norm": 0.4577592892791449, + "learning_rate": 1.466985714872198e-05, + "loss": 0.8201, + "num_tokens": 27109361699.0, + "step": 6486 + }, + { + "epoch": 0.7708853238265003, + "grad_norm": 0.4552161816465717, + "learning_rate": 1.4668275643520894e-05, + "loss": 0.8334, + "num_tokens": 27113548642.0, + "step": 6487 + }, + { + "epoch": 0.7710041592394533, + "grad_norm": 0.4464849448356153, + "learning_rate": 1.4666694002471163e-05, + "loss": 0.8221, + "num_tokens": 27117738353.0, + "step": 6488 + }, + { + "epoch": 0.7711229946524064, + "grad_norm": 0.43816297134118987, + "learning_rate": 1.4665112225631348e-05, + "loss": 0.8591, + "num_tokens": 27121927323.0, + "step": 6489 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 0.34395887435092476, + "learning_rate": 1.4663530313060042e-05, + "loss": 0.8608, + "num_tokens": 27126115953.0, + "step": 6490 + }, + { + "epoch": 0.7713606654783125, + "grad_norm": 0.44611088124698517, + "learning_rate": 1.4661948264815817e-05, + "loss": 0.8695, + "num_tokens": 27130304766.0, + "step": 6491 + }, + { + "epoch": 0.7714795008912656, + "grad_norm": 0.5166656911721128, + "learning_rate": 1.466036608095727e-05, + "loss": 0.8252, + "num_tokens": 27134495188.0, + "step": 6492 + }, + { + "epoch": 0.7715983363042187, + "grad_norm": 0.39991318333925113, + "learning_rate": 1.4658783761542992e-05, + "loss": 0.8424, + "num_tokens": 27138683738.0, + "step": 6493 + }, + { + "epoch": 0.7717171717171717, + "grad_norm": 0.4336141051244509, + "learning_rate": 1.4657201306631577e-05, + "loss": 0.8418, + "num_tokens": 27142873556.0, + "step": 6494 + }, + { + "epoch": 0.7718360071301248, + "grad_norm": 0.4637400694094737, + "learning_rate": 1.4655618716281635e-05, + "loss": 0.8387, + "num_tokens": 27147052576.0, + "step": 6495 + }, + { + "epoch": 0.7719548425430779, + "grad_norm": 0.451632355420188, + "learning_rate": 1.465403599055177e-05, + "loss": 0.81, + "num_tokens": 27151220585.0, + "step": 6496 + }, + { + "epoch": 0.7720736779560309, + "grad_norm": 0.4036534743405039, + "learning_rate": 1.4652453129500596e-05, + "loss": 0.8473, + "num_tokens": 27155408828.0, + "step": 6497 + }, + { + "epoch": 0.772192513368984, + "grad_norm": 0.40701271155939, + "learning_rate": 1.4650870133186736e-05, + "loss": 0.7966, + "num_tokens": 27159572501.0, + "step": 6498 + }, + { + "epoch": 0.772311348781937, + "grad_norm": 0.3934486990214456, + "learning_rate": 1.4649287001668812e-05, + "loss": 0.8705, + "num_tokens": 27163761130.0, + "step": 6499 + }, + { + "epoch": 0.7724301841948901, + "grad_norm": 0.449138136811493, + "learning_rate": 1.4647703735005449e-05, + "loss": 0.8427, + "num_tokens": 27167951547.0, + "step": 6500 + }, + { + "epoch": 0.7725490196078432, + "grad_norm": 0.39476240222788794, + "learning_rate": 1.4646120333255287e-05, + "loss": 0.8618, + "num_tokens": 27172113357.0, + "step": 6501 + }, + { + "epoch": 0.7726678550207962, + "grad_norm": 0.5043587197616053, + "learning_rate": 1.4644536796476958e-05, + "loss": 0.8209, + "num_tokens": 27176273200.0, + "step": 6502 + }, + { + "epoch": 0.7727866904337493, + "grad_norm": 0.4014190569574623, + "learning_rate": 1.4642953124729108e-05, + "loss": 0.8569, + "num_tokens": 27180462506.0, + "step": 6503 + }, + { + "epoch": 0.7729055258467024, + "grad_norm": 0.4025485781321484, + "learning_rate": 1.4641369318070393e-05, + "loss": 0.8407, + "num_tokens": 27184650198.0, + "step": 6504 + }, + { + "epoch": 0.7730243612596553, + "grad_norm": 0.4259081328791368, + "learning_rate": 1.4639785376559455e-05, + "loss": 0.8349, + "num_tokens": 27188839935.0, + "step": 6505 + }, + { + "epoch": 0.7731431966726084, + "grad_norm": 0.4019352270901771, + "learning_rate": 1.463820130025496e-05, + "loss": 0.8297, + "num_tokens": 27192988541.0, + "step": 6506 + }, + { + "epoch": 0.7732620320855615, + "grad_norm": 0.4337357643137729, + "learning_rate": 1.463661708921557e-05, + "loss": 0.8367, + "num_tokens": 27197179624.0, + "step": 6507 + }, + { + "epoch": 0.7733808674985145, + "grad_norm": 0.3944707735503222, + "learning_rate": 1.4635032743499951e-05, + "loss": 0.8605, + "num_tokens": 27201349819.0, + "step": 6508 + }, + { + "epoch": 0.7734997029114676, + "grad_norm": 0.3660175496403648, + "learning_rate": 1.4633448263166783e-05, + "loss": 0.8058, + "num_tokens": 27205539129.0, + "step": 6509 + }, + { + "epoch": 0.7736185383244206, + "grad_norm": 0.3407739704325629, + "learning_rate": 1.463186364827474e-05, + "loss": 0.8492, + "num_tokens": 27209705881.0, + "step": 6510 + }, + { + "epoch": 0.7737373737373737, + "grad_norm": 0.43947748416388016, + "learning_rate": 1.4630278898882505e-05, + "loss": 0.8266, + "num_tokens": 27213870858.0, + "step": 6511 + }, + { + "epoch": 0.7738562091503268, + "grad_norm": 0.4189426913392985, + "learning_rate": 1.4628694015048765e-05, + "loss": 0.8112, + "num_tokens": 27218021108.0, + "step": 6512 + }, + { + "epoch": 0.7739750445632798, + "grad_norm": 0.3501801892813375, + "learning_rate": 1.4627108996832223e-05, + "loss": 0.8603, + "num_tokens": 27222209082.0, + "step": 6513 + }, + { + "epoch": 0.7740938799762329, + "grad_norm": 0.4609649960692616, + "learning_rate": 1.462552384429157e-05, + "loss": 0.8405, + "num_tokens": 27226398617.0, + "step": 6514 + }, + { + "epoch": 0.774212715389186, + "grad_norm": 0.3712970469968871, + "learning_rate": 1.4623938557485506e-05, + "loss": 0.8339, + "num_tokens": 27230587512.0, + "step": 6515 + }, + { + "epoch": 0.774331550802139, + "grad_norm": 0.4437366305880733, + "learning_rate": 1.462235313647275e-05, + "loss": 0.8366, + "num_tokens": 27234735854.0, + "step": 6516 + }, + { + "epoch": 0.7744503862150921, + "grad_norm": 0.4763381056086087, + "learning_rate": 1.4620767581312007e-05, + "loss": 0.8418, + "num_tokens": 27238882764.0, + "step": 6517 + }, + { + "epoch": 0.7745692216280452, + "grad_norm": 0.32591072556389905, + "learning_rate": 1.4619181892061998e-05, + "loss": 0.8533, + "num_tokens": 27243072600.0, + "step": 6518 + }, + { + "epoch": 0.7746880570409982, + "grad_norm": 0.45996926761517787, + "learning_rate": 1.461759606878145e-05, + "loss": 0.8385, + "num_tokens": 27247246879.0, + "step": 6519 + }, + { + "epoch": 0.7748068924539513, + "grad_norm": 0.4012458773164493, + "learning_rate": 1.4616010111529085e-05, + "loss": 0.8476, + "num_tokens": 27251436023.0, + "step": 6520 + }, + { + "epoch": 0.7749257278669044, + "grad_norm": 0.3995152391082205, + "learning_rate": 1.4614424020363639e-05, + "loss": 0.85, + "num_tokens": 27255625066.0, + "step": 6521 + }, + { + "epoch": 0.7750445632798574, + "grad_norm": 0.4263162969989566, + "learning_rate": 1.4612837795343852e-05, + "loss": 0.8477, + "num_tokens": 27259813103.0, + "step": 6522 + }, + { + "epoch": 0.7751633986928105, + "grad_norm": 0.3563087508421509, + "learning_rate": 1.4611251436528466e-05, + "loss": 0.832, + "num_tokens": 27263996035.0, + "step": 6523 + }, + { + "epoch": 0.7752822341057635, + "grad_norm": 0.4379264502339362, + "learning_rate": 1.4609664943976229e-05, + "loss": 0.8588, + "num_tokens": 27268167121.0, + "step": 6524 + }, + { + "epoch": 0.7754010695187166, + "grad_norm": 0.36779491822657284, + "learning_rate": 1.4608078317745897e-05, + "loss": 0.8244, + "num_tokens": 27272323358.0, + "step": 6525 + }, + { + "epoch": 0.7755199049316697, + "grad_norm": 0.40764144132568847, + "learning_rate": 1.4606491557896223e-05, + "loss": 0.8584, + "num_tokens": 27276513610.0, + "step": 6526 + }, + { + "epoch": 0.7756387403446227, + "grad_norm": 0.4046551708325777, + "learning_rate": 1.4604904664485976e-05, + "loss": 0.8163, + "num_tokens": 27280703635.0, + "step": 6527 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 0.4982680415675082, + "learning_rate": 1.4603317637573916e-05, + "loss": 0.8266, + "num_tokens": 27284892566.0, + "step": 6528 + }, + { + "epoch": 0.7758764111705289, + "grad_norm": 0.379954248078305, + "learning_rate": 1.4601730477218825e-05, + "loss": 0.8515, + "num_tokens": 27289082257.0, + "step": 6529 + }, + { + "epoch": 0.7759952465834818, + "grad_norm": 0.3982078233345502, + "learning_rate": 1.4600143183479478e-05, + "loss": 0.8312, + "num_tokens": 27293271847.0, + "step": 6530 + }, + { + "epoch": 0.776114081996435, + "grad_norm": 0.4293853584199946, + "learning_rate": 1.459855575641465e-05, + "loss": 0.8225, + "num_tokens": 27297432501.0, + "step": 6531 + }, + { + "epoch": 0.776232917409388, + "grad_norm": 0.35825752627043034, + "learning_rate": 1.4596968196083138e-05, + "loss": 0.8256, + "num_tokens": 27301621302.0, + "step": 6532 + }, + { + "epoch": 0.776351752822341, + "grad_norm": 0.4464120757560783, + "learning_rate": 1.4595380502543737e-05, + "loss": 0.8511, + "num_tokens": 27305808970.0, + "step": 6533 + }, + { + "epoch": 0.7764705882352941, + "grad_norm": 0.420456760902499, + "learning_rate": 1.4593792675855234e-05, + "loss": 0.8528, + "num_tokens": 27309999385.0, + "step": 6534 + }, + { + "epoch": 0.7765894236482471, + "grad_norm": 0.38589064291846353, + "learning_rate": 1.459220471607644e-05, + "loss": 0.8248, + "num_tokens": 27314175949.0, + "step": 6535 + }, + { + "epoch": 0.7767082590612002, + "grad_norm": 0.4459575935741617, + "learning_rate": 1.4590616623266158e-05, + "loss": 0.8194, + "num_tokens": 27318365636.0, + "step": 6536 + }, + { + "epoch": 0.7768270944741533, + "grad_norm": 0.4116399012770746, + "learning_rate": 1.4589028397483203e-05, + "loss": 0.8517, + "num_tokens": 27322553443.0, + "step": 6537 + }, + { + "epoch": 0.7769459298871063, + "grad_norm": 0.4365111745033485, + "learning_rate": 1.458744003878639e-05, + "loss": 0.8036, + "num_tokens": 27326742519.0, + "step": 6538 + }, + { + "epoch": 0.7770647653000594, + "grad_norm": 0.46683674163454536, + "learning_rate": 1.4585851547234546e-05, + "loss": 0.9039, + "num_tokens": 27330921265.0, + "step": 6539 + }, + { + "epoch": 0.7771836007130125, + "grad_norm": 0.3797336737021534, + "learning_rate": 1.4584262922886494e-05, + "loss": 0.8722, + "num_tokens": 27335108709.0, + "step": 6540 + }, + { + "epoch": 0.7773024361259655, + "grad_norm": 0.46755583404667445, + "learning_rate": 1.4582674165801067e-05, + "loss": 0.8381, + "num_tokens": 27339297564.0, + "step": 6541 + }, + { + "epoch": 0.7774212715389186, + "grad_norm": 0.40076109288763057, + "learning_rate": 1.4581085276037103e-05, + "loss": 0.8076, + "num_tokens": 27343487324.0, + "step": 6542 + }, + { + "epoch": 0.7775401069518717, + "grad_norm": 0.4588576647891427, + "learning_rate": 1.4579496253653442e-05, + "loss": 0.8164, + "num_tokens": 27347677799.0, + "step": 6543 + }, + { + "epoch": 0.7776589423648247, + "grad_norm": 0.4400290680830235, + "learning_rate": 1.4577907098708934e-05, + "loss": 0.8849, + "num_tokens": 27351867494.0, + "step": 6544 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.40819763884753457, + "learning_rate": 1.4576317811262429e-05, + "loss": 0.8495, + "num_tokens": 27356040133.0, + "step": 6545 + }, + { + "epoch": 0.7778966131907309, + "grad_norm": 0.43198113882074535, + "learning_rate": 1.457472839137278e-05, + "loss": 0.8535, + "num_tokens": 27360229672.0, + "step": 6546 + }, + { + "epoch": 0.7780154486036839, + "grad_norm": 0.38274172419944014, + "learning_rate": 1.4573138839098854e-05, + "loss": 0.8487, + "num_tokens": 27364394490.0, + "step": 6547 + }, + { + "epoch": 0.778134284016637, + "grad_norm": 0.503114009198013, + "learning_rate": 1.457154915449952e-05, + "loss": 0.8008, + "num_tokens": 27368584298.0, + "step": 6548 + }, + { + "epoch": 0.77825311942959, + "grad_norm": 0.33861408602600906, + "learning_rate": 1.456995933763364e-05, + "loss": 0.7967, + "num_tokens": 27372774563.0, + "step": 6549 + }, + { + "epoch": 0.7783719548425431, + "grad_norm": 0.5406769549917833, + "learning_rate": 1.4568369388560096e-05, + "loss": 0.8169, + "num_tokens": 27376946582.0, + "step": 6550 + }, + { + "epoch": 0.7784907902554962, + "grad_norm": 0.38992340005373755, + "learning_rate": 1.456677930733777e-05, + "loss": 0.8807, + "num_tokens": 27381137449.0, + "step": 6551 + }, + { + "epoch": 0.7786096256684492, + "grad_norm": 0.5259680947135763, + "learning_rate": 1.4565189094025549e-05, + "loss": 0.8241, + "num_tokens": 27385325925.0, + "step": 6552 + }, + { + "epoch": 0.7787284610814023, + "grad_norm": 0.39155536821680825, + "learning_rate": 1.4563598748682317e-05, + "loss": 0.8462, + "num_tokens": 27389515641.0, + "step": 6553 + }, + { + "epoch": 0.7788472964943554, + "grad_norm": 0.5260957072661528, + "learning_rate": 1.4562008271366978e-05, + "loss": 0.8249, + "num_tokens": 27393701376.0, + "step": 6554 + }, + { + "epoch": 0.7789661319073083, + "grad_norm": 0.3706174194586511, + "learning_rate": 1.4560417662138427e-05, + "loss": 0.8426, + "num_tokens": 27397890274.0, + "step": 6555 + }, + { + "epoch": 0.7790849673202614, + "grad_norm": 0.5360677193408909, + "learning_rate": 1.4558826921055573e-05, + "loss": 0.8789, + "num_tokens": 27402001385.0, + "step": 6556 + }, + { + "epoch": 0.7792038027332145, + "grad_norm": 0.4086569365355437, + "learning_rate": 1.4557236048177325e-05, + "loss": 0.8302, + "num_tokens": 27406191377.0, + "step": 6557 + }, + { + "epoch": 0.7793226381461675, + "grad_norm": 0.5045991842042067, + "learning_rate": 1.4555645043562598e-05, + "loss": 0.8379, + "num_tokens": 27410371543.0, + "step": 6558 + }, + { + "epoch": 0.7794414735591206, + "grad_norm": 0.4218969393211183, + "learning_rate": 1.4554053907270314e-05, + "loss": 0.8053, + "num_tokens": 27414555691.0, + "step": 6559 + }, + { + "epoch": 0.7795603089720736, + "grad_norm": 0.5060313711546436, + "learning_rate": 1.4552462639359397e-05, + "loss": 0.8313, + "num_tokens": 27418746830.0, + "step": 6560 + }, + { + "epoch": 0.7796791443850267, + "grad_norm": 0.4261231960969242, + "learning_rate": 1.455087123988878e-05, + "loss": 0.8363, + "num_tokens": 27422929167.0, + "step": 6561 + }, + { + "epoch": 0.7797979797979798, + "grad_norm": 0.5284978148863338, + "learning_rate": 1.4549279708917392e-05, + "loss": 0.8806, + "num_tokens": 27427112180.0, + "step": 6562 + }, + { + "epoch": 0.7799168152109328, + "grad_norm": 0.4259430903585728, + "learning_rate": 1.4547688046504176e-05, + "loss": 0.8358, + "num_tokens": 27431303156.0, + "step": 6563 + }, + { + "epoch": 0.7800356506238859, + "grad_norm": 0.529609951857009, + "learning_rate": 1.4546096252708077e-05, + "loss": 0.8596, + "num_tokens": 27435468999.0, + "step": 6564 + }, + { + "epoch": 0.780154486036839, + "grad_norm": 0.4703570458967984, + "learning_rate": 1.4544504327588048e-05, + "loss": 0.8761, + "num_tokens": 27439600837.0, + "step": 6565 + }, + { + "epoch": 0.780273321449792, + "grad_norm": 0.4554665531990226, + "learning_rate": 1.4542912271203035e-05, + "loss": 0.7846, + "num_tokens": 27443767888.0, + "step": 6566 + }, + { + "epoch": 0.7803921568627451, + "grad_norm": 0.4220316693426265, + "learning_rate": 1.4541320083612e-05, + "loss": 0.8276, + "num_tokens": 27447956745.0, + "step": 6567 + }, + { + "epoch": 0.7805109922756982, + "grad_norm": 0.5293102284830504, + "learning_rate": 1.4539727764873912e-05, + "loss": 0.8524, + "num_tokens": 27452146323.0, + "step": 6568 + }, + { + "epoch": 0.7806298276886512, + "grad_norm": 0.44689965559049066, + "learning_rate": 1.453813531504774e-05, + "loss": 0.8287, + "num_tokens": 27456306223.0, + "step": 6569 + }, + { + "epoch": 0.7807486631016043, + "grad_norm": 0.5085806285599098, + "learning_rate": 1.453654273419245e-05, + "loss": 0.8559, + "num_tokens": 27460495832.0, + "step": 6570 + }, + { + "epoch": 0.7808674985145574, + "grad_norm": 0.4458976851615963, + "learning_rate": 1.4534950022367027e-05, + "loss": 0.8623, + "num_tokens": 27464666840.0, + "step": 6571 + }, + { + "epoch": 0.7809863339275104, + "grad_norm": 0.47542998463841735, + "learning_rate": 1.453335717963045e-05, + "loss": 0.8564, + "num_tokens": 27468855259.0, + "step": 6572 + }, + { + "epoch": 0.7811051693404635, + "grad_norm": 0.42815237235786263, + "learning_rate": 1.4531764206041712e-05, + "loss": 0.853, + "num_tokens": 27473039123.0, + "step": 6573 + }, + { + "epoch": 0.7812240047534165, + "grad_norm": 0.4776665226120822, + "learning_rate": 1.4530171101659808e-05, + "loss": 0.8432, + "num_tokens": 27477228569.0, + "step": 6574 + }, + { + "epoch": 0.7813428401663696, + "grad_norm": 0.4410873160597133, + "learning_rate": 1.4528577866543725e-05, + "loss": 0.8485, + "num_tokens": 27481417314.0, + "step": 6575 + }, + { + "epoch": 0.7814616755793227, + "grad_norm": 0.4381274388084719, + "learning_rate": 1.4526984500752476e-05, + "loss": 0.8144, + "num_tokens": 27485604603.0, + "step": 6576 + }, + { + "epoch": 0.7815805109922757, + "grad_norm": 0.4205385123466237, + "learning_rate": 1.452539100434507e-05, + "loss": 0.8167, + "num_tokens": 27489751409.0, + "step": 6577 + }, + { + "epoch": 0.7816993464052288, + "grad_norm": 0.4789396130951594, + "learning_rate": 1.4523797377380508e-05, + "loss": 0.8567, + "num_tokens": 27493923401.0, + "step": 6578 + }, + { + "epoch": 0.7818181818181819, + "grad_norm": 0.389308035970197, + "learning_rate": 1.4522203619917821e-05, + "loss": 0.8761, + "num_tokens": 27498109244.0, + "step": 6579 + }, + { + "epoch": 0.7819370172311348, + "grad_norm": 0.5020316708972316, + "learning_rate": 1.4520609732016023e-05, + "loss": 0.8441, + "num_tokens": 27502282093.0, + "step": 6580 + }, + { + "epoch": 0.782055852644088, + "grad_norm": 0.42125854803217894, + "learning_rate": 1.4519015713734141e-05, + "loss": 0.8299, + "num_tokens": 27506472436.0, + "step": 6581 + }, + { + "epoch": 0.782174688057041, + "grad_norm": 0.40644636987299415, + "learning_rate": 1.4517421565131214e-05, + "loss": 0.8707, + "num_tokens": 27510631502.0, + "step": 6582 + }, + { + "epoch": 0.782293523469994, + "grad_norm": 0.383921646823124, + "learning_rate": 1.451582728626627e-05, + "loss": 0.84, + "num_tokens": 27514821532.0, + "step": 6583 + }, + { + "epoch": 0.7824123588829471, + "grad_norm": 0.5178697370341929, + "learning_rate": 1.4514232877198355e-05, + "loss": 0.8231, + "num_tokens": 27519010236.0, + "step": 6584 + }, + { + "epoch": 0.7825311942959001, + "grad_norm": 0.40113873785788734, + "learning_rate": 1.4512638337986518e-05, + "loss": 0.8368, + "num_tokens": 27523148630.0, + "step": 6585 + }, + { + "epoch": 0.7826500297088532, + "grad_norm": 0.46824981002573063, + "learning_rate": 1.45110436686898e-05, + "loss": 0.8617, + "num_tokens": 27527335598.0, + "step": 6586 + }, + { + "epoch": 0.7827688651218063, + "grad_norm": 0.4027921723981047, + "learning_rate": 1.4509448869367271e-05, + "loss": 0.8351, + "num_tokens": 27531525230.0, + "step": 6587 + }, + { + "epoch": 0.7828877005347593, + "grad_norm": 0.4744376609677657, + "learning_rate": 1.450785394007798e-05, + "loss": 0.86, + "num_tokens": 27535714199.0, + "step": 6588 + }, + { + "epoch": 0.7830065359477124, + "grad_norm": 0.39191999003078243, + "learning_rate": 1.4506258880881e-05, + "loss": 0.8226, + "num_tokens": 27539904237.0, + "step": 6589 + }, + { + "epoch": 0.7831253713606655, + "grad_norm": 0.47540660069705515, + "learning_rate": 1.4504663691835397e-05, + "loss": 0.836, + "num_tokens": 27544085270.0, + "step": 6590 + }, + { + "epoch": 0.7832442067736185, + "grad_norm": 0.42793864866181225, + "learning_rate": 1.4503068373000246e-05, + "loss": 0.8367, + "num_tokens": 27548244839.0, + "step": 6591 + }, + { + "epoch": 0.7833630421865716, + "grad_norm": 0.4673313485566191, + "learning_rate": 1.4501472924434632e-05, + "loss": 0.831, + "num_tokens": 27552433501.0, + "step": 6592 + }, + { + "epoch": 0.7834818775995247, + "grad_norm": 0.452280345574975, + "learning_rate": 1.4499877346197638e-05, + "loss": 0.8419, + "num_tokens": 27556620282.0, + "step": 6593 + }, + { + "epoch": 0.7836007130124777, + "grad_norm": 0.40887956301386985, + "learning_rate": 1.4498281638348348e-05, + "loss": 0.83, + "num_tokens": 27560800632.0, + "step": 6594 + }, + { + "epoch": 0.7837195484254308, + "grad_norm": 0.3836878028378632, + "learning_rate": 1.4496685800945865e-05, + "loss": 0.8514, + "num_tokens": 27564987489.0, + "step": 6595 + }, + { + "epoch": 0.7838383838383839, + "grad_norm": 0.43918913234780466, + "learning_rate": 1.4495089834049283e-05, + "loss": 0.8486, + "num_tokens": 27569144484.0, + "step": 6596 + }, + { + "epoch": 0.7839572192513369, + "grad_norm": 0.43814117682481296, + "learning_rate": 1.4493493737717708e-05, + "loss": 0.8498, + "num_tokens": 27573333576.0, + "step": 6597 + }, + { + "epoch": 0.78407605466429, + "grad_norm": 0.4228596782678596, + "learning_rate": 1.4491897512010245e-05, + "loss": 0.841, + "num_tokens": 27577523095.0, + "step": 6598 + }, + { + "epoch": 0.784194890077243, + "grad_norm": 0.4624226834154265, + "learning_rate": 1.4490301156986014e-05, + "loss": 0.835, + "num_tokens": 27581686990.0, + "step": 6599 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.38879705188257396, + "learning_rate": 1.4488704672704127e-05, + "loss": 0.8457, + "num_tokens": 27585875413.0, + "step": 6600 + }, + { + "epoch": 0.7844325609031492, + "grad_norm": 0.4189454966939189, + "learning_rate": 1.4487108059223713e-05, + "loss": 0.8312, + "num_tokens": 27590047736.0, + "step": 6601 + }, + { + "epoch": 0.7845513963161022, + "grad_norm": 0.3860441247743138, + "learning_rate": 1.4485511316603898e-05, + "loss": 0.8498, + "num_tokens": 27594236717.0, + "step": 6602 + }, + { + "epoch": 0.7846702317290553, + "grad_norm": 0.38234374951958566, + "learning_rate": 1.4483914444903815e-05, + "loss": 0.8368, + "num_tokens": 27598426144.0, + "step": 6603 + }, + { + "epoch": 0.7847890671420084, + "grad_norm": 0.378428599438179, + "learning_rate": 1.4482317444182594e-05, + "loss": 0.846, + "num_tokens": 27602599478.0, + "step": 6604 + }, + { + "epoch": 0.7849079025549613, + "grad_norm": 0.42567454925837983, + "learning_rate": 1.448072031449939e-05, + "loss": 0.8864, + "num_tokens": 27606763212.0, + "step": 6605 + }, + { + "epoch": 0.7850267379679144, + "grad_norm": 0.4161077986052806, + "learning_rate": 1.4479123055913347e-05, + "loss": 0.8727, + "num_tokens": 27610952321.0, + "step": 6606 + }, + { + "epoch": 0.7851455733808675, + "grad_norm": 0.3909245914988244, + "learning_rate": 1.4477525668483604e-05, + "loss": 0.8712, + "num_tokens": 27615116028.0, + "step": 6607 + }, + { + "epoch": 0.7852644087938205, + "grad_norm": 0.4261226752960143, + "learning_rate": 1.4475928152269339e-05, + "loss": 0.8518, + "num_tokens": 27619281838.0, + "step": 6608 + }, + { + "epoch": 0.7853832442067736, + "grad_norm": 0.3964171776913129, + "learning_rate": 1.4474330507329694e-05, + "loss": 0.8979, + "num_tokens": 27623470311.0, + "step": 6609 + }, + { + "epoch": 0.7855020796197266, + "grad_norm": 0.4640238079962116, + "learning_rate": 1.4472732733723844e-05, + "loss": 0.8598, + "num_tokens": 27627658169.0, + "step": 6610 + }, + { + "epoch": 0.7856209150326797, + "grad_norm": 0.3561002704094915, + "learning_rate": 1.4471134831510964e-05, + "loss": 0.8716, + "num_tokens": 27631847975.0, + "step": 6611 + }, + { + "epoch": 0.7857397504456328, + "grad_norm": 0.38028757857661405, + "learning_rate": 1.4469536800750223e-05, + "loss": 0.8851, + "num_tokens": 27636027318.0, + "step": 6612 + }, + { + "epoch": 0.7858585858585858, + "grad_norm": 0.3752967840715614, + "learning_rate": 1.4467938641500802e-05, + "loss": 0.8667, + "num_tokens": 27640169531.0, + "step": 6613 + }, + { + "epoch": 0.7859774212715389, + "grad_norm": 0.3042983925380835, + "learning_rate": 1.446634035382189e-05, + "loss": 0.8586, + "num_tokens": 27644358155.0, + "step": 6614 + }, + { + "epoch": 0.786096256684492, + "grad_norm": 0.40280373087062243, + "learning_rate": 1.4464741937772673e-05, + "loss": 0.8659, + "num_tokens": 27648545357.0, + "step": 6615 + }, + { + "epoch": 0.786215092097445, + "grad_norm": 0.40503856080604905, + "learning_rate": 1.4463143393412346e-05, + "loss": 0.8523, + "num_tokens": 27652731777.0, + "step": 6616 + }, + { + "epoch": 0.7863339275103981, + "grad_norm": 0.4893024531061611, + "learning_rate": 1.4461544720800114e-05, + "loss": 0.8435, + "num_tokens": 27656919017.0, + "step": 6617 + }, + { + "epoch": 0.7864527629233512, + "grad_norm": 0.40901117826317757, + "learning_rate": 1.4459945919995173e-05, + "loss": 0.8144, + "num_tokens": 27661109397.0, + "step": 6618 + }, + { + "epoch": 0.7865715983363042, + "grad_norm": 0.34511908477124803, + "learning_rate": 1.4458346991056742e-05, + "loss": 0.8334, + "num_tokens": 27665298418.0, + "step": 6619 + }, + { + "epoch": 0.7866904337492573, + "grad_norm": 0.3985472849256425, + "learning_rate": 1.4456747934044022e-05, + "loss": 0.8212, + "num_tokens": 27669488230.0, + "step": 6620 + }, + { + "epoch": 0.7868092691622104, + "grad_norm": 0.38702602759830984, + "learning_rate": 1.4455148749016244e-05, + "loss": 0.846, + "num_tokens": 27673678135.0, + "step": 6621 + }, + { + "epoch": 0.7869281045751634, + "grad_norm": 0.4523717012305875, + "learning_rate": 1.4453549436032623e-05, + "loss": 0.8569, + "num_tokens": 27677867271.0, + "step": 6622 + }, + { + "epoch": 0.7870469399881165, + "grad_norm": 0.4707621997585548, + "learning_rate": 1.4451949995152388e-05, + "loss": 0.8538, + "num_tokens": 27682043108.0, + "step": 6623 + }, + { + "epoch": 0.7871657754010695, + "grad_norm": 0.35225651277463027, + "learning_rate": 1.4450350426434776e-05, + "loss": 0.8574, + "num_tokens": 27686232831.0, + "step": 6624 + }, + { + "epoch": 0.7872846108140226, + "grad_norm": 0.45715120698828204, + "learning_rate": 1.4448750729939022e-05, + "loss": 0.7898, + "num_tokens": 27690422818.0, + "step": 6625 + }, + { + "epoch": 0.7874034462269757, + "grad_norm": 0.34351705743314015, + "learning_rate": 1.4447150905724366e-05, + "loss": 0.8662, + "num_tokens": 27694611762.0, + "step": 6626 + }, + { + "epoch": 0.7875222816399287, + "grad_norm": 0.3948948161416008, + "learning_rate": 1.4445550953850059e-05, + "loss": 0.7993, + "num_tokens": 27698801477.0, + "step": 6627 + }, + { + "epoch": 0.7876411170528818, + "grad_norm": 0.3685275304297446, + "learning_rate": 1.444395087437535e-05, + "loss": 0.8488, + "num_tokens": 27702991493.0, + "step": 6628 + }, + { + "epoch": 0.7877599524658349, + "grad_norm": 0.385662933676602, + "learning_rate": 1.4442350667359494e-05, + "loss": 0.7929, + "num_tokens": 27707165161.0, + "step": 6629 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 0.4231991587034332, + "learning_rate": 1.4440750332861755e-05, + "loss": 0.8129, + "num_tokens": 27711351890.0, + "step": 6630 + }, + { + "epoch": 0.787997623291741, + "grad_norm": 0.38422448901858397, + "learning_rate": 1.4439149870941399e-05, + "loss": 0.8177, + "num_tokens": 27715540193.0, + "step": 6631 + }, + { + "epoch": 0.788116458704694, + "grad_norm": 0.4044150462100598, + "learning_rate": 1.4437549281657694e-05, + "loss": 0.8464, + "num_tokens": 27719730728.0, + "step": 6632 + }, + { + "epoch": 0.788235294117647, + "grad_norm": 0.3259165519218142, + "learning_rate": 1.4435948565069914e-05, + "loss": 0.8539, + "num_tokens": 27723919840.0, + "step": 6633 + }, + { + "epoch": 0.7883541295306001, + "grad_norm": 0.43434568504224214, + "learning_rate": 1.4434347721237347e-05, + "loss": 0.8501, + "num_tokens": 27728108166.0, + "step": 6634 + }, + { + "epoch": 0.7884729649435532, + "grad_norm": 0.40054761397686356, + "learning_rate": 1.4432746750219271e-05, + "loss": 0.8983, + "num_tokens": 27732296545.0, + "step": 6635 + }, + { + "epoch": 0.7885918003565062, + "grad_norm": 0.34895744228340786, + "learning_rate": 1.4431145652074972e-05, + "loss": 0.8573, + "num_tokens": 27736484028.0, + "step": 6636 + }, + { + "epoch": 0.7887106357694593, + "grad_norm": 0.40074225489324067, + "learning_rate": 1.4429544426863753e-05, + "loss": 0.8437, + "num_tokens": 27740672115.0, + "step": 6637 + }, + { + "epoch": 0.7888294711824123, + "grad_norm": 0.4426090787913759, + "learning_rate": 1.4427943074644908e-05, + "loss": 0.8383, + "num_tokens": 27744853529.0, + "step": 6638 + }, + { + "epoch": 0.7889483065953654, + "grad_norm": 0.455201402287119, + "learning_rate": 1.4426341595477738e-05, + "loss": 0.8452, + "num_tokens": 27749042718.0, + "step": 6639 + }, + { + "epoch": 0.7890671420083185, + "grad_norm": 0.37695830595596097, + "learning_rate": 1.4424739989421552e-05, + "loss": 0.8185, + "num_tokens": 27753232060.0, + "step": 6640 + }, + { + "epoch": 0.7891859774212715, + "grad_norm": 0.3466166859748177, + "learning_rate": 1.442313825653567e-05, + "loss": 0.8348, + "num_tokens": 27757407692.0, + "step": 6641 + }, + { + "epoch": 0.7893048128342246, + "grad_norm": 0.41665309872524, + "learning_rate": 1.4421536396879403e-05, + "loss": 0.8184, + "num_tokens": 27761587499.0, + "step": 6642 + }, + { + "epoch": 0.7894236482471777, + "grad_norm": 0.4119024293573083, + "learning_rate": 1.4419934410512074e-05, + "loss": 0.8462, + "num_tokens": 27765775586.0, + "step": 6643 + }, + { + "epoch": 0.7895424836601307, + "grad_norm": 0.46317585435276876, + "learning_rate": 1.4418332297493012e-05, + "loss": 0.8528, + "num_tokens": 27769964330.0, + "step": 6644 + }, + { + "epoch": 0.7896613190730838, + "grad_norm": 0.34346856710862733, + "learning_rate": 1.4416730057881547e-05, + "loss": 0.8149, + "num_tokens": 27774150951.0, + "step": 6645 + }, + { + "epoch": 0.7897801544860369, + "grad_norm": 0.3468076950255286, + "learning_rate": 1.4415127691737014e-05, + "loss": 0.8877, + "num_tokens": 27778340613.0, + "step": 6646 + }, + { + "epoch": 0.7898989898989899, + "grad_norm": 0.4235772329956788, + "learning_rate": 1.4413525199118757e-05, + "loss": 0.8321, + "num_tokens": 27782531275.0, + "step": 6647 + }, + { + "epoch": 0.790017825311943, + "grad_norm": 0.416560645388366, + "learning_rate": 1.441192258008612e-05, + "loss": 0.8617, + "num_tokens": 27786719473.0, + "step": 6648 + }, + { + "epoch": 0.790136660724896, + "grad_norm": 0.4772201044123695, + "learning_rate": 1.4410319834698452e-05, + "loss": 0.8206, + "num_tokens": 27790909817.0, + "step": 6649 + }, + { + "epoch": 0.7902554961378491, + "grad_norm": 0.4125972415321691, + "learning_rate": 1.4408716963015114e-05, + "loss": 0.8313, + "num_tokens": 27795088751.0, + "step": 6650 + }, + { + "epoch": 0.7903743315508022, + "grad_norm": 0.3849093203134165, + "learning_rate": 1.4407113965095461e-05, + "loss": 0.8243, + "num_tokens": 27799279952.0, + "step": 6651 + }, + { + "epoch": 0.7904931669637552, + "grad_norm": 0.40651406269691204, + "learning_rate": 1.4405510840998856e-05, + "loss": 0.8118, + "num_tokens": 27803457454.0, + "step": 6652 + }, + { + "epoch": 0.7906120023767083, + "grad_norm": 0.4189839370956885, + "learning_rate": 1.4403907590784671e-05, + "loss": 0.8261, + "num_tokens": 27807645114.0, + "step": 6653 + }, + { + "epoch": 0.7907308377896614, + "grad_norm": 0.3751932148291585, + "learning_rate": 1.440230421451228e-05, + "loss": 0.885, + "num_tokens": 27811834040.0, + "step": 6654 + }, + { + "epoch": 0.7908496732026143, + "grad_norm": 0.46153398356331216, + "learning_rate": 1.440070071224106e-05, + "loss": 0.8301, + "num_tokens": 27816023540.0, + "step": 6655 + }, + { + "epoch": 0.7909685086155674, + "grad_norm": 0.4285907338569781, + "learning_rate": 1.4399097084030394e-05, + "loss": 0.8303, + "num_tokens": 27820210429.0, + "step": 6656 + }, + { + "epoch": 0.7910873440285205, + "grad_norm": 0.40385041270677885, + "learning_rate": 1.4397493329939672e-05, + "loss": 0.8273, + "num_tokens": 27824361744.0, + "step": 6657 + }, + { + "epoch": 0.7912061794414735, + "grad_norm": 0.33952478746045645, + "learning_rate": 1.4395889450028282e-05, + "loss": 0.8001, + "num_tokens": 27828546596.0, + "step": 6658 + }, + { + "epoch": 0.7913250148544266, + "grad_norm": 0.3918135985077931, + "learning_rate": 1.4394285444355626e-05, + "loss": 0.8266, + "num_tokens": 27832732497.0, + "step": 6659 + }, + { + "epoch": 0.7914438502673797, + "grad_norm": 0.4086764199075708, + "learning_rate": 1.4392681312981103e-05, + "loss": 0.7995, + "num_tokens": 27836914083.0, + "step": 6660 + }, + { + "epoch": 0.7915626856803327, + "grad_norm": 0.3758133420879849, + "learning_rate": 1.439107705596412e-05, + "loss": 0.8731, + "num_tokens": 27841101461.0, + "step": 6661 + }, + { + "epoch": 0.7916815210932858, + "grad_norm": 0.4748029544686269, + "learning_rate": 1.4389472673364088e-05, + "loss": 0.852, + "num_tokens": 27845290719.0, + "step": 6662 + }, + { + "epoch": 0.7918003565062388, + "grad_norm": 0.37799292502884524, + "learning_rate": 1.4387868165240426e-05, + "loss": 0.8148, + "num_tokens": 27849469058.0, + "step": 6663 + }, + { + "epoch": 0.7919191919191919, + "grad_norm": 0.39306948805752445, + "learning_rate": 1.4386263531652544e-05, + "loss": 0.8534, + "num_tokens": 27853627605.0, + "step": 6664 + }, + { + "epoch": 0.792038027332145, + "grad_norm": 0.405421153511371, + "learning_rate": 1.4384658772659878e-05, + "loss": 0.8038, + "num_tokens": 27857814717.0, + "step": 6665 + }, + { + "epoch": 0.792156862745098, + "grad_norm": 0.3842657154149822, + "learning_rate": 1.4383053888321856e-05, + "loss": 0.8343, + "num_tokens": 27861976882.0, + "step": 6666 + }, + { + "epoch": 0.7922756981580511, + "grad_norm": 0.4381450479211204, + "learning_rate": 1.4381448878697907e-05, + "loss": 0.8754, + "num_tokens": 27866150908.0, + "step": 6667 + }, + { + "epoch": 0.7923945335710042, + "grad_norm": 0.3669951769609041, + "learning_rate": 1.4379843743847477e-05, + "loss": 0.8378, + "num_tokens": 27870340901.0, + "step": 6668 + }, + { + "epoch": 0.7925133689839572, + "grad_norm": 0.44704168192624155, + "learning_rate": 1.4378238483830003e-05, + "loss": 0.8284, + "num_tokens": 27874508463.0, + "step": 6669 + }, + { + "epoch": 0.7926322043969103, + "grad_norm": 0.32446683835399437, + "learning_rate": 1.4376633098704938e-05, + "loss": 0.8461, + "num_tokens": 27878697843.0, + "step": 6670 + }, + { + "epoch": 0.7927510398098634, + "grad_norm": 0.4203400004593807, + "learning_rate": 1.4375027588531728e-05, + "loss": 0.833, + "num_tokens": 27882887519.0, + "step": 6671 + }, + { + "epoch": 0.7928698752228164, + "grad_norm": 0.3910625356051499, + "learning_rate": 1.4373421953369837e-05, + "loss": 0.8429, + "num_tokens": 27887078372.0, + "step": 6672 + }, + { + "epoch": 0.7929887106357695, + "grad_norm": 0.4244335063747357, + "learning_rate": 1.4371816193278726e-05, + "loss": 0.807, + "num_tokens": 27891269080.0, + "step": 6673 + }, + { + "epoch": 0.7931075460487225, + "grad_norm": 0.3999809759855945, + "learning_rate": 1.437021030831786e-05, + "loss": 0.8558, + "num_tokens": 27895414098.0, + "step": 6674 + }, + { + "epoch": 0.7932263814616756, + "grad_norm": 0.3932289560243289, + "learning_rate": 1.4368604298546714e-05, + "loss": 0.8507, + "num_tokens": 27899602723.0, + "step": 6675 + }, + { + "epoch": 0.7933452168746287, + "grad_norm": 0.37330555105669344, + "learning_rate": 1.4366998164024762e-05, + "loss": 0.8634, + "num_tokens": 27903774372.0, + "step": 6676 + }, + { + "epoch": 0.7934640522875817, + "grad_norm": 0.5549707033418191, + "learning_rate": 1.4365391904811479e-05, + "loss": 0.8687, + "num_tokens": 27907964329.0, + "step": 6677 + }, + { + "epoch": 0.7935828877005348, + "grad_norm": 0.37204478031191435, + "learning_rate": 1.4363785520966357e-05, + "loss": 0.8699, + "num_tokens": 27912152370.0, + "step": 6678 + }, + { + "epoch": 0.7937017231134879, + "grad_norm": 0.5145725060221311, + "learning_rate": 1.4362179012548888e-05, + "loss": 0.8337, + "num_tokens": 27916342694.0, + "step": 6679 + }, + { + "epoch": 0.7938205585264408, + "grad_norm": 0.39417821918786144, + "learning_rate": 1.4360572379618555e-05, + "loss": 0.8443, + "num_tokens": 27920531739.0, + "step": 6680 + }, + { + "epoch": 0.793939393939394, + "grad_norm": 0.5007882054805348, + "learning_rate": 1.435896562223487e-05, + "loss": 0.8231, + "num_tokens": 27924702341.0, + "step": 6681 + }, + { + "epoch": 0.794058229352347, + "grad_norm": 0.41935246731755776, + "learning_rate": 1.4357358740457332e-05, + "loss": 0.864, + "num_tokens": 27928877796.0, + "step": 6682 + }, + { + "epoch": 0.7941770647653, + "grad_norm": 0.41441966558942783, + "learning_rate": 1.4355751734345442e-05, + "loss": 0.8833, + "num_tokens": 27933067452.0, + "step": 6683 + }, + { + "epoch": 0.7942959001782531, + "grad_norm": 0.42853855422454695, + "learning_rate": 1.4354144603958724e-05, + "loss": 0.8697, + "num_tokens": 27937257044.0, + "step": 6684 + }, + { + "epoch": 0.7944147355912062, + "grad_norm": 0.45283342879990207, + "learning_rate": 1.435253734935669e-05, + "loss": 0.8606, + "num_tokens": 27941428288.0, + "step": 6685 + }, + { + "epoch": 0.7945335710041592, + "grad_norm": 0.4434503835902792, + "learning_rate": 1.4350929970598863e-05, + "loss": 0.8418, + "num_tokens": 27945590390.0, + "step": 6686 + }, + { + "epoch": 0.7946524064171123, + "grad_norm": 0.3765823348896552, + "learning_rate": 1.4349322467744768e-05, + "loss": 0.8353, + "num_tokens": 27949779195.0, + "step": 6687 + }, + { + "epoch": 0.7947712418300653, + "grad_norm": 0.39948663514155525, + "learning_rate": 1.4347714840853938e-05, + "loss": 0.8537, + "num_tokens": 27953967472.0, + "step": 6688 + }, + { + "epoch": 0.7948900772430184, + "grad_norm": 0.5268301878504303, + "learning_rate": 1.4346107089985913e-05, + "loss": 0.8344, + "num_tokens": 27958158272.0, + "step": 6689 + }, + { + "epoch": 0.7950089126559715, + "grad_norm": 0.40784323797977773, + "learning_rate": 1.4344499215200223e-05, + "loss": 0.8073, + "num_tokens": 27962293587.0, + "step": 6690 + }, + { + "epoch": 0.7951277480689245, + "grad_norm": 0.420539181502663, + "learning_rate": 1.4342891216556421e-05, + "loss": 0.8708, + "num_tokens": 27966481384.0, + "step": 6691 + }, + { + "epoch": 0.7952465834818776, + "grad_norm": 0.3923829501327915, + "learning_rate": 1.4341283094114057e-05, + "loss": 0.8571, + "num_tokens": 27970638684.0, + "step": 6692 + }, + { + "epoch": 0.7953654188948307, + "grad_norm": 0.4330166506961981, + "learning_rate": 1.4339674847932679e-05, + "loss": 0.8383, + "num_tokens": 27974827699.0, + "step": 6693 + }, + { + "epoch": 0.7954842543077837, + "grad_norm": 0.4614146354006407, + "learning_rate": 1.4338066478071853e-05, + "loss": 0.8576, + "num_tokens": 27978989847.0, + "step": 6694 + }, + { + "epoch": 0.7956030897207368, + "grad_norm": 0.3521045886370094, + "learning_rate": 1.4336457984591137e-05, + "loss": 0.847, + "num_tokens": 27983173416.0, + "step": 6695 + }, + { + "epoch": 0.7957219251336899, + "grad_norm": 0.43258595366410696, + "learning_rate": 1.43348493675501e-05, + "loss": 0.8443, + "num_tokens": 27987362357.0, + "step": 6696 + }, + { + "epoch": 0.7958407605466429, + "grad_norm": 0.4494318304325294, + "learning_rate": 1.4333240627008317e-05, + "loss": 0.8255, + "num_tokens": 27991523701.0, + "step": 6697 + }, + { + "epoch": 0.795959595959596, + "grad_norm": 0.4760869264404366, + "learning_rate": 1.4331631763025365e-05, + "loss": 0.8332, + "num_tokens": 27995680309.0, + "step": 6698 + }, + { + "epoch": 0.796078431372549, + "grad_norm": 0.3883774291474211, + "learning_rate": 1.4330022775660823e-05, + "loss": 0.8271, + "num_tokens": 27999868470.0, + "step": 6699 + }, + { + "epoch": 0.7961972667855021, + "grad_norm": 0.423849456058467, + "learning_rate": 1.4328413664974276e-05, + "loss": 0.8254, + "num_tokens": 28004030791.0, + "step": 6700 + }, + { + "epoch": 0.7963161021984552, + "grad_norm": 0.408947156708779, + "learning_rate": 1.4326804431025321e-05, + "loss": 0.8423, + "num_tokens": 28008219306.0, + "step": 6701 + }, + { + "epoch": 0.7964349376114082, + "grad_norm": 0.5294306835099075, + "learning_rate": 1.4325195073873548e-05, + "loss": 0.8098, + "num_tokens": 28012378897.0, + "step": 6702 + }, + { + "epoch": 0.7965537730243613, + "grad_norm": 0.37497917672581316, + "learning_rate": 1.432358559357856e-05, + "loss": 0.8202, + "num_tokens": 28016570043.0, + "step": 6703 + }, + { + "epoch": 0.7966726084373144, + "grad_norm": 0.3605905347084809, + "learning_rate": 1.432197599019996e-05, + "loss": 0.8527, + "num_tokens": 28020758363.0, + "step": 6704 + }, + { + "epoch": 0.7967914438502673, + "grad_norm": 0.46358940662703507, + "learning_rate": 1.4320366263797356e-05, + "loss": 0.826, + "num_tokens": 28024947676.0, + "step": 6705 + }, + { + "epoch": 0.7969102792632204, + "grad_norm": 0.47167731900611237, + "learning_rate": 1.4318756414430364e-05, + "loss": 0.8583, + "num_tokens": 28029136499.0, + "step": 6706 + }, + { + "epoch": 0.7970291146761735, + "grad_norm": 0.38406518250353583, + "learning_rate": 1.4317146442158601e-05, + "loss": 0.854, + "num_tokens": 28033310739.0, + "step": 6707 + }, + { + "epoch": 0.7971479500891265, + "grad_norm": 0.45162887132576607, + "learning_rate": 1.431553634704169e-05, + "loss": 0.8521, + "num_tokens": 28037498659.0, + "step": 6708 + }, + { + "epoch": 0.7972667855020796, + "grad_norm": 0.41072786841650183, + "learning_rate": 1.4313926129139254e-05, + "loss": 0.8215, + "num_tokens": 28041682151.0, + "step": 6709 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 0.5049550192449764, + "learning_rate": 1.4312315788510934e-05, + "loss": 0.836, + "num_tokens": 28045829369.0, + "step": 6710 + }, + { + "epoch": 0.7975044563279857, + "grad_norm": 0.3978520192056312, + "learning_rate": 1.4310705325216359e-05, + "loss": 0.8419, + "num_tokens": 28050007085.0, + "step": 6711 + }, + { + "epoch": 0.7976232917409388, + "grad_norm": 0.415128528428894, + "learning_rate": 1.430909473931517e-05, + "loss": 0.857, + "num_tokens": 28054191008.0, + "step": 6712 + }, + { + "epoch": 0.7977421271538918, + "grad_norm": 0.5087326277771389, + "learning_rate": 1.4307484030867018e-05, + "loss": 0.8584, + "num_tokens": 28058321958.0, + "step": 6713 + }, + { + "epoch": 0.7978609625668449, + "grad_norm": 0.3789558834700834, + "learning_rate": 1.4305873199931549e-05, + "loss": 0.8326, + "num_tokens": 28062499134.0, + "step": 6714 + }, + { + "epoch": 0.797979797979798, + "grad_norm": 0.43642440099381585, + "learning_rate": 1.4304262246568417e-05, + "loss": 0.8497, + "num_tokens": 28066688341.0, + "step": 6715 + }, + { + "epoch": 0.798098633392751, + "grad_norm": 0.4538445748457464, + "learning_rate": 1.4302651170837283e-05, + "loss": 0.823, + "num_tokens": 28070878908.0, + "step": 6716 + }, + { + "epoch": 0.7982174688057041, + "grad_norm": 0.33572568478609877, + "learning_rate": 1.4301039972797805e-05, + "loss": 0.8363, + "num_tokens": 28075056008.0, + "step": 6717 + }, + { + "epoch": 0.7983363042186572, + "grad_norm": 0.5181395728583689, + "learning_rate": 1.429942865250966e-05, + "loss": 0.8399, + "num_tokens": 28079227584.0, + "step": 6718 + }, + { + "epoch": 0.7984551396316102, + "grad_norm": 0.39566412668241674, + "learning_rate": 1.4297817210032516e-05, + "loss": 0.8509, + "num_tokens": 28083416991.0, + "step": 6719 + }, + { + "epoch": 0.7985739750445633, + "grad_norm": 0.3623665161118942, + "learning_rate": 1.429620564542605e-05, + "loss": 0.8834, + "num_tokens": 28087602632.0, + "step": 6720 + }, + { + "epoch": 0.7986928104575164, + "grad_norm": 0.43317949467128986, + "learning_rate": 1.4294593958749945e-05, + "loss": 0.9098, + "num_tokens": 28091786736.0, + "step": 6721 + }, + { + "epoch": 0.7988116458704694, + "grad_norm": 0.41672358840988644, + "learning_rate": 1.4292982150063885e-05, + "loss": 0.8182, + "num_tokens": 28095975190.0, + "step": 6722 + }, + { + "epoch": 0.7989304812834225, + "grad_norm": 0.41416686702737493, + "learning_rate": 1.4291370219427564e-05, + "loss": 0.8343, + "num_tokens": 28100164547.0, + "step": 6723 + }, + { + "epoch": 0.7990493166963755, + "grad_norm": 0.43816290718216716, + "learning_rate": 1.4289758166900676e-05, + "loss": 0.8354, + "num_tokens": 28104319869.0, + "step": 6724 + }, + { + "epoch": 0.7991681521093286, + "grad_norm": 0.36654746089951823, + "learning_rate": 1.4288145992542917e-05, + "loss": 0.87, + "num_tokens": 28108509408.0, + "step": 6725 + }, + { + "epoch": 0.7992869875222817, + "grad_norm": 0.4298474353376879, + "learning_rate": 1.4286533696413996e-05, + "loss": 0.8124, + "num_tokens": 28112674337.0, + "step": 6726 + }, + { + "epoch": 0.7994058229352347, + "grad_norm": 0.4542826787433914, + "learning_rate": 1.4284921278573622e-05, + "loss": 0.8462, + "num_tokens": 28116862537.0, + "step": 6727 + }, + { + "epoch": 0.7995246583481878, + "grad_norm": 0.32310152455958735, + "learning_rate": 1.4283308739081505e-05, + "loss": 0.8047, + "num_tokens": 28121051672.0, + "step": 6728 + }, + { + "epoch": 0.7996434937611409, + "grad_norm": 0.4863245095247933, + "learning_rate": 1.4281696077997362e-05, + "loss": 0.8362, + "num_tokens": 28125224393.0, + "step": 6729 + }, + { + "epoch": 0.7997623291740938, + "grad_norm": 0.3899822561009517, + "learning_rate": 1.428008329538092e-05, + "loss": 0.8415, + "num_tokens": 28129414803.0, + "step": 6730 + }, + { + "epoch": 0.7998811645870469, + "grad_norm": 0.4811737612359013, + "learning_rate": 1.4278470391291903e-05, + "loss": 0.8481, + "num_tokens": 28133597519.0, + "step": 6731 + }, + { + "epoch": 0.8, + "grad_norm": 0.39502583556137505, + "learning_rate": 1.4276857365790043e-05, + "loss": 0.8341, + "num_tokens": 28137785989.0, + "step": 6732 + }, + { + "epoch": 0.800118835412953, + "grad_norm": 0.3845422001067121, + "learning_rate": 1.4275244218935073e-05, + "loss": 0.8473, + "num_tokens": 28141975164.0, + "step": 6733 + }, + { + "epoch": 0.8002376708259061, + "grad_norm": 0.45149397186434714, + "learning_rate": 1.4273630950786738e-05, + "loss": 0.8389, + "num_tokens": 28146165869.0, + "step": 6734 + }, + { + "epoch": 0.8003565062388592, + "grad_norm": 0.37346303272500675, + "learning_rate": 1.427201756140478e-05, + "loss": 0.884, + "num_tokens": 28150355769.0, + "step": 6735 + }, + { + "epoch": 0.8004753416518122, + "grad_norm": 0.5043314554176336, + "learning_rate": 1.4270404050848952e-05, + "loss": 0.8494, + "num_tokens": 28154544608.0, + "step": 6736 + }, + { + "epoch": 0.8005941770647653, + "grad_norm": 0.3707343748506062, + "learning_rate": 1.4268790419178996e-05, + "loss": 0.8527, + "num_tokens": 28158734880.0, + "step": 6737 + }, + { + "epoch": 0.8007130124777183, + "grad_norm": 0.439823858607196, + "learning_rate": 1.4267176666454687e-05, + "loss": 0.8756, + "num_tokens": 28162901126.0, + "step": 6738 + }, + { + "epoch": 0.8008318478906714, + "grad_norm": 0.38832247227173, + "learning_rate": 1.426556279273578e-05, + "loss": 0.8294, + "num_tokens": 28167082232.0, + "step": 6739 + }, + { + "epoch": 0.8009506833036245, + "grad_norm": 0.4315787060651351, + "learning_rate": 1.4263948798082034e-05, + "loss": 0.8237, + "num_tokens": 28171271328.0, + "step": 6740 + }, + { + "epoch": 0.8010695187165775, + "grad_norm": 0.4884768575733331, + "learning_rate": 1.4262334682553234e-05, + "loss": 0.8353, + "num_tokens": 28175444087.0, + "step": 6741 + }, + { + "epoch": 0.8011883541295306, + "grad_norm": 0.34003394703901263, + "learning_rate": 1.4260720446209149e-05, + "loss": 0.8535, + "num_tokens": 28179632851.0, + "step": 6742 + }, + { + "epoch": 0.8013071895424837, + "grad_norm": 0.4678689286288869, + "learning_rate": 1.4259106089109562e-05, + "loss": 0.8925, + "num_tokens": 28183821636.0, + "step": 6743 + }, + { + "epoch": 0.8014260249554367, + "grad_norm": 0.4419987161932701, + "learning_rate": 1.4257491611314253e-05, + "loss": 0.8343, + "num_tokens": 28188010167.0, + "step": 6744 + }, + { + "epoch": 0.8015448603683898, + "grad_norm": 0.41695915354668195, + "learning_rate": 1.4255877012883021e-05, + "loss": 0.8503, + "num_tokens": 28192199213.0, + "step": 6745 + }, + { + "epoch": 0.8016636957813429, + "grad_norm": 0.42591333463849285, + "learning_rate": 1.4254262293875653e-05, + "loss": 0.8381, + "num_tokens": 28196388127.0, + "step": 6746 + }, + { + "epoch": 0.8017825311942959, + "grad_norm": 0.34855766985580194, + "learning_rate": 1.4252647454351949e-05, + "loss": 0.8192, + "num_tokens": 28200577556.0, + "step": 6747 + }, + { + "epoch": 0.801901366607249, + "grad_norm": 0.47242448617370497, + "learning_rate": 1.4251032494371712e-05, + "loss": 0.8534, + "num_tokens": 28204739909.0, + "step": 6748 + }, + { + "epoch": 0.802020202020202, + "grad_norm": 0.4306624285672957, + "learning_rate": 1.4249417413994752e-05, + "loss": 0.8201, + "num_tokens": 28208928222.0, + "step": 6749 + }, + { + "epoch": 0.8021390374331551, + "grad_norm": 0.4267240316699727, + "learning_rate": 1.4247802213280875e-05, + "loss": 0.8517, + "num_tokens": 28213093468.0, + "step": 6750 + }, + { + "epoch": 0.8022578728461082, + "grad_norm": 0.4042274366521463, + "learning_rate": 1.4246186892289903e-05, + "loss": 0.8938, + "num_tokens": 28217280844.0, + "step": 6751 + }, + { + "epoch": 0.8023767082590612, + "grad_norm": 0.4703086677559075, + "learning_rate": 1.4244571451081656e-05, + "loss": 0.8526, + "num_tokens": 28221469516.0, + "step": 6752 + }, + { + "epoch": 0.8024955436720143, + "grad_norm": 0.33755290488900097, + "learning_rate": 1.4242955889715957e-05, + "loss": 0.8428, + "num_tokens": 28225646461.0, + "step": 6753 + }, + { + "epoch": 0.8026143790849674, + "grad_norm": 0.44187852100404157, + "learning_rate": 1.4241340208252637e-05, + "loss": 0.8517, + "num_tokens": 28229822119.0, + "step": 6754 + }, + { + "epoch": 0.8027332144979203, + "grad_norm": 0.36168132241988016, + "learning_rate": 1.423972440675153e-05, + "loss": 0.8698, + "num_tokens": 28234011988.0, + "step": 6755 + }, + { + "epoch": 0.8028520499108734, + "grad_norm": 0.4171546475644546, + "learning_rate": 1.423810848527247e-05, + "loss": 0.8977, + "num_tokens": 28238199439.0, + "step": 6756 + }, + { + "epoch": 0.8029708853238265, + "grad_norm": 0.3798933986659252, + "learning_rate": 1.4236492443875312e-05, + "loss": 0.8813, + "num_tokens": 28242389953.0, + "step": 6757 + }, + { + "epoch": 0.8030897207367795, + "grad_norm": 0.3805769765597466, + "learning_rate": 1.4234876282619891e-05, + "loss": 0.8548, + "num_tokens": 28246578830.0, + "step": 6758 + }, + { + "epoch": 0.8032085561497326, + "grad_norm": 0.4114216942789099, + "learning_rate": 1.4233260001566064e-05, + "loss": 0.8189, + "num_tokens": 28250743776.0, + "step": 6759 + }, + { + "epoch": 0.8033273915626857, + "grad_norm": 0.39049553848336244, + "learning_rate": 1.4231643600773688e-05, + "loss": 0.8165, + "num_tokens": 28254932229.0, + "step": 6760 + }, + { + "epoch": 0.8034462269756387, + "grad_norm": 0.46555522532858423, + "learning_rate": 1.4230027080302621e-05, + "loss": 0.8709, + "num_tokens": 28259120369.0, + "step": 6761 + }, + { + "epoch": 0.8035650623885918, + "grad_norm": 0.3664976877053434, + "learning_rate": 1.4228410440212738e-05, + "loss": 0.8445, + "num_tokens": 28263272762.0, + "step": 6762 + }, + { + "epoch": 0.8036838978015448, + "grad_norm": 0.3851859845436011, + "learning_rate": 1.4226793680563893e-05, + "loss": 0.8221, + "num_tokens": 28267453387.0, + "step": 6763 + }, + { + "epoch": 0.8038027332144979, + "grad_norm": 0.3930762789623205, + "learning_rate": 1.422517680141597e-05, + "loss": 0.8084, + "num_tokens": 28271643180.0, + "step": 6764 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 0.3692264227685438, + "learning_rate": 1.4223559802828846e-05, + "loss": 0.8667, + "num_tokens": 28275832731.0, + "step": 6765 + }, + { + "epoch": 0.804040404040404, + "grad_norm": 0.41851204527068026, + "learning_rate": 1.4221942684862403e-05, + "loss": 0.8514, + "num_tokens": 28280007204.0, + "step": 6766 + }, + { + "epoch": 0.8041592394533571, + "grad_norm": 0.4192245074497503, + "learning_rate": 1.4220325447576531e-05, + "loss": 0.8463, + "num_tokens": 28284195501.0, + "step": 6767 + }, + { + "epoch": 0.8042780748663102, + "grad_norm": 0.37994012013477335, + "learning_rate": 1.4218708091031117e-05, + "loss": 0.8544, + "num_tokens": 28288361827.0, + "step": 6768 + }, + { + "epoch": 0.8043969102792632, + "grad_norm": 0.4500850270017901, + "learning_rate": 1.4217090615286058e-05, + "loss": 0.8076, + "num_tokens": 28292550315.0, + "step": 6769 + }, + { + "epoch": 0.8045157456922163, + "grad_norm": 0.4024362672686158, + "learning_rate": 1.421547302040126e-05, + "loss": 0.8825, + "num_tokens": 28296717562.0, + "step": 6770 + }, + { + "epoch": 0.8046345811051694, + "grad_norm": 0.4871880312296316, + "learning_rate": 1.4213855306436623e-05, + "loss": 0.8552, + "num_tokens": 28300897289.0, + "step": 6771 + }, + { + "epoch": 0.8047534165181224, + "grad_norm": 0.36771260575728226, + "learning_rate": 1.4212237473452058e-05, + "loss": 0.8195, + "num_tokens": 28305065611.0, + "step": 6772 + }, + { + "epoch": 0.8048722519310755, + "grad_norm": 0.43430471964628836, + "learning_rate": 1.4210619521507479e-05, + "loss": 0.8308, + "num_tokens": 28309235713.0, + "step": 6773 + }, + { + "epoch": 0.8049910873440285, + "grad_norm": 0.44150234731352767, + "learning_rate": 1.42090014506628e-05, + "loss": 0.8498, + "num_tokens": 28313423580.0, + "step": 6774 + }, + { + "epoch": 0.8051099227569816, + "grad_norm": 0.3475090632416837, + "learning_rate": 1.420738326097795e-05, + "loss": 0.8499, + "num_tokens": 28317609006.0, + "step": 6775 + }, + { + "epoch": 0.8052287581699347, + "grad_norm": 0.41218451020485575, + "learning_rate": 1.4205764952512853e-05, + "loss": 0.8268, + "num_tokens": 28321774094.0, + "step": 6776 + }, + { + "epoch": 0.8053475935828877, + "grad_norm": 0.36525041740047715, + "learning_rate": 1.420414652532744e-05, + "loss": 0.8513, + "num_tokens": 28325962766.0, + "step": 6777 + }, + { + "epoch": 0.8054664289958408, + "grad_norm": 0.42488967637837805, + "learning_rate": 1.4202527979481653e-05, + "loss": 0.8437, + "num_tokens": 28330125857.0, + "step": 6778 + }, + { + "epoch": 0.8055852644087939, + "grad_norm": 0.35512077848476203, + "learning_rate": 1.420090931503542e-05, + "loss": 0.8447, + "num_tokens": 28334300368.0, + "step": 6779 + }, + { + "epoch": 0.8057040998217468, + "grad_norm": 0.4488615541506806, + "learning_rate": 1.4199290532048698e-05, + "loss": 0.8234, + "num_tokens": 28338491396.0, + "step": 6780 + }, + { + "epoch": 0.8058229352346999, + "grad_norm": 0.44103426831285697, + "learning_rate": 1.4197671630581428e-05, + "loss": 0.8505, + "num_tokens": 28342680479.0, + "step": 6781 + }, + { + "epoch": 0.805941770647653, + "grad_norm": 0.3541437937230835, + "learning_rate": 1.4196052610693562e-05, + "loss": 0.8594, + "num_tokens": 28346842231.0, + "step": 6782 + }, + { + "epoch": 0.806060606060606, + "grad_norm": 0.3882981182003317, + "learning_rate": 1.4194433472445069e-05, + "loss": 0.8197, + "num_tokens": 28351030794.0, + "step": 6783 + }, + { + "epoch": 0.8061794414735591, + "grad_norm": 0.3324912693624186, + "learning_rate": 1.41928142158959e-05, + "loss": 0.8483, + "num_tokens": 28355220476.0, + "step": 6784 + }, + { + "epoch": 0.8062982768865122, + "grad_norm": 0.3418122160370328, + "learning_rate": 1.4191194841106026e-05, + "loss": 0.8431, + "num_tokens": 28359386091.0, + "step": 6785 + }, + { + "epoch": 0.8064171122994652, + "grad_norm": 0.3776965052317366, + "learning_rate": 1.4189575348135419e-05, + "loss": 0.8398, + "num_tokens": 28363568184.0, + "step": 6786 + }, + { + "epoch": 0.8065359477124183, + "grad_norm": 0.40033472615644067, + "learning_rate": 1.4187955737044051e-05, + "loss": 0.7875, + "num_tokens": 28367736987.0, + "step": 6787 + }, + { + "epoch": 0.8066547831253713, + "grad_norm": 0.3499727282058299, + "learning_rate": 1.4186336007891901e-05, + "loss": 0.7979, + "num_tokens": 28371926543.0, + "step": 6788 + }, + { + "epoch": 0.8067736185383244, + "grad_norm": 0.3996251291843178, + "learning_rate": 1.4184716160738958e-05, + "loss": 0.8602, + "num_tokens": 28376115338.0, + "step": 6789 + }, + { + "epoch": 0.8068924539512775, + "grad_norm": 0.42027119056566625, + "learning_rate": 1.4183096195645208e-05, + "loss": 0.8946, + "num_tokens": 28380304547.0, + "step": 6790 + }, + { + "epoch": 0.8070112893642305, + "grad_norm": 0.41929095476127304, + "learning_rate": 1.4181476112670643e-05, + "loss": 0.8406, + "num_tokens": 28384495031.0, + "step": 6791 + }, + { + "epoch": 0.8071301247771836, + "grad_norm": 0.4002841946770164, + "learning_rate": 1.417985591187526e-05, + "loss": 0.8719, + "num_tokens": 28388676039.0, + "step": 6792 + }, + { + "epoch": 0.8072489601901367, + "grad_norm": 0.39783759528070173, + "learning_rate": 1.4178235593319063e-05, + "loss": 0.8416, + "num_tokens": 28392832941.0, + "step": 6793 + }, + { + "epoch": 0.8073677956030897, + "grad_norm": 0.35676903619723527, + "learning_rate": 1.4176615157062055e-05, + "loss": 0.8407, + "num_tokens": 28397023284.0, + "step": 6794 + }, + { + "epoch": 0.8074866310160428, + "grad_norm": 0.38397569330708176, + "learning_rate": 1.4174994603164248e-05, + "loss": 0.8069, + "num_tokens": 28401213883.0, + "step": 6795 + }, + { + "epoch": 0.8076054664289959, + "grad_norm": 0.40763140750706395, + "learning_rate": 1.4173373931685657e-05, + "loss": 0.8141, + "num_tokens": 28405401436.0, + "step": 6796 + }, + { + "epoch": 0.8077243018419489, + "grad_norm": 0.37004019943579525, + "learning_rate": 1.4171753142686295e-05, + "loss": 0.8749, + "num_tokens": 28409590942.0, + "step": 6797 + }, + { + "epoch": 0.807843137254902, + "grad_norm": 0.39692247330309666, + "learning_rate": 1.4170132236226196e-05, + "loss": 0.8366, + "num_tokens": 28413779768.0, + "step": 6798 + }, + { + "epoch": 0.807961972667855, + "grad_norm": 0.39667525594755804, + "learning_rate": 1.4168511212365378e-05, + "loss": 0.8537, + "num_tokens": 28417968653.0, + "step": 6799 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.4874347597200796, + "learning_rate": 1.4166890071163877e-05, + "loss": 0.8032, + "num_tokens": 28422134902.0, + "step": 6800 + }, + { + "epoch": 0.8081996434937612, + "grad_norm": 0.41294884740186166, + "learning_rate": 1.416526881268173e-05, + "loss": 0.8391, + "num_tokens": 28426324826.0, + "step": 6801 + }, + { + "epoch": 0.8083184789067142, + "grad_norm": 0.3488161293491895, + "learning_rate": 1.4163647436978978e-05, + "loss": 0.8451, + "num_tokens": 28430487893.0, + "step": 6802 + }, + { + "epoch": 0.8084373143196673, + "grad_norm": 0.4446010682730333, + "learning_rate": 1.4162025944115665e-05, + "loss": 0.8209, + "num_tokens": 28434677142.0, + "step": 6803 + }, + { + "epoch": 0.8085561497326204, + "grad_norm": 0.3647661520688312, + "learning_rate": 1.4160404334151838e-05, + "loss": 0.8677, + "num_tokens": 28438865994.0, + "step": 6804 + }, + { + "epoch": 0.8086749851455733, + "grad_norm": 0.35072358501825235, + "learning_rate": 1.4158782607147555e-05, + "loss": 0.8404, + "num_tokens": 28443054811.0, + "step": 6805 + }, + { + "epoch": 0.8087938205585264, + "grad_norm": 0.35302803050458187, + "learning_rate": 1.415716076316287e-05, + "loss": 0.8496, + "num_tokens": 28447242819.0, + "step": 6806 + }, + { + "epoch": 0.8089126559714795, + "grad_norm": 0.412355017421107, + "learning_rate": 1.4155538802257848e-05, + "loss": 0.8204, + "num_tokens": 28451430547.0, + "step": 6807 + }, + { + "epoch": 0.8090314913844325, + "grad_norm": 0.41572413375680817, + "learning_rate": 1.4153916724492554e-05, + "loss": 0.8476, + "num_tokens": 28455611281.0, + "step": 6808 + }, + { + "epoch": 0.8091503267973856, + "grad_norm": 0.47022028438290303, + "learning_rate": 1.4152294529927065e-05, + "loss": 0.8513, + "num_tokens": 28459778155.0, + "step": 6809 + }, + { + "epoch": 0.8092691622103387, + "grad_norm": 0.33978798116119086, + "learning_rate": 1.4150672218621443e-05, + "loss": 0.8135, + "num_tokens": 28463968151.0, + "step": 6810 + }, + { + "epoch": 0.8093879976232917, + "grad_norm": 0.5497447080747108, + "learning_rate": 1.4149049790635783e-05, + "loss": 0.8682, + "num_tokens": 28468126273.0, + "step": 6811 + }, + { + "epoch": 0.8095068330362448, + "grad_norm": 0.3844316822379662, + "learning_rate": 1.414742724603016e-05, + "loss": 0.8617, + "num_tokens": 28472313838.0, + "step": 6812 + }, + { + "epoch": 0.8096256684491978, + "grad_norm": 0.4294298908905605, + "learning_rate": 1.4145804584864666e-05, + "loss": 0.8491, + "num_tokens": 28476502077.0, + "step": 6813 + }, + { + "epoch": 0.8097445038621509, + "grad_norm": 0.40912327124845366, + "learning_rate": 1.414418180719939e-05, + "loss": 0.8444, + "num_tokens": 28480690719.0, + "step": 6814 + }, + { + "epoch": 0.809863339275104, + "grad_norm": 0.509123803232418, + "learning_rate": 1.414255891309443e-05, + "loss": 0.8635, + "num_tokens": 28484875600.0, + "step": 6815 + }, + { + "epoch": 0.809982174688057, + "grad_norm": 0.38022601636444675, + "learning_rate": 1.4140935902609892e-05, + "loss": 0.8301, + "num_tokens": 28489059317.0, + "step": 6816 + }, + { + "epoch": 0.8101010101010101, + "grad_norm": 0.4480477881365717, + "learning_rate": 1.4139312775805874e-05, + "loss": 0.8277, + "num_tokens": 28493220190.0, + "step": 6817 + }, + { + "epoch": 0.8102198455139632, + "grad_norm": 0.3998682699904842, + "learning_rate": 1.4137689532742492e-05, + "loss": 0.8411, + "num_tokens": 28497408236.0, + "step": 6818 + }, + { + "epoch": 0.8103386809269162, + "grad_norm": 0.37564473642263696, + "learning_rate": 1.4136066173479856e-05, + "loss": 0.8473, + "num_tokens": 28501597048.0, + "step": 6819 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 0.39853414860592934, + "learning_rate": 1.4134442698078087e-05, + "loss": 0.8467, + "num_tokens": 28505763626.0, + "step": 6820 + }, + { + "epoch": 0.8105763517528224, + "grad_norm": 0.39878189153164584, + "learning_rate": 1.4132819106597306e-05, + "loss": 0.8583, + "num_tokens": 28509953786.0, + "step": 6821 + }, + { + "epoch": 0.8106951871657754, + "grad_norm": 0.40759536501153004, + "learning_rate": 1.4131195399097643e-05, + "loss": 0.8173, + "num_tokens": 28514128494.0, + "step": 6822 + }, + { + "epoch": 0.8108140225787285, + "grad_norm": 0.4502313606053067, + "learning_rate": 1.4129571575639227e-05, + "loss": 0.8622, + "num_tokens": 28518318235.0, + "step": 6823 + }, + { + "epoch": 0.8109328579916815, + "grad_norm": 0.37109709674043795, + "learning_rate": 1.4127947636282193e-05, + "loss": 0.8559, + "num_tokens": 28522476663.0, + "step": 6824 + }, + { + "epoch": 0.8110516934046346, + "grad_norm": 0.44477302970914256, + "learning_rate": 1.4126323581086686e-05, + "loss": 0.8271, + "num_tokens": 28526665192.0, + "step": 6825 + }, + { + "epoch": 0.8111705288175877, + "grad_norm": 0.34896366465468287, + "learning_rate": 1.412469941011284e-05, + "loss": 0.8499, + "num_tokens": 28530852414.0, + "step": 6826 + }, + { + "epoch": 0.8112893642305407, + "grad_norm": 0.4018515256578128, + "learning_rate": 1.4123075123420817e-05, + "loss": 0.8384, + "num_tokens": 28535042324.0, + "step": 6827 + }, + { + "epoch": 0.8114081996434938, + "grad_norm": 0.3961862814826714, + "learning_rate": 1.4121450721070757e-05, + "loss": 0.8292, + "num_tokens": 28539232009.0, + "step": 6828 + }, + { + "epoch": 0.8115270350564469, + "grad_norm": 0.44729944819725564, + "learning_rate": 1.4119826203122827e-05, + "loss": 0.8388, + "num_tokens": 28543422136.0, + "step": 6829 + }, + { + "epoch": 0.8116458704693998, + "grad_norm": 0.35245396743822227, + "learning_rate": 1.4118201569637183e-05, + "loss": 0.8303, + "num_tokens": 28547581502.0, + "step": 6830 + }, + { + "epoch": 0.8117647058823529, + "grad_norm": 0.4661573766207121, + "learning_rate": 1.4116576820673992e-05, + "loss": 0.8319, + "num_tokens": 28551771900.0, + "step": 6831 + }, + { + "epoch": 0.811883541295306, + "grad_norm": 0.42115189854119206, + "learning_rate": 1.4114951956293423e-05, + "loss": 0.8647, + "num_tokens": 28555960355.0, + "step": 6832 + }, + { + "epoch": 0.812002376708259, + "grad_norm": 0.4166097899210047, + "learning_rate": 1.411332697655565e-05, + "loss": 0.8443, + "num_tokens": 28560151169.0, + "step": 6833 + }, + { + "epoch": 0.8121212121212121, + "grad_norm": 0.4387911064771252, + "learning_rate": 1.4111701881520856e-05, + "loss": 0.8691, + "num_tokens": 28564327288.0, + "step": 6834 + }, + { + "epoch": 0.8122400475341652, + "grad_norm": 0.42455733915737853, + "learning_rate": 1.4110076671249223e-05, + "loss": 0.857, + "num_tokens": 28568510917.0, + "step": 6835 + }, + { + "epoch": 0.8123588829471182, + "grad_norm": 0.39848935087533555, + "learning_rate": 1.4108451345800926e-05, + "loss": 0.8382, + "num_tokens": 28572699412.0, + "step": 6836 + }, + { + "epoch": 0.8124777183600713, + "grad_norm": 0.4275269081992, + "learning_rate": 1.410682590523617e-05, + "loss": 0.8272, + "num_tokens": 28576880475.0, + "step": 6837 + }, + { + "epoch": 0.8125965537730243, + "grad_norm": 0.369370203819596, + "learning_rate": 1.4105200349615151e-05, + "loss": 0.7952, + "num_tokens": 28581060901.0, + "step": 6838 + }, + { + "epoch": 0.8127153891859774, + "grad_norm": 0.4240934689781881, + "learning_rate": 1.4103574678998057e-05, + "loss": 0.8283, + "num_tokens": 28585251338.0, + "step": 6839 + }, + { + "epoch": 0.8128342245989305, + "grad_norm": 0.4294486253634206, + "learning_rate": 1.4101948893445105e-05, + "loss": 0.8689, + "num_tokens": 28589433231.0, + "step": 6840 + }, + { + "epoch": 0.8129530600118835, + "grad_norm": 0.4187518435851546, + "learning_rate": 1.4100322993016495e-05, + "loss": 0.8449, + "num_tokens": 28593596823.0, + "step": 6841 + }, + { + "epoch": 0.8130718954248366, + "grad_norm": 0.4191729329439334, + "learning_rate": 1.4098696977772442e-05, + "loss": 0.8249, + "num_tokens": 28597762591.0, + "step": 6842 + }, + { + "epoch": 0.8131907308377897, + "grad_norm": 0.5255669807247603, + "learning_rate": 1.4097070847773165e-05, + "loss": 0.8256, + "num_tokens": 28601915474.0, + "step": 6843 + }, + { + "epoch": 0.8133095662507427, + "grad_norm": 0.4638173497054869, + "learning_rate": 1.409544460307888e-05, + "loss": 0.8826, + "num_tokens": 28606084189.0, + "step": 6844 + }, + { + "epoch": 0.8134284016636958, + "grad_norm": 0.38356549350340513, + "learning_rate": 1.4093818243749817e-05, + "loss": 0.8041, + "num_tokens": 28610240488.0, + "step": 6845 + }, + { + "epoch": 0.8135472370766489, + "grad_norm": 0.33516488492943586, + "learning_rate": 1.4092191769846203e-05, + "loss": 0.8549, + "num_tokens": 28614416821.0, + "step": 6846 + }, + { + "epoch": 0.8136660724896019, + "grad_norm": 0.4732111253924457, + "learning_rate": 1.4090565181428275e-05, + "loss": 0.8253, + "num_tokens": 28618586131.0, + "step": 6847 + }, + { + "epoch": 0.813784907902555, + "grad_norm": 0.4178965471902331, + "learning_rate": 1.4088938478556267e-05, + "loss": 0.8055, + "num_tokens": 28622770587.0, + "step": 6848 + }, + { + "epoch": 0.813903743315508, + "grad_norm": 0.4168695618661007, + "learning_rate": 1.4087311661290424e-05, + "loss": 0.8489, + "num_tokens": 28626938435.0, + "step": 6849 + }, + { + "epoch": 0.8140225787284611, + "grad_norm": 0.3887882224499014, + "learning_rate": 1.4085684729690992e-05, + "loss": 0.8354, + "num_tokens": 28631100227.0, + "step": 6850 + }, + { + "epoch": 0.8141414141414142, + "grad_norm": 0.45174205625803426, + "learning_rate": 1.4084057683818221e-05, + "loss": 0.8483, + "num_tokens": 28635288443.0, + "step": 6851 + }, + { + "epoch": 0.8142602495543672, + "grad_norm": 0.4241021757061804, + "learning_rate": 1.4082430523732365e-05, + "loss": 0.8551, + "num_tokens": 28639456031.0, + "step": 6852 + }, + { + "epoch": 0.8143790849673203, + "grad_norm": 0.3920810188568485, + "learning_rate": 1.4080803249493686e-05, + "loss": 0.8216, + "num_tokens": 28643645689.0, + "step": 6853 + }, + { + "epoch": 0.8144979203802734, + "grad_norm": 0.44194602247950593, + "learning_rate": 1.4079175861162448e-05, + "loss": 0.8152, + "num_tokens": 28647835659.0, + "step": 6854 + }, + { + "epoch": 0.8146167557932263, + "grad_norm": 0.4010776798563123, + "learning_rate": 1.4077548358798914e-05, + "loss": 0.8438, + "num_tokens": 28652023139.0, + "step": 6855 + }, + { + "epoch": 0.8147355912061794, + "grad_norm": 0.3628791544214775, + "learning_rate": 1.407592074246336e-05, + "loss": 0.8325, + "num_tokens": 28656211876.0, + "step": 6856 + }, + { + "epoch": 0.8148544266191325, + "grad_norm": 0.35638881170615616, + "learning_rate": 1.4074293012216063e-05, + "loss": 0.8428, + "num_tokens": 28660378441.0, + "step": 6857 + }, + { + "epoch": 0.8149732620320855, + "grad_norm": 0.4139901477546292, + "learning_rate": 1.4072665168117296e-05, + "loss": 0.8334, + "num_tokens": 28664567609.0, + "step": 6858 + }, + { + "epoch": 0.8150920974450386, + "grad_norm": 0.42278831028851416, + "learning_rate": 1.4071037210227356e-05, + "loss": 0.8178, + "num_tokens": 28668720041.0, + "step": 6859 + }, + { + "epoch": 0.8152109328579917, + "grad_norm": 0.43296835854301446, + "learning_rate": 1.406940913860652e-05, + "loss": 0.8408, + "num_tokens": 28672907268.0, + "step": 6860 + }, + { + "epoch": 0.8153297682709447, + "grad_norm": 0.32744523751869015, + "learning_rate": 1.4067780953315087e-05, + "loss": 0.834, + "num_tokens": 28677086488.0, + "step": 6861 + }, + { + "epoch": 0.8154486036838978, + "grad_norm": 0.4393156548907536, + "learning_rate": 1.4066152654413352e-05, + "loss": 0.8018, + "num_tokens": 28681268236.0, + "step": 6862 + }, + { + "epoch": 0.8155674390968508, + "grad_norm": 0.3829379078472367, + "learning_rate": 1.4064524241961617e-05, + "loss": 0.8365, + "num_tokens": 28685455839.0, + "step": 6863 + }, + { + "epoch": 0.8156862745098039, + "grad_norm": 0.38334110371491326, + "learning_rate": 1.4062895716020191e-05, + "loss": 0.8008, + "num_tokens": 28689644505.0, + "step": 6864 + }, + { + "epoch": 0.815805109922757, + "grad_norm": 0.4476088276601645, + "learning_rate": 1.4061267076649377e-05, + "loss": 0.8328, + "num_tokens": 28693833833.0, + "step": 6865 + }, + { + "epoch": 0.81592394533571, + "grad_norm": 0.3866660503919733, + "learning_rate": 1.4059638323909494e-05, + "loss": 0.8637, + "num_tokens": 28698022241.0, + "step": 6866 + }, + { + "epoch": 0.8160427807486631, + "grad_norm": 0.44524176130637855, + "learning_rate": 1.4058009457860862e-05, + "loss": 0.8188, + "num_tokens": 28702212734.0, + "step": 6867 + }, + { + "epoch": 0.8161616161616162, + "grad_norm": 0.4018498690767997, + "learning_rate": 1.4056380478563793e-05, + "loss": 0.8092, + "num_tokens": 28706378895.0, + "step": 6868 + }, + { + "epoch": 0.8162804515745692, + "grad_norm": 0.4331424155027154, + "learning_rate": 1.405475138607863e-05, + "loss": 0.8165, + "num_tokens": 28710568026.0, + "step": 6869 + }, + { + "epoch": 0.8163992869875223, + "grad_norm": 0.3899188843485696, + "learning_rate": 1.405312218046569e-05, + "loss": 0.8068, + "num_tokens": 28714757478.0, + "step": 6870 + }, + { + "epoch": 0.8165181224004754, + "grad_norm": 0.3778236381424318, + "learning_rate": 1.4051492861785309e-05, + "loss": 0.8326, + "num_tokens": 28718900551.0, + "step": 6871 + }, + { + "epoch": 0.8166369578134284, + "grad_norm": 0.38265697147287037, + "learning_rate": 1.4049863430097838e-05, + "loss": 0.8405, + "num_tokens": 28723088498.0, + "step": 6872 + }, + { + "epoch": 0.8167557932263815, + "grad_norm": 0.43535433607640706, + "learning_rate": 1.4048233885463609e-05, + "loss": 0.8095, + "num_tokens": 28727259630.0, + "step": 6873 + }, + { + "epoch": 0.8168746286393346, + "grad_norm": 0.36865137830583594, + "learning_rate": 1.4046604227942973e-05, + "loss": 0.8339, + "num_tokens": 28731424268.0, + "step": 6874 + }, + { + "epoch": 0.8169934640522876, + "grad_norm": 0.44211378704579707, + "learning_rate": 1.4044974457596281e-05, + "loss": 0.8718, + "num_tokens": 28735613460.0, + "step": 6875 + }, + { + "epoch": 0.8171122994652407, + "grad_norm": 0.3848757881010154, + "learning_rate": 1.404334457448389e-05, + "loss": 0.84, + "num_tokens": 28739802113.0, + "step": 6876 + }, + { + "epoch": 0.8172311348781937, + "grad_norm": 0.4983371544896532, + "learning_rate": 1.4041714578666158e-05, + "loss": 0.8063, + "num_tokens": 28743962255.0, + "step": 6877 + }, + { + "epoch": 0.8173499702911468, + "grad_norm": 0.3778316755499709, + "learning_rate": 1.4040084470203452e-05, + "loss": 0.8479, + "num_tokens": 28748151383.0, + "step": 6878 + }, + { + "epoch": 0.8174688057040999, + "grad_norm": 0.4268623003433978, + "learning_rate": 1.403845424915614e-05, + "loss": 0.8414, + "num_tokens": 28752339446.0, + "step": 6879 + }, + { + "epoch": 0.8175876411170528, + "grad_norm": 0.3779017768848438, + "learning_rate": 1.4036823915584591e-05, + "loss": 0.8385, + "num_tokens": 28756529151.0, + "step": 6880 + }, + { + "epoch": 0.8177064765300059, + "grad_norm": 0.420506119582412, + "learning_rate": 1.4035193469549189e-05, + "loss": 0.8242, + "num_tokens": 28760711946.0, + "step": 6881 + }, + { + "epoch": 0.817825311942959, + "grad_norm": 0.36529689309428925, + "learning_rate": 1.4033562911110312e-05, + "loss": 0.8463, + "num_tokens": 28764892304.0, + "step": 6882 + }, + { + "epoch": 0.817944147355912, + "grad_norm": 0.4319926276574222, + "learning_rate": 1.4031932240328339e-05, + "loss": 0.8336, + "num_tokens": 28769082700.0, + "step": 6883 + }, + { + "epoch": 0.8180629827688651, + "grad_norm": 0.41336168932466094, + "learning_rate": 1.4030301457263662e-05, + "loss": 0.819, + "num_tokens": 28773269338.0, + "step": 6884 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 0.39245249302618346, + "learning_rate": 1.4028670561976683e-05, + "loss": 0.8094, + "num_tokens": 28777458049.0, + "step": 6885 + }, + { + "epoch": 0.8183006535947712, + "grad_norm": 0.4008675074731809, + "learning_rate": 1.4027039554527788e-05, + "loss": 0.8295, + "num_tokens": 28781603879.0, + "step": 6886 + }, + { + "epoch": 0.8184194890077243, + "grad_norm": 0.3445399316643763, + "learning_rate": 1.4025408434977385e-05, + "loss": 0.8151, + "num_tokens": 28785776400.0, + "step": 6887 + }, + { + "epoch": 0.8185383244206773, + "grad_norm": 0.45794164524061637, + "learning_rate": 1.402377720338588e-05, + "loss": 0.8365, + "num_tokens": 28789917827.0, + "step": 6888 + }, + { + "epoch": 0.8186571598336304, + "grad_norm": 0.3958129488010532, + "learning_rate": 1.402214585981368e-05, + "loss": 0.8305, + "num_tokens": 28794106996.0, + "step": 6889 + }, + { + "epoch": 0.8187759952465835, + "grad_norm": 0.3935135898208482, + "learning_rate": 1.4020514404321202e-05, + "loss": 0.8769, + "num_tokens": 28798296183.0, + "step": 6890 + }, + { + "epoch": 0.8188948306595365, + "grad_norm": 0.4051683404680633, + "learning_rate": 1.4018882836968859e-05, + "loss": 0.8626, + "num_tokens": 28802484785.0, + "step": 6891 + }, + { + "epoch": 0.8190136660724896, + "grad_norm": 0.3971056565628762, + "learning_rate": 1.4017251157817083e-05, + "loss": 0.8313, + "num_tokens": 28806674533.0, + "step": 6892 + }, + { + "epoch": 0.8191325014854427, + "grad_norm": 0.4749523183571842, + "learning_rate": 1.4015619366926293e-05, + "loss": 0.8362, + "num_tokens": 28810844024.0, + "step": 6893 + }, + { + "epoch": 0.8192513368983957, + "grad_norm": 0.3950799692225139, + "learning_rate": 1.4013987464356919e-05, + "loss": 0.8633, + "num_tokens": 28815033521.0, + "step": 6894 + }, + { + "epoch": 0.8193701723113488, + "grad_norm": 0.44715459943103686, + "learning_rate": 1.4012355450169404e-05, + "loss": 0.8404, + "num_tokens": 28819175757.0, + "step": 6895 + }, + { + "epoch": 0.8194890077243019, + "grad_norm": 0.3646307420747811, + "learning_rate": 1.4010723324424174e-05, + "loss": 0.8608, + "num_tokens": 28823365540.0, + "step": 6896 + }, + { + "epoch": 0.8196078431372549, + "grad_norm": 0.47646632426260754, + "learning_rate": 1.4009091087181686e-05, + "loss": 0.8525, + "num_tokens": 28827553942.0, + "step": 6897 + }, + { + "epoch": 0.819726678550208, + "grad_norm": 0.39013225469146257, + "learning_rate": 1.4007458738502381e-05, + "loss": 0.8352, + "num_tokens": 28831659422.0, + "step": 6898 + }, + { + "epoch": 0.8198455139631611, + "grad_norm": 0.4157840489604199, + "learning_rate": 1.4005826278446708e-05, + "loss": 0.8596, + "num_tokens": 28835823514.0, + "step": 6899 + }, + { + "epoch": 0.8199643493761141, + "grad_norm": 0.34765680722293785, + "learning_rate": 1.4004193707075127e-05, + "loss": 0.8286, + "num_tokens": 28840012203.0, + "step": 6900 + }, + { + "epoch": 0.8200831847890672, + "grad_norm": 0.39558271098747866, + "learning_rate": 1.4002561024448094e-05, + "loss": 0.8451, + "num_tokens": 28844201805.0, + "step": 6901 + }, + { + "epoch": 0.8202020202020202, + "grad_norm": 0.3814189960947971, + "learning_rate": 1.4000928230626074e-05, + "loss": 0.8674, + "num_tokens": 28848377687.0, + "step": 6902 + }, + { + "epoch": 0.8203208556149733, + "grad_norm": 0.42688884959388246, + "learning_rate": 1.3999295325669534e-05, + "loss": 0.8139, + "num_tokens": 28852566814.0, + "step": 6903 + }, + { + "epoch": 0.8204396910279264, + "grad_norm": 0.3727159299958523, + "learning_rate": 1.3997662309638951e-05, + "loss": 0.8476, + "num_tokens": 28856756217.0, + "step": 6904 + }, + { + "epoch": 0.8205585264408793, + "grad_norm": 0.40683165703156154, + "learning_rate": 1.3996029182594796e-05, + "loss": 0.8585, + "num_tokens": 28860946333.0, + "step": 6905 + }, + { + "epoch": 0.8206773618538324, + "grad_norm": 0.4291545348098712, + "learning_rate": 1.399439594459755e-05, + "loss": 0.8247, + "num_tokens": 28865133431.0, + "step": 6906 + }, + { + "epoch": 0.8207961972667855, + "grad_norm": 0.4159080959378077, + "learning_rate": 1.3992762595707698e-05, + "loss": 0.8447, + "num_tokens": 28869309488.0, + "step": 6907 + }, + { + "epoch": 0.8209150326797385, + "grad_norm": 0.39596591855593244, + "learning_rate": 1.3991129135985729e-05, + "loss": 0.8333, + "num_tokens": 28873472382.0, + "step": 6908 + }, + { + "epoch": 0.8210338680926916, + "grad_norm": 0.43249076002992587, + "learning_rate": 1.3989495565492133e-05, + "loss": 0.7988, + "num_tokens": 28877647220.0, + "step": 6909 + }, + { + "epoch": 0.8211527035056447, + "grad_norm": 0.39153618290204356, + "learning_rate": 1.3987861884287408e-05, + "loss": 0.8046, + "num_tokens": 28881832752.0, + "step": 6910 + }, + { + "epoch": 0.8212715389185977, + "grad_norm": 0.4015443677172657, + "learning_rate": 1.3986228092432062e-05, + "loss": 0.8545, + "num_tokens": 28886021985.0, + "step": 6911 + }, + { + "epoch": 0.8213903743315508, + "grad_norm": 0.4165642541417931, + "learning_rate": 1.3984594189986585e-05, + "loss": 0.7708, + "num_tokens": 28890171181.0, + "step": 6912 + }, + { + "epoch": 0.8215092097445038, + "grad_norm": 0.3640170503521741, + "learning_rate": 1.3982960177011496e-05, + "loss": 0.8653, + "num_tokens": 28894360255.0, + "step": 6913 + }, + { + "epoch": 0.8216280451574569, + "grad_norm": 0.48563588993964885, + "learning_rate": 1.398132605356731e-05, + "loss": 0.8255, + "num_tokens": 28898549603.0, + "step": 6914 + }, + { + "epoch": 0.82174688057041, + "grad_norm": 0.3854376513889453, + "learning_rate": 1.3979691819714534e-05, + "loss": 0.8707, + "num_tokens": 28902738526.0, + "step": 6915 + }, + { + "epoch": 0.821865715983363, + "grad_norm": 0.39597942705641775, + "learning_rate": 1.39780574755137e-05, + "loss": 0.8346, + "num_tokens": 28906927864.0, + "step": 6916 + }, + { + "epoch": 0.8219845513963161, + "grad_norm": 0.4329411610439042, + "learning_rate": 1.3976423021025326e-05, + "loss": 0.8225, + "num_tokens": 28911116582.0, + "step": 6917 + }, + { + "epoch": 0.8221033868092692, + "grad_norm": 0.392248291874048, + "learning_rate": 1.3974788456309945e-05, + "loss": 0.8818, + "num_tokens": 28915284634.0, + "step": 6918 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.3588455891367645, + "learning_rate": 1.397315378142809e-05, + "loss": 0.8303, + "num_tokens": 28919473986.0, + "step": 6919 + }, + { + "epoch": 0.8223410576351753, + "grad_norm": 0.3910753154621545, + "learning_rate": 1.3971518996440299e-05, + "loss": 0.8362, + "num_tokens": 28923603081.0, + "step": 6920 + }, + { + "epoch": 0.8224598930481284, + "grad_norm": 0.4633245511227526, + "learning_rate": 1.3969884101407113e-05, + "loss": 0.8546, + "num_tokens": 28927783311.0, + "step": 6921 + }, + { + "epoch": 0.8225787284610814, + "grad_norm": 0.33909195505356404, + "learning_rate": 1.3968249096389073e-05, + "loss": 0.8623, + "num_tokens": 28931972231.0, + "step": 6922 + }, + { + "epoch": 0.8226975638740345, + "grad_norm": 0.44854303530545464, + "learning_rate": 1.3966613981446735e-05, + "loss": 0.8487, + "num_tokens": 28936150375.0, + "step": 6923 + }, + { + "epoch": 0.8228163992869876, + "grad_norm": 0.3113250357830877, + "learning_rate": 1.3964978756640653e-05, + "loss": 0.7931, + "num_tokens": 28940338699.0, + "step": 6924 + }, + { + "epoch": 0.8229352346999406, + "grad_norm": 0.4452002111691339, + "learning_rate": 1.396334342203138e-05, + "loss": 0.8434, + "num_tokens": 28944484255.0, + "step": 6925 + }, + { + "epoch": 0.8230540701128937, + "grad_norm": 0.4019852701378792, + "learning_rate": 1.3961707977679481e-05, + "loss": 0.8609, + "num_tokens": 28948672018.0, + "step": 6926 + }, + { + "epoch": 0.8231729055258467, + "grad_norm": 0.41638619395204185, + "learning_rate": 1.3960072423645525e-05, + "loss": 0.8676, + "num_tokens": 28952853996.0, + "step": 6927 + }, + { + "epoch": 0.8232917409387998, + "grad_norm": 0.3790487185426605, + "learning_rate": 1.3958436759990075e-05, + "loss": 0.82, + "num_tokens": 28957019535.0, + "step": 6928 + }, + { + "epoch": 0.8234105763517529, + "grad_norm": 0.3839445533474307, + "learning_rate": 1.3956800986773714e-05, + "loss": 0.8422, + "num_tokens": 28961208578.0, + "step": 6929 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.45202366450403614, + "learning_rate": 1.395516510405701e-05, + "loss": 0.8561, + "num_tokens": 28965364140.0, + "step": 6930 + }, + { + "epoch": 0.8236482471776589, + "grad_norm": 0.3729116892812632, + "learning_rate": 1.3953529111900552e-05, + "loss": 0.8615, + "num_tokens": 28969552204.0, + "step": 6931 + }, + { + "epoch": 0.823767082590612, + "grad_norm": 0.4152719459107445, + "learning_rate": 1.3951893010364926e-05, + "loss": 0.8697, + "num_tokens": 28973736054.0, + "step": 6932 + }, + { + "epoch": 0.823885918003565, + "grad_norm": 0.3601654493243848, + "learning_rate": 1.395025679951072e-05, + "loss": 0.839, + "num_tokens": 28977924614.0, + "step": 6933 + }, + { + "epoch": 0.8240047534165181, + "grad_norm": 0.39148620280356555, + "learning_rate": 1.394862047939853e-05, + "loss": 0.7958, + "num_tokens": 28982112905.0, + "step": 6934 + }, + { + "epoch": 0.8241235888294712, + "grad_norm": 0.3854963924071923, + "learning_rate": 1.3946984050088955e-05, + "loss": 0.8385, + "num_tokens": 28986295080.0, + "step": 6935 + }, + { + "epoch": 0.8242424242424242, + "grad_norm": 0.42472973909130524, + "learning_rate": 1.3945347511642597e-05, + "loss": 0.8237, + "num_tokens": 28990484589.0, + "step": 6936 + }, + { + "epoch": 0.8243612596553773, + "grad_norm": 0.46215236982209, + "learning_rate": 1.3943710864120064e-05, + "loss": 0.8687, + "num_tokens": 28994666430.0, + "step": 6937 + }, + { + "epoch": 0.8244800950683303, + "grad_norm": 0.43862027441621415, + "learning_rate": 1.394207410758196e-05, + "loss": 0.8515, + "num_tokens": 28998855357.0, + "step": 6938 + }, + { + "epoch": 0.8245989304812834, + "grad_norm": 0.3791831586726114, + "learning_rate": 1.3940437242088907e-05, + "loss": 0.8194, + "num_tokens": 29003043995.0, + "step": 6939 + }, + { + "epoch": 0.8247177658942365, + "grad_norm": 0.384355603595984, + "learning_rate": 1.3938800267701522e-05, + "loss": 0.8448, + "num_tokens": 29007232131.0, + "step": 6940 + }, + { + "epoch": 0.8248366013071895, + "grad_norm": 0.355203760293505, + "learning_rate": 1.3937163184480425e-05, + "loss": 0.8097, + "num_tokens": 29011421291.0, + "step": 6941 + }, + { + "epoch": 0.8249554367201426, + "grad_norm": 0.354563126881471, + "learning_rate": 1.3935525992486245e-05, + "loss": 0.8518, + "num_tokens": 29015579015.0, + "step": 6942 + }, + { + "epoch": 0.8250742721330957, + "grad_norm": 0.4646851205030231, + "learning_rate": 1.3933888691779614e-05, + "loss": 0.8274, + "num_tokens": 29019745691.0, + "step": 6943 + }, + { + "epoch": 0.8251931075460487, + "grad_norm": 0.33951833102906537, + "learning_rate": 1.3932251282421159e-05, + "loss": 0.8264, + "num_tokens": 29023932479.0, + "step": 6944 + }, + { + "epoch": 0.8253119429590018, + "grad_norm": 0.41736362652118697, + "learning_rate": 1.393061376447153e-05, + "loss": 0.8968, + "num_tokens": 29028121465.0, + "step": 6945 + }, + { + "epoch": 0.8254307783719549, + "grad_norm": 0.4575508760396033, + "learning_rate": 1.3928976137991363e-05, + "loss": 0.8398, + "num_tokens": 29032292591.0, + "step": 6946 + }, + { + "epoch": 0.8255496137849079, + "grad_norm": 0.36736358196819424, + "learning_rate": 1.3927338403041309e-05, + "loss": 0.8392, + "num_tokens": 29036440150.0, + "step": 6947 + }, + { + "epoch": 0.825668449197861, + "grad_norm": 0.37165146818765765, + "learning_rate": 1.3925700559682014e-05, + "loss": 0.8402, + "num_tokens": 29040582056.0, + "step": 6948 + }, + { + "epoch": 0.8257872846108141, + "grad_norm": 0.36096638652153035, + "learning_rate": 1.3924062607974136e-05, + "loss": 0.8148, + "num_tokens": 29044771181.0, + "step": 6949 + }, + { + "epoch": 0.8259061200237671, + "grad_norm": 0.3795164939834461, + "learning_rate": 1.3922424547978332e-05, + "loss": 0.8272, + "num_tokens": 29048921385.0, + "step": 6950 + }, + { + "epoch": 0.8260249554367202, + "grad_norm": 0.3987423197991671, + "learning_rate": 1.3920786379755268e-05, + "loss": 0.8485, + "num_tokens": 29053109718.0, + "step": 6951 + }, + { + "epoch": 0.8261437908496732, + "grad_norm": 0.351104781879777, + "learning_rate": 1.3919148103365607e-05, + "loss": 0.8387, + "num_tokens": 29057299503.0, + "step": 6952 + }, + { + "epoch": 0.8262626262626263, + "grad_norm": 0.41667104239348923, + "learning_rate": 1.3917509718870023e-05, + "loss": 0.8317, + "num_tokens": 29061488376.0, + "step": 6953 + }, + { + "epoch": 0.8263814616755794, + "grad_norm": 0.38984734505848445, + "learning_rate": 1.391587122632919e-05, + "loss": 0.8414, + "num_tokens": 29065675574.0, + "step": 6954 + }, + { + "epoch": 0.8265002970885323, + "grad_norm": 0.4502380572619717, + "learning_rate": 1.3914232625803787e-05, + "loss": 0.8077, + "num_tokens": 29069865600.0, + "step": 6955 + }, + { + "epoch": 0.8266191325014854, + "grad_norm": 0.4050673188537347, + "learning_rate": 1.3912593917354495e-05, + "loss": 0.8353, + "num_tokens": 29074053852.0, + "step": 6956 + }, + { + "epoch": 0.8267379679144385, + "grad_norm": 0.4193398227560291, + "learning_rate": 1.3910955101042003e-05, + "loss": 0.8067, + "num_tokens": 29078229000.0, + "step": 6957 + }, + { + "epoch": 0.8268568033273915, + "grad_norm": 0.32408881231533765, + "learning_rate": 1.3909316176927003e-05, + "loss": 0.8106, + "num_tokens": 29082418381.0, + "step": 6958 + }, + { + "epoch": 0.8269756387403446, + "grad_norm": 0.3553717828882645, + "learning_rate": 1.3907677145070187e-05, + "loss": 0.8681, + "num_tokens": 29086607586.0, + "step": 6959 + }, + { + "epoch": 0.8270944741532977, + "grad_norm": 0.4039651433098424, + "learning_rate": 1.3906038005532256e-05, + "loss": 0.8061, + "num_tokens": 29090797695.0, + "step": 6960 + }, + { + "epoch": 0.8272133095662507, + "grad_norm": 0.380914736600484, + "learning_rate": 1.3904398758373912e-05, + "loss": 0.8291, + "num_tokens": 29094987593.0, + "step": 6961 + }, + { + "epoch": 0.8273321449792038, + "grad_norm": 0.3845709427776007, + "learning_rate": 1.3902759403655861e-05, + "loss": 0.8312, + "num_tokens": 29099177879.0, + "step": 6962 + }, + { + "epoch": 0.8274509803921568, + "grad_norm": 0.44563084085183546, + "learning_rate": 1.3901119941438815e-05, + "loss": 0.8796, + "num_tokens": 29103367182.0, + "step": 6963 + }, + { + "epoch": 0.8275698158051099, + "grad_norm": 0.39928211755659204, + "learning_rate": 1.3899480371783487e-05, + "loss": 0.8296, + "num_tokens": 29107554373.0, + "step": 6964 + }, + { + "epoch": 0.827688651218063, + "grad_norm": 0.3512019152629622, + "learning_rate": 1.3897840694750601e-05, + "loss": 0.8277, + "num_tokens": 29111742227.0, + "step": 6965 + }, + { + "epoch": 0.827807486631016, + "grad_norm": 0.3874882217266449, + "learning_rate": 1.3896200910400874e-05, + "loss": 0.8063, + "num_tokens": 29115932347.0, + "step": 6966 + }, + { + "epoch": 0.8279263220439691, + "grad_norm": 0.33269932280560727, + "learning_rate": 1.3894561018795037e-05, + "loss": 0.8015, + "num_tokens": 29120109777.0, + "step": 6967 + }, + { + "epoch": 0.8280451574569222, + "grad_norm": 0.4429980904826554, + "learning_rate": 1.3892921019993818e-05, + "loss": 0.8856, + "num_tokens": 29124299103.0, + "step": 6968 + }, + { + "epoch": 0.8281639928698752, + "grad_norm": 0.3359807239570176, + "learning_rate": 1.3891280914057952e-05, + "loss": 0.8083, + "num_tokens": 29128488556.0, + "step": 6969 + }, + { + "epoch": 0.8282828282828283, + "grad_norm": 0.3676068857724747, + "learning_rate": 1.3889640701048175e-05, + "loss": 0.7885, + "num_tokens": 29132678511.0, + "step": 6970 + }, + { + "epoch": 0.8284016636957814, + "grad_norm": 0.36432758544804655, + "learning_rate": 1.3888000381025237e-05, + "loss": 0.8486, + "num_tokens": 29136838355.0, + "step": 6971 + }, + { + "epoch": 0.8285204991087344, + "grad_norm": 0.3905897763453594, + "learning_rate": 1.388635995404988e-05, + "loss": 0.8391, + "num_tokens": 29140998531.0, + "step": 6972 + }, + { + "epoch": 0.8286393345216875, + "grad_norm": 0.42653198467525044, + "learning_rate": 1.3884719420182855e-05, + "loss": 0.8477, + "num_tokens": 29145173300.0, + "step": 6973 + }, + { + "epoch": 0.8287581699346406, + "grad_norm": 0.44390396005993027, + "learning_rate": 1.3883078779484915e-05, + "loss": 0.8352, + "num_tokens": 29149356251.0, + "step": 6974 + }, + { + "epoch": 0.8288770053475936, + "grad_norm": 0.3388915281240483, + "learning_rate": 1.3881438032016818e-05, + "loss": 0.8252, + "num_tokens": 29153545466.0, + "step": 6975 + }, + { + "epoch": 0.8289958407605467, + "grad_norm": 0.3653271131451889, + "learning_rate": 1.3879797177839332e-05, + "loss": 0.8476, + "num_tokens": 29157707013.0, + "step": 6976 + }, + { + "epoch": 0.8291146761734997, + "grad_norm": 0.5154810382745512, + "learning_rate": 1.3878156217013217e-05, + "loss": 0.8433, + "num_tokens": 29161896809.0, + "step": 6977 + }, + { + "epoch": 0.8292335115864528, + "grad_norm": 0.3814718627587497, + "learning_rate": 1.3876515149599249e-05, + "loss": 0.8264, + "num_tokens": 29166086073.0, + "step": 6978 + }, + { + "epoch": 0.8293523469994059, + "grad_norm": 0.37053449915901476, + "learning_rate": 1.3874873975658196e-05, + "loss": 0.8545, + "num_tokens": 29170274468.0, + "step": 6979 + }, + { + "epoch": 0.8294711824123588, + "grad_norm": 0.4349962806830821, + "learning_rate": 1.387323269525084e-05, + "loss": 0.8403, + "num_tokens": 29174456868.0, + "step": 6980 + }, + { + "epoch": 0.8295900178253119, + "grad_norm": 0.3430389573545644, + "learning_rate": 1.3871591308437964e-05, + "loss": 0.7984, + "num_tokens": 29178645310.0, + "step": 6981 + }, + { + "epoch": 0.829708853238265, + "grad_norm": 0.47139262516783564, + "learning_rate": 1.3869949815280352e-05, + "loss": 0.8402, + "num_tokens": 29182812039.0, + "step": 6982 + }, + { + "epoch": 0.829827688651218, + "grad_norm": 0.3585557316869384, + "learning_rate": 1.3868308215838797e-05, + "loss": 0.7854, + "num_tokens": 29186970483.0, + "step": 6983 + }, + { + "epoch": 0.8299465240641711, + "grad_norm": 0.4908094150441668, + "learning_rate": 1.3866666510174088e-05, + "loss": 0.8589, + "num_tokens": 29191147979.0, + "step": 6984 + }, + { + "epoch": 0.8300653594771242, + "grad_norm": 0.4066684554936646, + "learning_rate": 1.3865024698347024e-05, + "loss": 0.8701, + "num_tokens": 29195307532.0, + "step": 6985 + }, + { + "epoch": 0.8301841948900772, + "grad_norm": 0.4314355519610966, + "learning_rate": 1.3863382780418413e-05, + "loss": 0.8294, + "num_tokens": 29199495536.0, + "step": 6986 + }, + { + "epoch": 0.8303030303030303, + "grad_norm": 0.4259315907345476, + "learning_rate": 1.3861740756449056e-05, + "loss": 0.8254, + "num_tokens": 29203661268.0, + "step": 6987 + }, + { + "epoch": 0.8304218657159833, + "grad_norm": 0.4302733816631677, + "learning_rate": 1.386009862649976e-05, + "loss": 0.7984, + "num_tokens": 29207770024.0, + "step": 6988 + }, + { + "epoch": 0.8305407011289364, + "grad_norm": 0.45530954586839745, + "learning_rate": 1.3858456390631345e-05, + "loss": 0.8228, + "num_tokens": 29211959448.0, + "step": 6989 + }, + { + "epoch": 0.8306595365418895, + "grad_norm": 0.3515478054521148, + "learning_rate": 1.3856814048904624e-05, + "loss": 0.8476, + "num_tokens": 29216147243.0, + "step": 6990 + }, + { + "epoch": 0.8307783719548425, + "grad_norm": 0.48441170293432806, + "learning_rate": 1.3855171601380417e-05, + "loss": 0.8011, + "num_tokens": 29220337063.0, + "step": 6991 + }, + { + "epoch": 0.8308972073677956, + "grad_norm": 0.35387221807929353, + "learning_rate": 1.3853529048119554e-05, + "loss": 0.8219, + "num_tokens": 29224475089.0, + "step": 6992 + }, + { + "epoch": 0.8310160427807487, + "grad_norm": 0.4613952714952431, + "learning_rate": 1.3851886389182863e-05, + "loss": 0.7943, + "num_tokens": 29228662817.0, + "step": 6993 + }, + { + "epoch": 0.8311348781937017, + "grad_norm": 0.4025921028862825, + "learning_rate": 1.3850243624631177e-05, + "loss": 0.8083, + "num_tokens": 29232840460.0, + "step": 6994 + }, + { + "epoch": 0.8312537136066548, + "grad_norm": 0.40909628877549753, + "learning_rate": 1.384860075452533e-05, + "loss": 0.8457, + "num_tokens": 29237028812.0, + "step": 6995 + }, + { + "epoch": 0.8313725490196079, + "grad_norm": 0.4191971579288941, + "learning_rate": 1.3846957778926166e-05, + "loss": 0.8232, + "num_tokens": 29241216945.0, + "step": 6996 + }, + { + "epoch": 0.8314913844325609, + "grad_norm": 0.3866395447172364, + "learning_rate": 1.3845314697894534e-05, + "loss": 0.8293, + "num_tokens": 29245404145.0, + "step": 6997 + }, + { + "epoch": 0.831610219845514, + "grad_norm": 0.4167466062288816, + "learning_rate": 1.3843671511491274e-05, + "loss": 0.8776, + "num_tokens": 29249594258.0, + "step": 6998 + }, + { + "epoch": 0.8317290552584671, + "grad_norm": 0.39642570000976984, + "learning_rate": 1.3842028219777244e-05, + "loss": 0.8132, + "num_tokens": 29253783336.0, + "step": 6999 + }, + { + "epoch": 0.8318478906714201, + "grad_norm": 0.43259971827994437, + "learning_rate": 1.3840384822813302e-05, + "loss": 0.8418, + "num_tokens": 29257967827.0, + "step": 7000 + }, + { + "epoch": 0.8319667260843732, + "grad_norm": 0.3774761424966895, + "learning_rate": 1.3838741320660303e-05, + "loss": 0.8528, + "num_tokens": 29262157627.0, + "step": 7001 + }, + { + "epoch": 0.8320855614973262, + "grad_norm": 0.4524748427733865, + "learning_rate": 1.3837097713379119e-05, + "loss": 0.7915, + "num_tokens": 29266347324.0, + "step": 7002 + }, + { + "epoch": 0.8322043969102793, + "grad_norm": 0.42056352599283364, + "learning_rate": 1.3835454001030613e-05, + "loss": 0.8582, + "num_tokens": 29270523334.0, + "step": 7003 + }, + { + "epoch": 0.8323232323232324, + "grad_norm": 0.3699990341907225, + "learning_rate": 1.3833810183675656e-05, + "loss": 0.8325, + "num_tokens": 29274713077.0, + "step": 7004 + }, + { + "epoch": 0.8324420677361853, + "grad_norm": 0.4884069759599609, + "learning_rate": 1.3832166261375128e-05, + "loss": 0.8334, + "num_tokens": 29278903027.0, + "step": 7005 + }, + { + "epoch": 0.8325609031491384, + "grad_norm": 0.41513593578134184, + "learning_rate": 1.383052223418991e-05, + "loss": 0.8771, + "num_tokens": 29283091921.0, + "step": 7006 + }, + { + "epoch": 0.8326797385620915, + "grad_norm": 0.4064080692739114, + "learning_rate": 1.3828878102180878e-05, + "loss": 0.8771, + "num_tokens": 29287280697.0, + "step": 7007 + }, + { + "epoch": 0.8327985739750445, + "grad_norm": 0.4628292386392186, + "learning_rate": 1.3827233865408931e-05, + "loss": 0.8224, + "num_tokens": 29291468546.0, + "step": 7008 + }, + { + "epoch": 0.8329174093879976, + "grad_norm": 0.4019152152937425, + "learning_rate": 1.3825589523934952e-05, + "loss": 0.8512, + "num_tokens": 29295634748.0, + "step": 7009 + }, + { + "epoch": 0.8330362448009507, + "grad_norm": 0.36609804022505205, + "learning_rate": 1.382394507781984e-05, + "loss": 0.8515, + "num_tokens": 29299824626.0, + "step": 7010 + }, + { + "epoch": 0.8331550802139037, + "grad_norm": 0.46022138348828456, + "learning_rate": 1.382230052712449e-05, + "loss": 0.8397, + "num_tokens": 29304007596.0, + "step": 7011 + }, + { + "epoch": 0.8332739156268568, + "grad_norm": 0.2960584545386039, + "learning_rate": 1.3820655871909814e-05, + "loss": 0.8361, + "num_tokens": 29308196681.0, + "step": 7012 + }, + { + "epoch": 0.8333927510398098, + "grad_norm": 0.4423119191434758, + "learning_rate": 1.3819011112236715e-05, + "loss": 0.8332, + "num_tokens": 29312369357.0, + "step": 7013 + }, + { + "epoch": 0.8335115864527629, + "grad_norm": 0.3727555244430807, + "learning_rate": 1.3817366248166098e-05, + "loss": 0.8501, + "num_tokens": 29316558242.0, + "step": 7014 + }, + { + "epoch": 0.833630421865716, + "grad_norm": 0.42420545083848776, + "learning_rate": 1.3815721279758889e-05, + "loss": 0.8285, + "num_tokens": 29320747060.0, + "step": 7015 + }, + { + "epoch": 0.833749257278669, + "grad_norm": 0.34911101255745075, + "learning_rate": 1.3814076207075998e-05, + "loss": 0.8111, + "num_tokens": 29324937105.0, + "step": 7016 + }, + { + "epoch": 0.8338680926916221, + "grad_norm": 0.3777362280269245, + "learning_rate": 1.3812431030178349e-05, + "loss": 0.831, + "num_tokens": 29329126078.0, + "step": 7017 + }, + { + "epoch": 0.8339869281045752, + "grad_norm": 0.34618055226326566, + "learning_rate": 1.381078574912687e-05, + "loss": 0.822, + "num_tokens": 29333310019.0, + "step": 7018 + }, + { + "epoch": 0.8341057635175282, + "grad_norm": 0.3300683104389525, + "learning_rate": 1.3809140363982494e-05, + "loss": 0.8419, + "num_tokens": 29337496515.0, + "step": 7019 + }, + { + "epoch": 0.8342245989304813, + "grad_norm": 0.3397696617796697, + "learning_rate": 1.380749487480615e-05, + "loss": 0.8428, + "num_tokens": 29341661559.0, + "step": 7020 + }, + { + "epoch": 0.8343434343434344, + "grad_norm": 0.3824495684686094, + "learning_rate": 1.3805849281658779e-05, + "loss": 0.8218, + "num_tokens": 29345815363.0, + "step": 7021 + }, + { + "epoch": 0.8344622697563874, + "grad_norm": 0.378051844407383, + "learning_rate": 1.3804203584601324e-05, + "loss": 0.836, + "num_tokens": 29350002401.0, + "step": 7022 + }, + { + "epoch": 0.8345811051693405, + "grad_norm": 0.363093262358592, + "learning_rate": 1.3802557783694725e-05, + "loss": 0.8496, + "num_tokens": 29354183743.0, + "step": 7023 + }, + { + "epoch": 0.8346999405822936, + "grad_norm": 0.3702241033078867, + "learning_rate": 1.3800911878999939e-05, + "loss": 0.8383, + "num_tokens": 29358371275.0, + "step": 7024 + }, + { + "epoch": 0.8348187759952466, + "grad_norm": 0.4297405224410879, + "learning_rate": 1.3799265870577913e-05, + "loss": 0.8516, + "num_tokens": 29362560659.0, + "step": 7025 + }, + { + "epoch": 0.8349376114081997, + "grad_norm": 0.422891781491517, + "learning_rate": 1.379761975848961e-05, + "loss": 0.7971, + "num_tokens": 29366748973.0, + "step": 7026 + }, + { + "epoch": 0.8350564468211527, + "grad_norm": 0.3971572355415901, + "learning_rate": 1.3795973542795988e-05, + "loss": 0.8315, + "num_tokens": 29370940758.0, + "step": 7027 + }, + { + "epoch": 0.8351752822341058, + "grad_norm": 0.3965475770438105, + "learning_rate": 1.3794327223558012e-05, + "loss": 0.8632, + "num_tokens": 29375129485.0, + "step": 7028 + }, + { + "epoch": 0.8352941176470589, + "grad_norm": 0.3906756227645609, + "learning_rate": 1.3792680800836652e-05, + "loss": 0.8365, + "num_tokens": 29379295617.0, + "step": 7029 + }, + { + "epoch": 0.8354129530600118, + "grad_norm": 0.47214845449205944, + "learning_rate": 1.3791034274692878e-05, + "loss": 0.8397, + "num_tokens": 29383485325.0, + "step": 7030 + }, + { + "epoch": 0.8355317884729649, + "grad_norm": 0.38678562943374933, + "learning_rate": 1.378938764518767e-05, + "loss": 0.8402, + "num_tokens": 29387660099.0, + "step": 7031 + }, + { + "epoch": 0.835650623885918, + "grad_norm": 0.3868708687181447, + "learning_rate": 1.3787740912382005e-05, + "loss": 0.8384, + "num_tokens": 29391847865.0, + "step": 7032 + }, + { + "epoch": 0.835769459298871, + "grad_norm": 0.40556889094389964, + "learning_rate": 1.378609407633687e-05, + "loss": 0.872, + "num_tokens": 29396034504.0, + "step": 7033 + }, + { + "epoch": 0.8358882947118241, + "grad_norm": 0.4633068243155352, + "learning_rate": 1.378444713711325e-05, + "loss": 0.8131, + "num_tokens": 29400223718.0, + "step": 7034 + }, + { + "epoch": 0.8360071301247772, + "grad_norm": 0.3794970944838372, + "learning_rate": 1.3782800094772138e-05, + "loss": 0.8264, + "num_tokens": 29404412930.0, + "step": 7035 + }, + { + "epoch": 0.8361259655377302, + "grad_norm": 0.36289684518069376, + "learning_rate": 1.3781152949374527e-05, + "loss": 0.8279, + "num_tokens": 29408596715.0, + "step": 7036 + }, + { + "epoch": 0.8362448009506833, + "grad_norm": 0.47133912940617073, + "learning_rate": 1.3779505700981424e-05, + "loss": 0.8311, + "num_tokens": 29412733820.0, + "step": 7037 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 0.40022231221537174, + "learning_rate": 1.3777858349653822e-05, + "loss": 0.8507, + "num_tokens": 29416913166.0, + "step": 7038 + }, + { + "epoch": 0.8364824717765894, + "grad_norm": 0.39756266246627686, + "learning_rate": 1.3776210895452738e-05, + "loss": 0.8345, + "num_tokens": 29421084537.0, + "step": 7039 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.36484401794641563, + "learning_rate": 1.3774563338439176e-05, + "loss": 0.7972, + "num_tokens": 29425273821.0, + "step": 7040 + }, + { + "epoch": 0.8367201426024955, + "grad_norm": 0.4319786181614781, + "learning_rate": 1.3772915678674152e-05, + "loss": 0.8709, + "num_tokens": 29429445382.0, + "step": 7041 + }, + { + "epoch": 0.8368389780154486, + "grad_norm": 0.3873277361911241, + "learning_rate": 1.3771267916218688e-05, + "loss": 0.8145, + "num_tokens": 29433620836.0, + "step": 7042 + }, + { + "epoch": 0.8369578134284017, + "grad_norm": 0.38006151546554334, + "learning_rate": 1.3769620051133796e-05, + "loss": 0.8625, + "num_tokens": 29437809049.0, + "step": 7043 + }, + { + "epoch": 0.8370766488413547, + "grad_norm": 0.4632991022438952, + "learning_rate": 1.3767972083480516e-05, + "loss": 0.8319, + "num_tokens": 29441999825.0, + "step": 7044 + }, + { + "epoch": 0.8371954842543078, + "grad_norm": 0.45908854812279715, + "learning_rate": 1.3766324013319872e-05, + "loss": 0.825, + "num_tokens": 29446170628.0, + "step": 7045 + }, + { + "epoch": 0.8373143196672609, + "grad_norm": 0.3744795505411975, + "learning_rate": 1.376467584071289e-05, + "loss": 0.8502, + "num_tokens": 29450358437.0, + "step": 7046 + }, + { + "epoch": 0.8374331550802139, + "grad_norm": 0.37574817413653994, + "learning_rate": 1.376302756572062e-05, + "loss": 0.8155, + "num_tokens": 29454548776.0, + "step": 7047 + }, + { + "epoch": 0.837551990493167, + "grad_norm": 0.40435910838337913, + "learning_rate": 1.3761379188404094e-05, + "loss": 0.8533, + "num_tokens": 29458738054.0, + "step": 7048 + }, + { + "epoch": 0.8376708259061201, + "grad_norm": 0.44304828424728204, + "learning_rate": 1.3759730708824358e-05, + "loss": 0.8247, + "num_tokens": 29462910589.0, + "step": 7049 + }, + { + "epoch": 0.8377896613190731, + "grad_norm": 0.3789458451074998, + "learning_rate": 1.3758082127042469e-05, + "loss": 0.8359, + "num_tokens": 29467098167.0, + "step": 7050 + }, + { + "epoch": 0.8379084967320262, + "grad_norm": 0.3835866822408082, + "learning_rate": 1.3756433443119468e-05, + "loss": 0.8495, + "num_tokens": 29471286514.0, + "step": 7051 + }, + { + "epoch": 0.8380273321449792, + "grad_norm": 0.3948700177615949, + "learning_rate": 1.375478465711642e-05, + "loss": 0.8111, + "num_tokens": 29475475273.0, + "step": 7052 + }, + { + "epoch": 0.8381461675579323, + "grad_norm": 0.3878709492498718, + "learning_rate": 1.375313576909438e-05, + "loss": 0.8434, + "num_tokens": 29479664147.0, + "step": 7053 + }, + { + "epoch": 0.8382650029708854, + "grad_norm": 0.39060656885964123, + "learning_rate": 1.3751486779114415e-05, + "loss": 0.8189, + "num_tokens": 29483853484.0, + "step": 7054 + }, + { + "epoch": 0.8383838383838383, + "grad_norm": 0.42349336420796885, + "learning_rate": 1.3749837687237593e-05, + "loss": 0.8879, + "num_tokens": 29488014717.0, + "step": 7055 + }, + { + "epoch": 0.8385026737967914, + "grad_norm": 0.4181412356837525, + "learning_rate": 1.3748188493524984e-05, + "loss": 0.8484, + "num_tokens": 29492203171.0, + "step": 7056 + }, + { + "epoch": 0.8386215092097445, + "grad_norm": 0.39442347364434793, + "learning_rate": 1.3746539198037665e-05, + "loss": 0.82, + "num_tokens": 29496392285.0, + "step": 7057 + }, + { + "epoch": 0.8387403446226975, + "grad_norm": 0.399824779831625, + "learning_rate": 1.3744889800836705e-05, + "loss": 0.823, + "num_tokens": 29500582131.0, + "step": 7058 + }, + { + "epoch": 0.8388591800356506, + "grad_norm": 0.32881760524772946, + "learning_rate": 1.37432403019832e-05, + "loss": 0.8932, + "num_tokens": 29504770889.0, + "step": 7059 + }, + { + "epoch": 0.8389780154486037, + "grad_norm": 0.32425905411304157, + "learning_rate": 1.3741590701538237e-05, + "loss": 0.8421, + "num_tokens": 29508960007.0, + "step": 7060 + }, + { + "epoch": 0.8390968508615567, + "grad_norm": 0.39642126783845016, + "learning_rate": 1.3739940999562893e-05, + "loss": 0.8399, + "num_tokens": 29513102661.0, + "step": 7061 + }, + { + "epoch": 0.8392156862745098, + "grad_norm": 0.44634611522116746, + "learning_rate": 1.3738291196118272e-05, + "loss": 0.8438, + "num_tokens": 29517262331.0, + "step": 7062 + }, + { + "epoch": 0.8393345216874628, + "grad_norm": 0.3721988233087739, + "learning_rate": 1.3736641291265471e-05, + "loss": 0.8139, + "num_tokens": 29521434967.0, + "step": 7063 + }, + { + "epoch": 0.8394533571004159, + "grad_norm": 0.4314820008648749, + "learning_rate": 1.373499128506559e-05, + "loss": 0.8105, + "num_tokens": 29525624989.0, + "step": 7064 + }, + { + "epoch": 0.839572192513369, + "grad_norm": 0.3680555373769979, + "learning_rate": 1.373334117757973e-05, + "loss": 0.8545, + "num_tokens": 29529813805.0, + "step": 7065 + }, + { + "epoch": 0.839691027926322, + "grad_norm": 0.3778230199654828, + "learning_rate": 1.373169096886901e-05, + "loss": 0.8244, + "num_tokens": 29534002739.0, + "step": 7066 + }, + { + "epoch": 0.8398098633392751, + "grad_norm": 0.38554564180836187, + "learning_rate": 1.3730040658994535e-05, + "loss": 0.7839, + "num_tokens": 29538191740.0, + "step": 7067 + }, + { + "epoch": 0.8399286987522282, + "grad_norm": 0.3328999135244654, + "learning_rate": 1.3728390248017427e-05, + "loss": 0.8743, + "num_tokens": 29542380687.0, + "step": 7068 + }, + { + "epoch": 0.8400475341651812, + "grad_norm": 0.4418852698859145, + "learning_rate": 1.3726739735998797e-05, + "loss": 0.8304, + "num_tokens": 29546508600.0, + "step": 7069 + }, + { + "epoch": 0.8401663695781343, + "grad_norm": 0.3541715278530772, + "learning_rate": 1.3725089122999784e-05, + "loss": 0.854, + "num_tokens": 29550681673.0, + "step": 7070 + }, + { + "epoch": 0.8402852049910874, + "grad_norm": 0.594358861073733, + "learning_rate": 1.3723438409081498e-05, + "loss": 0.8326, + "num_tokens": 29554798916.0, + "step": 7071 + }, + { + "epoch": 0.8404040404040404, + "grad_norm": 0.38794870534606196, + "learning_rate": 1.3721787594305083e-05, + "loss": 0.8766, + "num_tokens": 29558961477.0, + "step": 7072 + }, + { + "epoch": 0.8405228758169935, + "grad_norm": 0.462142335104797, + "learning_rate": 1.3720136678731674e-05, + "loss": 0.7781, + "num_tokens": 29563121305.0, + "step": 7073 + }, + { + "epoch": 0.8406417112299466, + "grad_norm": 0.4869654942651242, + "learning_rate": 1.3718485662422403e-05, + "loss": 0.7805, + "num_tokens": 29567309839.0, + "step": 7074 + }, + { + "epoch": 0.8407605466428996, + "grad_norm": 0.3687861797005506, + "learning_rate": 1.371683454543842e-05, + "loss": 0.8627, + "num_tokens": 29571469216.0, + "step": 7075 + }, + { + "epoch": 0.8408793820558527, + "grad_norm": 0.4947911008309961, + "learning_rate": 1.3715183327840864e-05, + "loss": 0.8276, + "num_tokens": 29575658760.0, + "step": 7076 + }, + { + "epoch": 0.8409982174688057, + "grad_norm": 0.37843456015263127, + "learning_rate": 1.3713532009690891e-05, + "loss": 0.8674, + "num_tokens": 29579847939.0, + "step": 7077 + }, + { + "epoch": 0.8411170528817588, + "grad_norm": 0.39968989515863074, + "learning_rate": 1.3711880591049653e-05, + "loss": 0.7884, + "num_tokens": 29584037753.0, + "step": 7078 + }, + { + "epoch": 0.8412358882947119, + "grad_norm": 0.45355195499144285, + "learning_rate": 1.3710229071978308e-05, + "loss": 0.8634, + "num_tokens": 29588227572.0, + "step": 7079 + }, + { + "epoch": 0.8413547237076648, + "grad_norm": 0.3548214636974668, + "learning_rate": 1.3708577452538014e-05, + "loss": 0.8443, + "num_tokens": 29592411367.0, + "step": 7080 + }, + { + "epoch": 0.8414735591206179, + "grad_norm": 0.3908968062382281, + "learning_rate": 1.3706925732789942e-05, + "loss": 0.8412, + "num_tokens": 29596601690.0, + "step": 7081 + }, + { + "epoch": 0.841592394533571, + "grad_norm": 0.40271479232657403, + "learning_rate": 1.3705273912795259e-05, + "loss": 0.8529, + "num_tokens": 29600791894.0, + "step": 7082 + }, + { + "epoch": 0.841711229946524, + "grad_norm": 0.4576772979773952, + "learning_rate": 1.3703621992615135e-05, + "loss": 0.867, + "num_tokens": 29604975683.0, + "step": 7083 + }, + { + "epoch": 0.8418300653594771, + "grad_norm": 0.39619317768687995, + "learning_rate": 1.3701969972310742e-05, + "loss": 0.8573, + "num_tokens": 29609144327.0, + "step": 7084 + }, + { + "epoch": 0.8419489007724302, + "grad_norm": 0.43090252611889335, + "learning_rate": 1.3700317851943272e-05, + "loss": 0.845, + "num_tokens": 29613296702.0, + "step": 7085 + }, + { + "epoch": 0.8420677361853832, + "grad_norm": 0.3745557586544133, + "learning_rate": 1.3698665631573902e-05, + "loss": 0.8223, + "num_tokens": 29617486708.0, + "step": 7086 + }, + { + "epoch": 0.8421865715983363, + "grad_norm": 0.36173476803283594, + "learning_rate": 1.3697013311263815e-05, + "loss": 0.83, + "num_tokens": 29621655651.0, + "step": 7087 + }, + { + "epoch": 0.8423054070112893, + "grad_norm": 0.3926561530635941, + "learning_rate": 1.3695360891074212e-05, + "loss": 0.8183, + "num_tokens": 29625812576.0, + "step": 7088 + }, + { + "epoch": 0.8424242424242424, + "grad_norm": 0.36805410556891327, + "learning_rate": 1.3693708371066278e-05, + "loss": 0.8497, + "num_tokens": 29630002549.0, + "step": 7089 + }, + { + "epoch": 0.8425430778371955, + "grad_norm": 0.3624796295795989, + "learning_rate": 1.3692055751301212e-05, + "loss": 0.8272, + "num_tokens": 29634191884.0, + "step": 7090 + }, + { + "epoch": 0.8426619132501485, + "grad_norm": 0.40399919101048903, + "learning_rate": 1.3690403031840228e-05, + "loss": 0.874, + "num_tokens": 29638380944.0, + "step": 7091 + }, + { + "epoch": 0.8427807486631016, + "grad_norm": 0.35254714544808424, + "learning_rate": 1.3688750212744517e-05, + "loss": 0.8568, + "num_tokens": 29642537935.0, + "step": 7092 + }, + { + "epoch": 0.8428995840760547, + "grad_norm": 0.42127510786614153, + "learning_rate": 1.3687097294075296e-05, + "loss": 0.8231, + "num_tokens": 29646695468.0, + "step": 7093 + }, + { + "epoch": 0.8430184194890077, + "grad_norm": 0.32115045977008044, + "learning_rate": 1.3685444275893777e-05, + "loss": 0.873, + "num_tokens": 29650861045.0, + "step": 7094 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 0.44758546188616194, + "learning_rate": 1.3683791158261177e-05, + "loss": 0.861, + "num_tokens": 29655050267.0, + "step": 7095 + }, + { + "epoch": 0.8432560903149139, + "grad_norm": 0.4253852114157669, + "learning_rate": 1.3682137941238719e-05, + "loss": 0.8852, + "num_tokens": 29659239598.0, + "step": 7096 + }, + { + "epoch": 0.8433749257278669, + "grad_norm": 0.38682657529859066, + "learning_rate": 1.368048462488762e-05, + "loss": 0.8394, + "num_tokens": 29663429860.0, + "step": 7097 + }, + { + "epoch": 0.84349376114082, + "grad_norm": 0.4228790818020232, + "learning_rate": 1.3678831209269114e-05, + "loss": 0.8261, + "num_tokens": 29667610198.0, + "step": 7098 + }, + { + "epoch": 0.8436125965537731, + "grad_norm": 0.33574849253589384, + "learning_rate": 1.3677177694444433e-05, + "loss": 0.8287, + "num_tokens": 29671788799.0, + "step": 7099 + }, + { + "epoch": 0.8437314319667261, + "grad_norm": 0.3995323247750478, + "learning_rate": 1.3675524080474807e-05, + "loss": 0.8329, + "num_tokens": 29675928517.0, + "step": 7100 + }, + { + "epoch": 0.8438502673796792, + "grad_norm": 0.34520107458087945, + "learning_rate": 1.367387036742148e-05, + "loss": 0.8255, + "num_tokens": 29680118155.0, + "step": 7101 + }, + { + "epoch": 0.8439691027926322, + "grad_norm": 0.3904060338318916, + "learning_rate": 1.3672216555345695e-05, + "loss": 0.7783, + "num_tokens": 29684307199.0, + "step": 7102 + }, + { + "epoch": 0.8440879382055853, + "grad_norm": 0.3675044199830288, + "learning_rate": 1.3670562644308691e-05, + "loss": 0.8151, + "num_tokens": 29688496185.0, + "step": 7103 + }, + { + "epoch": 0.8442067736185384, + "grad_norm": 0.37286984394909656, + "learning_rate": 1.3668908634371725e-05, + "loss": 0.8777, + "num_tokens": 29692671766.0, + "step": 7104 + }, + { + "epoch": 0.8443256090314913, + "grad_norm": 0.3868569741019669, + "learning_rate": 1.3667254525596049e-05, + "loss": 0.836, + "num_tokens": 29696860908.0, + "step": 7105 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.4619192844966575, + "learning_rate": 1.3665600318042918e-05, + "loss": 0.8592, + "num_tokens": 29701029039.0, + "step": 7106 + }, + { + "epoch": 0.8445632798573975, + "grad_norm": 0.33975949422333923, + "learning_rate": 1.36639460117736e-05, + "loss": 0.7967, + "num_tokens": 29705190176.0, + "step": 7107 + }, + { + "epoch": 0.8446821152703505, + "grad_norm": 0.3352906407394728, + "learning_rate": 1.3662291606849347e-05, + "loss": 0.8484, + "num_tokens": 29709377561.0, + "step": 7108 + }, + { + "epoch": 0.8448009506833036, + "grad_norm": 0.3466477700358326, + "learning_rate": 1.3660637103331437e-05, + "loss": 0.7996, + "num_tokens": 29713566261.0, + "step": 7109 + }, + { + "epoch": 0.8449197860962567, + "grad_norm": 0.36691300829202134, + "learning_rate": 1.3658982501281142e-05, + "loss": 0.8295, + "num_tokens": 29717755068.0, + "step": 7110 + }, + { + "epoch": 0.8450386215092097, + "grad_norm": 0.4172234127754619, + "learning_rate": 1.3657327800759732e-05, + "loss": 0.8591, + "num_tokens": 29721944217.0, + "step": 7111 + }, + { + "epoch": 0.8451574569221628, + "grad_norm": 0.39632639750442833, + "learning_rate": 1.3655673001828488e-05, + "loss": 0.8163, + "num_tokens": 29726134277.0, + "step": 7112 + }, + { + "epoch": 0.8452762923351159, + "grad_norm": 0.3855313288576199, + "learning_rate": 1.3654018104548697e-05, + "loss": 0.8663, + "num_tokens": 29730324617.0, + "step": 7113 + }, + { + "epoch": 0.8453951277480689, + "grad_norm": 0.42035900153610867, + "learning_rate": 1.365236310898164e-05, + "loss": 0.7917, + "num_tokens": 29734501794.0, + "step": 7114 + }, + { + "epoch": 0.845513963161022, + "grad_norm": 0.3806634031009775, + "learning_rate": 1.3650708015188612e-05, + "loss": 0.8196, + "num_tokens": 29738691271.0, + "step": 7115 + }, + { + "epoch": 0.845632798573975, + "grad_norm": 0.41824132797893193, + "learning_rate": 1.36490528232309e-05, + "loss": 0.8319, + "num_tokens": 29742844134.0, + "step": 7116 + }, + { + "epoch": 0.8457516339869281, + "grad_norm": 0.3851016929100552, + "learning_rate": 1.3647397533169806e-05, + "loss": 0.8699, + "num_tokens": 29747030270.0, + "step": 7117 + }, + { + "epoch": 0.8458704693998812, + "grad_norm": 0.41656907357321626, + "learning_rate": 1.3645742145066632e-05, + "loss": 0.8448, + "num_tokens": 29751207763.0, + "step": 7118 + }, + { + "epoch": 0.8459893048128342, + "grad_norm": 0.40185451486807994, + "learning_rate": 1.3644086658982679e-05, + "loss": 0.8082, + "num_tokens": 29755387278.0, + "step": 7119 + }, + { + "epoch": 0.8461081402257873, + "grad_norm": 0.4153437208810737, + "learning_rate": 1.3642431074979258e-05, + "loss": 0.8101, + "num_tokens": 29759576784.0, + "step": 7120 + }, + { + "epoch": 0.8462269756387404, + "grad_norm": 0.44991457301273036, + "learning_rate": 1.3640775393117682e-05, + "loss": 0.8236, + "num_tokens": 29763725559.0, + "step": 7121 + }, + { + "epoch": 0.8463458110516934, + "grad_norm": 0.3425691513166564, + "learning_rate": 1.3639119613459262e-05, + "loss": 0.8017, + "num_tokens": 29767896176.0, + "step": 7122 + }, + { + "epoch": 0.8464646464646465, + "grad_norm": 0.45845281916474245, + "learning_rate": 1.3637463736065319e-05, + "loss": 0.8668, + "num_tokens": 29772055142.0, + "step": 7123 + }, + { + "epoch": 0.8465834818775996, + "grad_norm": 0.42785292956952287, + "learning_rate": 1.363580776099718e-05, + "loss": 0.8521, + "num_tokens": 29776244649.0, + "step": 7124 + }, + { + "epoch": 0.8467023172905526, + "grad_norm": 0.3430895191835088, + "learning_rate": 1.3634151688316163e-05, + "loss": 0.8423, + "num_tokens": 29780416039.0, + "step": 7125 + }, + { + "epoch": 0.8468211527035057, + "grad_norm": 0.4618132983754854, + "learning_rate": 1.3632495518083606e-05, + "loss": 0.8659, + "num_tokens": 29784574945.0, + "step": 7126 + }, + { + "epoch": 0.8469399881164587, + "grad_norm": 0.46102037664338247, + "learning_rate": 1.3630839250360838e-05, + "loss": 0.8389, + "num_tokens": 29788764226.0, + "step": 7127 + }, + { + "epoch": 0.8470588235294118, + "grad_norm": 0.38108362000090656, + "learning_rate": 1.3629182885209197e-05, + "loss": 0.84, + "num_tokens": 29792935862.0, + "step": 7128 + }, + { + "epoch": 0.8471776589423649, + "grad_norm": 0.5392579136952711, + "learning_rate": 1.3627526422690024e-05, + "loss": 0.8475, + "num_tokens": 29797125387.0, + "step": 7129 + }, + { + "epoch": 0.8472964943553178, + "grad_norm": 0.3860843642029128, + "learning_rate": 1.3625869862864669e-05, + "loss": 0.8443, + "num_tokens": 29801314015.0, + "step": 7130 + }, + { + "epoch": 0.8474153297682709, + "grad_norm": 0.5610715188596277, + "learning_rate": 1.362421320579447e-05, + "loss": 0.8326, + "num_tokens": 29805495049.0, + "step": 7131 + }, + { + "epoch": 0.847534165181224, + "grad_norm": 0.3942713977384374, + "learning_rate": 1.3622556451540781e-05, + "loss": 0.7954, + "num_tokens": 29809683173.0, + "step": 7132 + }, + { + "epoch": 0.847653000594177, + "grad_norm": 0.5504614607860929, + "learning_rate": 1.3620899600164965e-05, + "loss": 0.8256, + "num_tokens": 29813865464.0, + "step": 7133 + }, + { + "epoch": 0.8477718360071301, + "grad_norm": 0.42022369455700914, + "learning_rate": 1.3619242651728373e-05, + "loss": 0.8155, + "num_tokens": 29818055311.0, + "step": 7134 + }, + { + "epoch": 0.8478906714200832, + "grad_norm": 0.5575448464074814, + "learning_rate": 1.3617585606292369e-05, + "loss": 0.8801, + "num_tokens": 29822238732.0, + "step": 7135 + }, + { + "epoch": 0.8480095068330362, + "grad_norm": 0.4172364162768059, + "learning_rate": 1.361592846391832e-05, + "loss": 0.8016, + "num_tokens": 29826429867.0, + "step": 7136 + }, + { + "epoch": 0.8481283422459893, + "grad_norm": 0.5286407767213869, + "learning_rate": 1.3614271224667597e-05, + "loss": 0.8022, + "num_tokens": 29830590957.0, + "step": 7137 + }, + { + "epoch": 0.8482471776589424, + "grad_norm": 0.44589581621796526, + "learning_rate": 1.361261388860157e-05, + "loss": 0.8524, + "num_tokens": 29834756678.0, + "step": 7138 + }, + { + "epoch": 0.8483660130718954, + "grad_norm": 0.5224445058150975, + "learning_rate": 1.3610956455781617e-05, + "loss": 0.8551, + "num_tokens": 29838945663.0, + "step": 7139 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 0.46508218353160247, + "learning_rate": 1.3609298926269117e-05, + "loss": 0.8336, + "num_tokens": 29843109491.0, + "step": 7140 + }, + { + "epoch": 0.8486036838978015, + "grad_norm": 0.5032128944776915, + "learning_rate": 1.3607641300125458e-05, + "loss": 0.815, + "num_tokens": 29847299341.0, + "step": 7141 + }, + { + "epoch": 0.8487225193107546, + "grad_norm": 0.427852181633146, + "learning_rate": 1.3605983577412023e-05, + "loss": 0.8425, + "num_tokens": 29851488515.0, + "step": 7142 + }, + { + "epoch": 0.8488413547237077, + "grad_norm": 0.47599181542318275, + "learning_rate": 1.3604325758190209e-05, + "loss": 0.8288, + "num_tokens": 29855658028.0, + "step": 7143 + }, + { + "epoch": 0.8489601901366607, + "grad_norm": 0.4038345422055148, + "learning_rate": 1.3602667842521402e-05, + "loss": 0.8433, + "num_tokens": 29859847348.0, + "step": 7144 + }, + { + "epoch": 0.8490790255496138, + "grad_norm": 0.3877456728611444, + "learning_rate": 1.3601009830467008e-05, + "loss": 0.8221, + "num_tokens": 29864037079.0, + "step": 7145 + }, + { + "epoch": 0.8491978609625669, + "grad_norm": 0.4309149852848363, + "learning_rate": 1.3599351722088428e-05, + "loss": 0.8435, + "num_tokens": 29868200065.0, + "step": 7146 + }, + { + "epoch": 0.8493166963755199, + "grad_norm": 0.39264137464110366, + "learning_rate": 1.3597693517447058e-05, + "loss": 0.8093, + "num_tokens": 29872390224.0, + "step": 7147 + }, + { + "epoch": 0.849435531788473, + "grad_norm": 0.4463505879649508, + "learning_rate": 1.359603521660432e-05, + "loss": 0.8566, + "num_tokens": 29876578336.0, + "step": 7148 + }, + { + "epoch": 0.8495543672014261, + "grad_norm": 0.3822902608705336, + "learning_rate": 1.3594376819621617e-05, + "loss": 0.8561, + "num_tokens": 29880752550.0, + "step": 7149 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 0.47183989786674563, + "learning_rate": 1.3592718326560372e-05, + "loss": 0.8066, + "num_tokens": 29884914506.0, + "step": 7150 + }, + { + "epoch": 0.8497920380273322, + "grad_norm": 0.320532763550389, + "learning_rate": 1.3591059737482e-05, + "loss": 0.8338, + "num_tokens": 29889074307.0, + "step": 7151 + }, + { + "epoch": 0.8499108734402852, + "grad_norm": 0.4954969670103316, + "learning_rate": 1.3589401052447924e-05, + "loss": 0.8511, + "num_tokens": 29893243597.0, + "step": 7152 + }, + { + "epoch": 0.8500297088532383, + "grad_norm": 0.37635906640643474, + "learning_rate": 1.3587742271519573e-05, + "loss": 0.8319, + "num_tokens": 29897433557.0, + "step": 7153 + }, + { + "epoch": 0.8501485442661914, + "grad_norm": 0.48475615512850817, + "learning_rate": 1.3586083394758376e-05, + "loss": 0.7716, + "num_tokens": 29901622876.0, + "step": 7154 + }, + { + "epoch": 0.8502673796791443, + "grad_norm": 0.4261966272179752, + "learning_rate": 1.3584424422225769e-05, + "loss": 0.8281, + "num_tokens": 29905812424.0, + "step": 7155 + }, + { + "epoch": 0.8503862150920974, + "grad_norm": 0.40070536818684654, + "learning_rate": 1.3582765353983182e-05, + "loss": 0.8567, + "num_tokens": 29910002426.0, + "step": 7156 + }, + { + "epoch": 0.8505050505050505, + "grad_norm": 0.4524972738669759, + "learning_rate": 1.3581106190092067e-05, + "loss": 0.8266, + "num_tokens": 29914155988.0, + "step": 7157 + }, + { + "epoch": 0.8506238859180035, + "grad_norm": 0.3885251175209926, + "learning_rate": 1.357944693061386e-05, + "loss": 0.8664, + "num_tokens": 29918327975.0, + "step": 7158 + }, + { + "epoch": 0.8507427213309566, + "grad_norm": 0.5244751935971879, + "learning_rate": 1.3577787575610015e-05, + "loss": 0.8862, + "num_tokens": 29922500640.0, + "step": 7159 + }, + { + "epoch": 0.8508615567439097, + "grad_norm": 0.41940329629748285, + "learning_rate": 1.3576128125141976e-05, + "loss": 0.8385, + "num_tokens": 29926659529.0, + "step": 7160 + }, + { + "epoch": 0.8509803921568627, + "grad_norm": 0.5543025365976898, + "learning_rate": 1.3574468579271205e-05, + "loss": 0.8092, + "num_tokens": 29930848129.0, + "step": 7161 + }, + { + "epoch": 0.8510992275698158, + "grad_norm": 0.4471583436236855, + "learning_rate": 1.3572808938059158e-05, + "loss": 0.8441, + "num_tokens": 29935037419.0, + "step": 7162 + }, + { + "epoch": 0.8512180629827689, + "grad_norm": 0.5791776237148392, + "learning_rate": 1.3571149201567296e-05, + "loss": 0.8118, + "num_tokens": 29939227421.0, + "step": 7163 + }, + { + "epoch": 0.8513368983957219, + "grad_norm": 0.4376019512602802, + "learning_rate": 1.3569489369857087e-05, + "loss": 0.8569, + "num_tokens": 29943391756.0, + "step": 7164 + }, + { + "epoch": 0.851455733808675, + "grad_norm": 0.5867158472812697, + "learning_rate": 1.356782944299e-05, + "loss": 0.8553, + "num_tokens": 29947581159.0, + "step": 7165 + }, + { + "epoch": 0.851574569221628, + "grad_norm": 0.4738575741372837, + "learning_rate": 1.3566169421027504e-05, + "loss": 0.834, + "num_tokens": 29951744195.0, + "step": 7166 + }, + { + "epoch": 0.8516934046345811, + "grad_norm": 0.5487161750501354, + "learning_rate": 1.3564509304031082e-05, + "loss": 0.8182, + "num_tokens": 29955931335.0, + "step": 7167 + }, + { + "epoch": 0.8518122400475342, + "grad_norm": 0.45563913815112705, + "learning_rate": 1.3562849092062202e-05, + "loss": 0.8493, + "num_tokens": 29960121142.0, + "step": 7168 + }, + { + "epoch": 0.8519310754604872, + "grad_norm": 0.565234217600554, + "learning_rate": 1.3561188785182361e-05, + "loss": 0.8133, + "num_tokens": 29964310782.0, + "step": 7169 + }, + { + "epoch": 0.8520499108734403, + "grad_norm": 0.47292531735244187, + "learning_rate": 1.355952838345304e-05, + "loss": 0.8537, + "num_tokens": 29968499912.0, + "step": 7170 + }, + { + "epoch": 0.8521687462863934, + "grad_norm": 0.5290846654505159, + "learning_rate": 1.3557867886935727e-05, + "loss": 0.8247, + "num_tokens": 29972689644.0, + "step": 7171 + }, + { + "epoch": 0.8522875816993464, + "grad_norm": 0.4720535519810032, + "learning_rate": 1.3556207295691919e-05, + "loss": 0.8433, + "num_tokens": 29976878999.0, + "step": 7172 + }, + { + "epoch": 0.8524064171122995, + "grad_norm": 0.5335372348938385, + "learning_rate": 1.355454660978311e-05, + "loss": 0.8514, + "num_tokens": 29981069483.0, + "step": 7173 + }, + { + "epoch": 0.8525252525252526, + "grad_norm": 0.4782817032674877, + "learning_rate": 1.3552885829270806e-05, + "loss": 0.8691, + "num_tokens": 29985258362.0, + "step": 7174 + }, + { + "epoch": 0.8526440879382056, + "grad_norm": 0.5112870880454811, + "learning_rate": 1.3551224954216507e-05, + "loss": 0.8365, + "num_tokens": 29989439258.0, + "step": 7175 + }, + { + "epoch": 0.8527629233511587, + "grad_norm": 0.4933722754965584, + "learning_rate": 1.3549563984681719e-05, + "loss": 0.83, + "num_tokens": 29993629475.0, + "step": 7176 + }, + { + "epoch": 0.8528817587641117, + "grad_norm": 0.41549527755045546, + "learning_rate": 1.3547902920727963e-05, + "loss": 0.8708, + "num_tokens": 29997789931.0, + "step": 7177 + }, + { + "epoch": 0.8530005941770648, + "grad_norm": 0.401705289422079, + "learning_rate": 1.354624176241674e-05, + "loss": 0.8375, + "num_tokens": 30001979101.0, + "step": 7178 + }, + { + "epoch": 0.8531194295900179, + "grad_norm": 0.5193050483626418, + "learning_rate": 1.3544580509809577e-05, + "loss": 0.8211, + "num_tokens": 30006168865.0, + "step": 7179 + }, + { + "epoch": 0.8532382650029708, + "grad_norm": 0.4067078386114779, + "learning_rate": 1.3542919162967998e-05, + "loss": 0.8004, + "num_tokens": 30010356769.0, + "step": 7180 + }, + { + "epoch": 0.8533571004159239, + "grad_norm": 0.4485403478222812, + "learning_rate": 1.3541257721953522e-05, + "loss": 0.8559, + "num_tokens": 30014545647.0, + "step": 7181 + }, + { + "epoch": 0.853475935828877, + "grad_norm": 0.4282826702396372, + "learning_rate": 1.3539596186827681e-05, + "loss": 0.8241, + "num_tokens": 30018735735.0, + "step": 7182 + }, + { + "epoch": 0.85359477124183, + "grad_norm": 0.43798328702563205, + "learning_rate": 1.3537934557652006e-05, + "loss": 0.8199, + "num_tokens": 30022923366.0, + "step": 7183 + }, + { + "epoch": 0.8537136066547831, + "grad_norm": 0.36104592410446074, + "learning_rate": 1.3536272834488033e-05, + "loss": 0.8254, + "num_tokens": 30027090918.0, + "step": 7184 + }, + { + "epoch": 0.8538324420677362, + "grad_norm": 0.49603528903421096, + "learning_rate": 1.3534611017397305e-05, + "loss": 0.8099, + "num_tokens": 30031269644.0, + "step": 7185 + }, + { + "epoch": 0.8539512774806892, + "grad_norm": 0.38725911078277653, + "learning_rate": 1.3532949106441358e-05, + "loss": 0.8332, + "num_tokens": 30035459871.0, + "step": 7186 + }, + { + "epoch": 0.8540701128936423, + "grad_norm": 0.5404972937642575, + "learning_rate": 1.3531287101681743e-05, + "loss": 0.8013, + "num_tokens": 30039647929.0, + "step": 7187 + }, + { + "epoch": 0.8541889483065954, + "grad_norm": 0.4829060733078401, + "learning_rate": 1.3529625003180009e-05, + "loss": 0.8797, + "num_tokens": 30043835166.0, + "step": 7188 + }, + { + "epoch": 0.8543077837195484, + "grad_norm": 0.4497585537300375, + "learning_rate": 1.3527962810997702e-05, + "loss": 0.826, + "num_tokens": 30048023122.0, + "step": 7189 + }, + { + "epoch": 0.8544266191325015, + "grad_norm": 0.4577544171393597, + "learning_rate": 1.3526300525196392e-05, + "loss": 0.8023, + "num_tokens": 30052212498.0, + "step": 7190 + }, + { + "epoch": 0.8545454545454545, + "grad_norm": 0.39040520776178855, + "learning_rate": 1.352463814583763e-05, + "loss": 0.8117, + "num_tokens": 30056375398.0, + "step": 7191 + }, + { + "epoch": 0.8546642899584076, + "grad_norm": 0.5236152907030796, + "learning_rate": 1.3522975672982978e-05, + "loss": 0.8126, + "num_tokens": 30060563493.0, + "step": 7192 + }, + { + "epoch": 0.8547831253713607, + "grad_norm": 0.4297379199481328, + "learning_rate": 1.3521313106694013e-05, + "loss": 0.8611, + "num_tokens": 30064753627.0, + "step": 7193 + }, + { + "epoch": 0.8549019607843137, + "grad_norm": 0.45758546303362846, + "learning_rate": 1.3519650447032294e-05, + "loss": 0.8282, + "num_tokens": 30068931273.0, + "step": 7194 + }, + { + "epoch": 0.8550207961972668, + "grad_norm": 0.38204779961876806, + "learning_rate": 1.35179876940594e-05, + "loss": 0.8475, + "num_tokens": 30073095579.0, + "step": 7195 + }, + { + "epoch": 0.8551396316102199, + "grad_norm": 0.40240813792370206, + "learning_rate": 1.3516324847836907e-05, + "loss": 0.8747, + "num_tokens": 30077283445.0, + "step": 7196 + }, + { + "epoch": 0.8552584670231729, + "grad_norm": 0.4302372411966981, + "learning_rate": 1.3514661908426401e-05, + "loss": 0.8627, + "num_tokens": 30081450228.0, + "step": 7197 + }, + { + "epoch": 0.855377302436126, + "grad_norm": 0.41170508591286564, + "learning_rate": 1.3512998875889461e-05, + "loss": 0.8468, + "num_tokens": 30085620931.0, + "step": 7198 + }, + { + "epoch": 0.8554961378490791, + "grad_norm": 0.4456406866258374, + "learning_rate": 1.3511335750287674e-05, + "loss": 0.8351, + "num_tokens": 30089810654.0, + "step": 7199 + }, + { + "epoch": 0.8556149732620321, + "grad_norm": 0.3675615103598478, + "learning_rate": 1.3509672531682636e-05, + "loss": 0.8911, + "num_tokens": 30093997291.0, + "step": 7200 + }, + { + "epoch": 0.8557338086749852, + "grad_norm": 0.3493475922899675, + "learning_rate": 1.350800922013594e-05, + "loss": 0.7925, + "num_tokens": 30098158381.0, + "step": 7201 + }, + { + "epoch": 0.8558526440879382, + "grad_norm": 0.39527684430545845, + "learning_rate": 1.3506345815709177e-05, + "loss": 0.8481, + "num_tokens": 30102348192.0, + "step": 7202 + }, + { + "epoch": 0.8559714795008913, + "grad_norm": 0.40343060436470835, + "learning_rate": 1.350468231846396e-05, + "loss": 0.8569, + "num_tokens": 30106507612.0, + "step": 7203 + }, + { + "epoch": 0.8560903149138444, + "grad_norm": 0.4078595686212006, + "learning_rate": 1.3503018728461886e-05, + "loss": 0.846, + "num_tokens": 30110697573.0, + "step": 7204 + }, + { + "epoch": 0.8562091503267973, + "grad_norm": 0.3928685628426751, + "learning_rate": 1.3501355045764561e-05, + "loss": 0.7793, + "num_tokens": 30114869008.0, + "step": 7205 + }, + { + "epoch": 0.8563279857397504, + "grad_norm": 0.3604334355692997, + "learning_rate": 1.3499691270433608e-05, + "loss": 0.8632, + "num_tokens": 30119014359.0, + "step": 7206 + }, + { + "epoch": 0.8564468211527035, + "grad_norm": 0.4028778211311268, + "learning_rate": 1.3498027402530632e-05, + "loss": 0.8396, + "num_tokens": 30123202158.0, + "step": 7207 + }, + { + "epoch": 0.8565656565656565, + "grad_norm": 0.32771085280285295, + "learning_rate": 1.3496363442117257e-05, + "loss": 0.7949, + "num_tokens": 30127381586.0, + "step": 7208 + }, + { + "epoch": 0.8566844919786096, + "grad_norm": 0.36593462170758206, + "learning_rate": 1.3494699389255104e-05, + "loss": 0.8092, + "num_tokens": 30131569286.0, + "step": 7209 + }, + { + "epoch": 0.8568033273915627, + "grad_norm": 0.38134150470369393, + "learning_rate": 1.3493035244005796e-05, + "loss": 0.8428, + "num_tokens": 30135757709.0, + "step": 7210 + }, + { + "epoch": 0.8569221628045157, + "grad_norm": 0.42104317390317036, + "learning_rate": 1.3491371006430964e-05, + "loss": 0.8521, + "num_tokens": 30139946108.0, + "step": 7211 + }, + { + "epoch": 0.8570409982174688, + "grad_norm": 0.3192137765188789, + "learning_rate": 1.348970667659224e-05, + "loss": 0.8338, + "num_tokens": 30144134570.0, + "step": 7212 + }, + { + "epoch": 0.8571598336304219, + "grad_norm": 0.4355877792862198, + "learning_rate": 1.348804225455126e-05, + "loss": 0.8551, + "num_tokens": 30148323234.0, + "step": 7213 + }, + { + "epoch": 0.8572786690433749, + "grad_norm": 0.35598679487836077, + "learning_rate": 1.3486377740369665e-05, + "loss": 0.8481, + "num_tokens": 30152513197.0, + "step": 7214 + }, + { + "epoch": 0.857397504456328, + "grad_norm": 0.3805328593814151, + "learning_rate": 1.3484713134109094e-05, + "loss": 0.8166, + "num_tokens": 30156702688.0, + "step": 7215 + }, + { + "epoch": 0.857516339869281, + "grad_norm": 0.3827172998968716, + "learning_rate": 1.3483048435831193e-05, + "loss": 0.8294, + "num_tokens": 30160891054.0, + "step": 7216 + }, + { + "epoch": 0.8576351752822341, + "grad_norm": 0.39013102755030005, + "learning_rate": 1.3481383645597614e-05, + "loss": 0.8831, + "num_tokens": 30165074566.0, + "step": 7217 + }, + { + "epoch": 0.8577540106951872, + "grad_norm": 0.38998943861730667, + "learning_rate": 1.347971876347001e-05, + "loss": 0.8599, + "num_tokens": 30169264526.0, + "step": 7218 + }, + { + "epoch": 0.8578728461081402, + "grad_norm": 0.41165719663090783, + "learning_rate": 1.3478053789510036e-05, + "loss": 0.8483, + "num_tokens": 30173443026.0, + "step": 7219 + }, + { + "epoch": 0.8579916815210933, + "grad_norm": 0.35069262001943974, + "learning_rate": 1.3476388723779353e-05, + "loss": 0.8055, + "num_tokens": 30177603577.0, + "step": 7220 + }, + { + "epoch": 0.8581105169340464, + "grad_norm": 0.4570902901302758, + "learning_rate": 1.347472356633962e-05, + "loss": 0.8292, + "num_tokens": 30181792127.0, + "step": 7221 + }, + { + "epoch": 0.8582293523469994, + "grad_norm": 0.34780166338018553, + "learning_rate": 1.3473058317252506e-05, + "loss": 0.834, + "num_tokens": 30185981292.0, + "step": 7222 + }, + { + "epoch": 0.8583481877599525, + "grad_norm": 0.3920564606348067, + "learning_rate": 1.3471392976579682e-05, + "loss": 0.8202, + "num_tokens": 30190170227.0, + "step": 7223 + }, + { + "epoch": 0.8584670231729056, + "grad_norm": 0.48931812950701714, + "learning_rate": 1.346972754438282e-05, + "loss": 0.869, + "num_tokens": 30194359813.0, + "step": 7224 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 0.36148986317665516, + "learning_rate": 1.3468062020723598e-05, + "loss": 0.8528, + "num_tokens": 30198501339.0, + "step": 7225 + }, + { + "epoch": 0.8587046939988117, + "grad_norm": 0.40194498374904186, + "learning_rate": 1.346639640566369e-05, + "loss": 0.8442, + "num_tokens": 30202683286.0, + "step": 7226 + }, + { + "epoch": 0.8588235294117647, + "grad_norm": 0.4352870306777917, + "learning_rate": 1.3464730699264789e-05, + "loss": 0.8131, + "num_tokens": 30206856256.0, + "step": 7227 + }, + { + "epoch": 0.8589423648247178, + "grad_norm": 0.344324362867471, + "learning_rate": 1.3463064901588574e-05, + "loss": 0.8204, + "num_tokens": 30211043949.0, + "step": 7228 + }, + { + "epoch": 0.8590612002376709, + "grad_norm": 0.34905076109656485, + "learning_rate": 1.3461399012696741e-05, + "loss": 0.8369, + "num_tokens": 30215232090.0, + "step": 7229 + }, + { + "epoch": 0.8591800356506238, + "grad_norm": 0.3738156358149304, + "learning_rate": 1.3459733032650978e-05, + "loss": 0.8492, + "num_tokens": 30219421488.0, + "step": 7230 + }, + { + "epoch": 0.8592988710635769, + "grad_norm": 0.36794725961642927, + "learning_rate": 1.3458066961512983e-05, + "loss": 0.8744, + "num_tokens": 30223558488.0, + "step": 7231 + }, + { + "epoch": 0.85941770647653, + "grad_norm": 0.3536683107103366, + "learning_rate": 1.345640079934446e-05, + "loss": 0.8486, + "num_tokens": 30227746130.0, + "step": 7232 + }, + { + "epoch": 0.859536541889483, + "grad_norm": 0.404821503778898, + "learning_rate": 1.345473454620711e-05, + "loss": 0.8249, + "num_tokens": 30231935138.0, + "step": 7233 + }, + { + "epoch": 0.8596553773024361, + "grad_norm": 0.40418245824330584, + "learning_rate": 1.3453068202162638e-05, + "loss": 0.8324, + "num_tokens": 30236123381.0, + "step": 7234 + }, + { + "epoch": 0.8597742127153892, + "grad_norm": 0.3261032273598542, + "learning_rate": 1.345140176727276e-05, + "loss": 0.8392, + "num_tokens": 30240312932.0, + "step": 7235 + }, + { + "epoch": 0.8598930481283422, + "grad_norm": 0.45294874235058563, + "learning_rate": 1.3449735241599182e-05, + "loss": 0.8225, + "num_tokens": 30244502033.0, + "step": 7236 + }, + { + "epoch": 0.8600118835412953, + "grad_norm": 0.33598246015068656, + "learning_rate": 1.344806862520363e-05, + "loss": 0.8289, + "num_tokens": 30248691884.0, + "step": 7237 + }, + { + "epoch": 0.8601307189542484, + "grad_norm": 0.3569286914584151, + "learning_rate": 1.3446401918147816e-05, + "loss": 0.8501, + "num_tokens": 30252881184.0, + "step": 7238 + }, + { + "epoch": 0.8602495543672014, + "grad_norm": 0.4069254252939108, + "learning_rate": 1.3444735120493467e-05, + "loss": 0.8494, + "num_tokens": 30257044480.0, + "step": 7239 + }, + { + "epoch": 0.8603683897801545, + "grad_norm": 0.3637577837170512, + "learning_rate": 1.3443068232302312e-05, + "loss": 0.8352, + "num_tokens": 30261231708.0, + "step": 7240 + }, + { + "epoch": 0.8604872251931075, + "grad_norm": 0.422421101628706, + "learning_rate": 1.3441401253636082e-05, + "loss": 0.8292, + "num_tokens": 30265420832.0, + "step": 7241 + }, + { + "epoch": 0.8606060606060606, + "grad_norm": 0.4080578030148709, + "learning_rate": 1.3439734184556505e-05, + "loss": 0.8352, + "num_tokens": 30269609786.0, + "step": 7242 + }, + { + "epoch": 0.8607248960190137, + "grad_norm": 0.3579049500257501, + "learning_rate": 1.3438067025125326e-05, + "loss": 0.8397, + "num_tokens": 30273798390.0, + "step": 7243 + }, + { + "epoch": 0.8608437314319667, + "grad_norm": 0.3973201933417021, + "learning_rate": 1.3436399775404278e-05, + "loss": 0.8344, + "num_tokens": 30277964433.0, + "step": 7244 + }, + { + "epoch": 0.8609625668449198, + "grad_norm": 0.37553976460341976, + "learning_rate": 1.3434732435455113e-05, + "loss": 0.8091, + "num_tokens": 30282133294.0, + "step": 7245 + }, + { + "epoch": 0.8610814022578729, + "grad_norm": 0.35229304379327847, + "learning_rate": 1.343306500533957e-05, + "loss": 0.8095, + "num_tokens": 30286289775.0, + "step": 7246 + }, + { + "epoch": 0.8612002376708259, + "grad_norm": 0.37967795145125727, + "learning_rate": 1.3431397485119406e-05, + "loss": 0.803, + "num_tokens": 30290461056.0, + "step": 7247 + }, + { + "epoch": 0.861319073083779, + "grad_norm": 0.40722178986695295, + "learning_rate": 1.3429729874856372e-05, + "loss": 0.8506, + "num_tokens": 30294647076.0, + "step": 7248 + }, + { + "epoch": 0.8614379084967321, + "grad_norm": 0.33788439875330467, + "learning_rate": 1.3428062174612222e-05, + "loss": 0.781, + "num_tokens": 30298818379.0, + "step": 7249 + }, + { + "epoch": 0.8615567439096851, + "grad_norm": 0.39206772846532184, + "learning_rate": 1.3426394384448727e-05, + "loss": 0.8484, + "num_tokens": 30302978554.0, + "step": 7250 + }, + { + "epoch": 0.8616755793226382, + "grad_norm": 0.45454605304050205, + "learning_rate": 1.342472650442764e-05, + "loss": 0.8201, + "num_tokens": 30307154871.0, + "step": 7251 + }, + { + "epoch": 0.8617944147355912, + "grad_norm": 0.4216604862291262, + "learning_rate": 1.3423058534610732e-05, + "loss": 0.8601, + "num_tokens": 30311314553.0, + "step": 7252 + }, + { + "epoch": 0.8619132501485443, + "grad_norm": 0.333238502584731, + "learning_rate": 1.3421390475059776e-05, + "loss": 0.8212, + "num_tokens": 30315473237.0, + "step": 7253 + }, + { + "epoch": 0.8620320855614974, + "grad_norm": 0.3883349116761201, + "learning_rate": 1.3419722325836542e-05, + "loss": 0.8333, + "num_tokens": 30319648581.0, + "step": 7254 + }, + { + "epoch": 0.8621509209744503, + "grad_norm": 0.3933162509878086, + "learning_rate": 1.341805408700281e-05, + "loss": 0.7991, + "num_tokens": 30323812139.0, + "step": 7255 + }, + { + "epoch": 0.8622697563874034, + "grad_norm": 0.48527131077866836, + "learning_rate": 1.3416385758620358e-05, + "loss": 0.8459, + "num_tokens": 30328001741.0, + "step": 7256 + }, + { + "epoch": 0.8623885918003565, + "grad_norm": 0.3778058358908995, + "learning_rate": 1.341471734075097e-05, + "loss": 0.8159, + "num_tokens": 30332131215.0, + "step": 7257 + }, + { + "epoch": 0.8625074272133095, + "grad_norm": 0.42174281523572754, + "learning_rate": 1.3413048833456442e-05, + "loss": 0.8832, + "num_tokens": 30336288430.0, + "step": 7258 + }, + { + "epoch": 0.8626262626262626, + "grad_norm": 0.4481886017256628, + "learning_rate": 1.341138023679855e-05, + "loss": 0.8494, + "num_tokens": 30340477106.0, + "step": 7259 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.429975019153626, + "learning_rate": 1.3409711550839095e-05, + "loss": 0.8379, + "num_tokens": 30344661112.0, + "step": 7260 + }, + { + "epoch": 0.8628639334521687, + "grad_norm": 0.43376422878137344, + "learning_rate": 1.340804277563988e-05, + "loss": 0.8276, + "num_tokens": 30348848680.0, + "step": 7261 + }, + { + "epoch": 0.8629827688651218, + "grad_norm": 0.36577918132206844, + "learning_rate": 1.340637391126269e-05, + "loss": 0.8555, + "num_tokens": 30353038147.0, + "step": 7262 + }, + { + "epoch": 0.8631016042780749, + "grad_norm": 0.3723723936204493, + "learning_rate": 1.3404704957769345e-05, + "loss": 0.8393, + "num_tokens": 30357170941.0, + "step": 7263 + }, + { + "epoch": 0.8632204396910279, + "grad_norm": 0.41331522598024023, + "learning_rate": 1.3403035915221643e-05, + "loss": 0.8058, + "num_tokens": 30361359321.0, + "step": 7264 + }, + { + "epoch": 0.863339275103981, + "grad_norm": 0.4315157318725387, + "learning_rate": 1.3401366783681394e-05, + "loss": 0.8434, + "num_tokens": 30365535437.0, + "step": 7265 + }, + { + "epoch": 0.863458110516934, + "grad_norm": 0.3985124946112627, + "learning_rate": 1.3399697563210416e-05, + "loss": 0.8273, + "num_tokens": 30369725479.0, + "step": 7266 + }, + { + "epoch": 0.8635769459298871, + "grad_norm": 0.3655095408611285, + "learning_rate": 1.3398028253870526e-05, + "loss": 0.8774, + "num_tokens": 30373866524.0, + "step": 7267 + }, + { + "epoch": 0.8636957813428402, + "grad_norm": 0.4052353504392191, + "learning_rate": 1.3396358855723537e-05, + "loss": 0.8155, + "num_tokens": 30378054677.0, + "step": 7268 + }, + { + "epoch": 0.8638146167557932, + "grad_norm": 0.4681780593771136, + "learning_rate": 1.339468936883128e-05, + "loss": 0.8404, + "num_tokens": 30382211090.0, + "step": 7269 + }, + { + "epoch": 0.8639334521687463, + "grad_norm": 0.4321899707445683, + "learning_rate": 1.3393019793255577e-05, + "loss": 0.8552, + "num_tokens": 30386400783.0, + "step": 7270 + }, + { + "epoch": 0.8640522875816994, + "grad_norm": 0.37773006889970523, + "learning_rate": 1.3391350129058263e-05, + "loss": 0.8176, + "num_tokens": 30390589052.0, + "step": 7271 + }, + { + "epoch": 0.8641711229946524, + "grad_norm": 0.4526091712508806, + "learning_rate": 1.3389680376301163e-05, + "loss": 0.8572, + "num_tokens": 30394753338.0, + "step": 7272 + }, + { + "epoch": 0.8642899584076055, + "grad_norm": 0.45423759878674114, + "learning_rate": 1.3388010535046125e-05, + "loss": 0.8214, + "num_tokens": 30398910566.0, + "step": 7273 + }, + { + "epoch": 0.8644087938205586, + "grad_norm": 0.3917583696537787, + "learning_rate": 1.3386340605354985e-05, + "loss": 0.8017, + "num_tokens": 30403068287.0, + "step": 7274 + }, + { + "epoch": 0.8645276292335116, + "grad_norm": 0.43620535486770234, + "learning_rate": 1.3384670587289574e-05, + "loss": 0.838, + "num_tokens": 30407258187.0, + "step": 7275 + }, + { + "epoch": 0.8646464646464647, + "grad_norm": 0.40076064088203467, + "learning_rate": 1.3383000480911757e-05, + "loss": 0.8197, + "num_tokens": 30411444415.0, + "step": 7276 + }, + { + "epoch": 0.8647653000594177, + "grad_norm": 0.361855015323196, + "learning_rate": 1.3381330286283372e-05, + "loss": 0.8476, + "num_tokens": 30415633335.0, + "step": 7277 + }, + { + "epoch": 0.8648841354723708, + "grad_norm": 0.3833990889059678, + "learning_rate": 1.3379660003466275e-05, + "loss": 0.8784, + "num_tokens": 30419821183.0, + "step": 7278 + }, + { + "epoch": 0.8650029708853239, + "grad_norm": 0.47757785566493427, + "learning_rate": 1.337798963252232e-05, + "loss": 0.8739, + "num_tokens": 30424009412.0, + "step": 7279 + }, + { + "epoch": 0.8651218062982768, + "grad_norm": 0.3556568921873368, + "learning_rate": 1.3376319173513374e-05, + "loss": 0.8549, + "num_tokens": 30428198343.0, + "step": 7280 + }, + { + "epoch": 0.8652406417112299, + "grad_norm": 0.47797859344964916, + "learning_rate": 1.3374648626501293e-05, + "loss": 0.8522, + "num_tokens": 30432349444.0, + "step": 7281 + }, + { + "epoch": 0.865359477124183, + "grad_norm": 0.46794106856514944, + "learning_rate": 1.337297799154794e-05, + "loss": 0.8296, + "num_tokens": 30436512493.0, + "step": 7282 + }, + { + "epoch": 0.865478312537136, + "grad_norm": 0.33825263057588684, + "learning_rate": 1.3371307268715193e-05, + "loss": 0.8342, + "num_tokens": 30440700457.0, + "step": 7283 + }, + { + "epoch": 0.8655971479500891, + "grad_norm": 0.5909702665998648, + "learning_rate": 1.3369636458064918e-05, + "loss": 0.7947, + "num_tokens": 30444890000.0, + "step": 7284 + }, + { + "epoch": 0.8657159833630422, + "grad_norm": 0.39940506259906217, + "learning_rate": 1.3367965559658992e-05, + "loss": 0.8512, + "num_tokens": 30449078921.0, + "step": 7285 + }, + { + "epoch": 0.8658348187759952, + "grad_norm": 0.6236327792240816, + "learning_rate": 1.33662945735593e-05, + "loss": 0.8321, + "num_tokens": 30453268268.0, + "step": 7286 + }, + { + "epoch": 0.8659536541889483, + "grad_norm": 0.4545135805600254, + "learning_rate": 1.3364623499827715e-05, + "loss": 0.8207, + "num_tokens": 30457439663.0, + "step": 7287 + }, + { + "epoch": 0.8660724896019014, + "grad_norm": 0.5812724303117764, + "learning_rate": 1.3362952338526127e-05, + "loss": 0.8232, + "num_tokens": 30461629843.0, + "step": 7288 + }, + { + "epoch": 0.8661913250148544, + "grad_norm": 0.46295483933481135, + "learning_rate": 1.3361281089716427e-05, + "loss": 0.8522, + "num_tokens": 30465820536.0, + "step": 7289 + }, + { + "epoch": 0.8663101604278075, + "grad_norm": 0.6117113309853082, + "learning_rate": 1.3359609753460507e-05, + "loss": 0.8322, + "num_tokens": 30469982185.0, + "step": 7290 + }, + { + "epoch": 0.8664289958407605, + "grad_norm": 0.45876374574018786, + "learning_rate": 1.3357938329820252e-05, + "loss": 0.8246, + "num_tokens": 30474154538.0, + "step": 7291 + }, + { + "epoch": 0.8665478312537136, + "grad_norm": 0.5782969914495497, + "learning_rate": 1.3356266818857577e-05, + "loss": 0.8436, + "num_tokens": 30478334674.0, + "step": 7292 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.5095117318733656, + "learning_rate": 1.3354595220634368e-05, + "loss": 0.878, + "num_tokens": 30482522637.0, + "step": 7293 + }, + { + "epoch": 0.8667855020796197, + "grad_norm": 0.5759520225493311, + "learning_rate": 1.3352923535212538e-05, + "loss": 0.8535, + "num_tokens": 30486709012.0, + "step": 7294 + }, + { + "epoch": 0.8669043374925728, + "grad_norm": 0.44735810233663903, + "learning_rate": 1.3351251762653998e-05, + "loss": 0.8407, + "num_tokens": 30490879222.0, + "step": 7295 + }, + { + "epoch": 0.8670231729055259, + "grad_norm": 0.5547530057946015, + "learning_rate": 1.334957990302065e-05, + "loss": 0.8208, + "num_tokens": 30495068083.0, + "step": 7296 + }, + { + "epoch": 0.8671420083184789, + "grad_norm": 0.4687018503606426, + "learning_rate": 1.3347907956374417e-05, + "loss": 0.8316, + "num_tokens": 30499234318.0, + "step": 7297 + }, + { + "epoch": 0.867260843731432, + "grad_norm": 0.5597532375360267, + "learning_rate": 1.3346235922777215e-05, + "loss": 0.8409, + "num_tokens": 30503423310.0, + "step": 7298 + }, + { + "epoch": 0.8673796791443851, + "grad_norm": 0.4970973176436298, + "learning_rate": 1.3344563802290964e-05, + "loss": 0.8053, + "num_tokens": 30507611652.0, + "step": 7299 + }, + { + "epoch": 0.8674985145573381, + "grad_norm": 0.5189064797000199, + "learning_rate": 1.3342891594977585e-05, + "loss": 0.8401, + "num_tokens": 30511799708.0, + "step": 7300 + }, + { + "epoch": 0.8676173499702912, + "grad_norm": 0.5268944954882616, + "learning_rate": 1.334121930089901e-05, + "loss": 0.8277, + "num_tokens": 30515989670.0, + "step": 7301 + }, + { + "epoch": 0.8677361853832442, + "grad_norm": 0.46608182600828474, + "learning_rate": 1.3339546920117172e-05, + "loss": 0.8127, + "num_tokens": 30520179235.0, + "step": 7302 + }, + { + "epoch": 0.8678550207961973, + "grad_norm": 0.4829963396509855, + "learning_rate": 1.3337874452693998e-05, + "loss": 0.8723, + "num_tokens": 30524357655.0, + "step": 7303 + }, + { + "epoch": 0.8679738562091504, + "grad_norm": 0.45948899907783136, + "learning_rate": 1.3336201898691427e-05, + "loss": 0.8538, + "num_tokens": 30528533308.0, + "step": 7304 + }, + { + "epoch": 0.8680926916221033, + "grad_norm": 0.4642262503943922, + "learning_rate": 1.3334529258171404e-05, + "loss": 0.8248, + "num_tokens": 30532721754.0, + "step": 7305 + }, + { + "epoch": 0.8682115270350564, + "grad_norm": 0.41705098708368604, + "learning_rate": 1.3332856531195868e-05, + "loss": 0.8316, + "num_tokens": 30536910312.0, + "step": 7306 + }, + { + "epoch": 0.8683303624480095, + "grad_norm": 0.4494153161443634, + "learning_rate": 1.3331183717826767e-05, + "loss": 0.8139, + "num_tokens": 30541055356.0, + "step": 7307 + }, + { + "epoch": 0.8684491978609625, + "grad_norm": 0.38446172815533053, + "learning_rate": 1.3329510818126053e-05, + "loss": 0.8475, + "num_tokens": 30545244046.0, + "step": 7308 + }, + { + "epoch": 0.8685680332739156, + "grad_norm": 0.39601642861042713, + "learning_rate": 1.3327837832155675e-05, + "loss": 0.8333, + "num_tokens": 30549433015.0, + "step": 7309 + }, + { + "epoch": 0.8686868686868687, + "grad_norm": 0.46006787142363237, + "learning_rate": 1.3326164759977594e-05, + "loss": 0.8063, + "num_tokens": 30553601778.0, + "step": 7310 + }, + { + "epoch": 0.8688057040998217, + "grad_norm": 0.3660606209003987, + "learning_rate": 1.3324491601653763e-05, + "loss": 0.8173, + "num_tokens": 30557790486.0, + "step": 7311 + }, + { + "epoch": 0.8689245395127748, + "grad_norm": 0.5288647819306922, + "learning_rate": 1.332281835724615e-05, + "loss": 0.8221, + "num_tokens": 30561979247.0, + "step": 7312 + }, + { + "epoch": 0.8690433749257279, + "grad_norm": 0.342330778894714, + "learning_rate": 1.332114502681672e-05, + "loss": 0.8723, + "num_tokens": 30566168081.0, + "step": 7313 + }, + { + "epoch": 0.8691622103386809, + "grad_norm": 0.671435936482655, + "learning_rate": 1.3319471610427442e-05, + "loss": 0.8501, + "num_tokens": 30570357013.0, + "step": 7314 + }, + { + "epoch": 0.869281045751634, + "grad_norm": 0.47976114967351224, + "learning_rate": 1.3317798108140289e-05, + "loss": 0.8661, + "num_tokens": 30574546758.0, + "step": 7315 + }, + { + "epoch": 0.869399881164587, + "grad_norm": 0.673742488445401, + "learning_rate": 1.3316124520017233e-05, + "loss": 0.818, + "num_tokens": 30578736876.0, + "step": 7316 + }, + { + "epoch": 0.8695187165775401, + "grad_norm": 0.5987102813139249, + "learning_rate": 1.3314450846120255e-05, + "loss": 0.8656, + "num_tokens": 30582927219.0, + "step": 7317 + }, + { + "epoch": 0.8696375519904932, + "grad_norm": 0.5487852131634923, + "learning_rate": 1.3312777086511338e-05, + "loss": 0.8199, + "num_tokens": 30587113907.0, + "step": 7318 + }, + { + "epoch": 0.8697563874034462, + "grad_norm": 0.5673373707644646, + "learning_rate": 1.3311103241252468e-05, + "loss": 0.8609, + "num_tokens": 30591301423.0, + "step": 7319 + }, + { + "epoch": 0.8698752228163993, + "grad_norm": 0.49147285107516314, + "learning_rate": 1.3309429310405626e-05, + "loss": 0.8131, + "num_tokens": 30595438536.0, + "step": 7320 + }, + { + "epoch": 0.8699940582293524, + "grad_norm": 0.4821136754155878, + "learning_rate": 1.3307755294032814e-05, + "loss": 0.8358, + "num_tokens": 30599626310.0, + "step": 7321 + }, + { + "epoch": 0.8701128936423054, + "grad_norm": 0.4799422480882451, + "learning_rate": 1.3306081192196017e-05, + "loss": 0.7862, + "num_tokens": 30603813822.0, + "step": 7322 + }, + { + "epoch": 0.8702317290552585, + "grad_norm": 0.44410274890393875, + "learning_rate": 1.3304407004957238e-05, + "loss": 0.8526, + "num_tokens": 30608002914.0, + "step": 7323 + }, + { + "epoch": 0.8703505644682116, + "grad_norm": 0.4476492312955542, + "learning_rate": 1.3302732732378476e-05, + "loss": 0.8399, + "num_tokens": 30612192354.0, + "step": 7324 + }, + { + "epoch": 0.8704693998811646, + "grad_norm": 0.43239873031730747, + "learning_rate": 1.3301058374521734e-05, + "loss": 0.8481, + "num_tokens": 30616381067.0, + "step": 7325 + }, + { + "epoch": 0.8705882352941177, + "grad_norm": 0.46030292574977794, + "learning_rate": 1.3299383931449022e-05, + "loss": 0.8116, + "num_tokens": 30620571221.0, + "step": 7326 + }, + { + "epoch": 0.8707070707070707, + "grad_norm": 0.36877801199258603, + "learning_rate": 1.3297709403222348e-05, + "loss": 0.8325, + "num_tokens": 30624758480.0, + "step": 7327 + }, + { + "epoch": 0.8708259061200238, + "grad_norm": 0.47724308494757467, + "learning_rate": 1.3296034789903725e-05, + "loss": 0.8311, + "num_tokens": 30628940737.0, + "step": 7328 + }, + { + "epoch": 0.8709447415329769, + "grad_norm": 0.3653147576250905, + "learning_rate": 1.3294360091555171e-05, + "loss": 0.8369, + "num_tokens": 30633129239.0, + "step": 7329 + }, + { + "epoch": 0.8710635769459298, + "grad_norm": 0.46211214917582266, + "learning_rate": 1.3292685308238706e-05, + "loss": 0.8622, + "num_tokens": 30637317917.0, + "step": 7330 + }, + { + "epoch": 0.8711824123588829, + "grad_norm": 0.3831732413061621, + "learning_rate": 1.3291010440016355e-05, + "loss": 0.8158, + "num_tokens": 30641480000.0, + "step": 7331 + }, + { + "epoch": 0.871301247771836, + "grad_norm": 0.5416996128798501, + "learning_rate": 1.3289335486950133e-05, + "loss": 0.8374, + "num_tokens": 30645670041.0, + "step": 7332 + }, + { + "epoch": 0.871420083184789, + "grad_norm": 0.3612603934506892, + "learning_rate": 1.3287660449102085e-05, + "loss": 0.8437, + "num_tokens": 30649851425.0, + "step": 7333 + }, + { + "epoch": 0.8715389185977421, + "grad_norm": 0.5320503058197593, + "learning_rate": 1.3285985326534234e-05, + "loss": 0.8357, + "num_tokens": 30654040468.0, + "step": 7334 + }, + { + "epoch": 0.8716577540106952, + "grad_norm": 0.4367315325634247, + "learning_rate": 1.3284310119308614e-05, + "loss": 0.8243, + "num_tokens": 30658230592.0, + "step": 7335 + }, + { + "epoch": 0.8717765894236482, + "grad_norm": 0.5563489658629068, + "learning_rate": 1.3282634827487273e-05, + "loss": 0.8501, + "num_tokens": 30662411035.0, + "step": 7336 + }, + { + "epoch": 0.8718954248366013, + "grad_norm": 0.501273402019583, + "learning_rate": 1.3280959451132243e-05, + "loss": 0.8172, + "num_tokens": 30666599742.0, + "step": 7337 + }, + { + "epoch": 0.8720142602495544, + "grad_norm": 0.4947723855495506, + "learning_rate": 1.3279283990305572e-05, + "loss": 0.8048, + "num_tokens": 30670773803.0, + "step": 7338 + }, + { + "epoch": 0.8721330956625074, + "grad_norm": 0.48240369019066326, + "learning_rate": 1.327760844506931e-05, + "loss": 0.8452, + "num_tokens": 30674945935.0, + "step": 7339 + }, + { + "epoch": 0.8722519310754605, + "grad_norm": 0.4776808249463665, + "learning_rate": 1.3275932815485504e-05, + "loss": 0.8325, + "num_tokens": 30679118799.0, + "step": 7340 + }, + { + "epoch": 0.8723707664884135, + "grad_norm": 0.4902622510033145, + "learning_rate": 1.3274257101616213e-05, + "loss": 0.794, + "num_tokens": 30683289354.0, + "step": 7341 + }, + { + "epoch": 0.8724896019013666, + "grad_norm": 0.3909284575099856, + "learning_rate": 1.3272581303523491e-05, + "loss": 0.8549, + "num_tokens": 30687473793.0, + "step": 7342 + }, + { + "epoch": 0.8726084373143197, + "grad_norm": 0.474631888229615, + "learning_rate": 1.32709054212694e-05, + "loss": 0.8418, + "num_tokens": 30691663841.0, + "step": 7343 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 0.4253967266342842, + "learning_rate": 1.3269229454916005e-05, + "loss": 0.8355, + "num_tokens": 30695825022.0, + "step": 7344 + }, + { + "epoch": 0.8728461081402258, + "grad_norm": 0.38891463085131284, + "learning_rate": 1.3267553404525366e-05, + "loss": 0.8436, + "num_tokens": 30699977764.0, + "step": 7345 + }, + { + "epoch": 0.8729649435531789, + "grad_norm": 0.3888734285281537, + "learning_rate": 1.3265877270159561e-05, + "loss": 0.8265, + "num_tokens": 30704150232.0, + "step": 7346 + }, + { + "epoch": 0.8730837789661319, + "grad_norm": 0.40351388162533425, + "learning_rate": 1.3264201051880662e-05, + "loss": 0.7969, + "num_tokens": 30708338270.0, + "step": 7347 + }, + { + "epoch": 0.873202614379085, + "grad_norm": 0.4334169698685785, + "learning_rate": 1.3262524749750737e-05, + "loss": 0.86, + "num_tokens": 30712522675.0, + "step": 7348 + }, + { + "epoch": 0.8733214497920381, + "grad_norm": 0.35343241953068344, + "learning_rate": 1.3260848363831876e-05, + "loss": 0.8493, + "num_tokens": 30716680585.0, + "step": 7349 + }, + { + "epoch": 0.8734402852049911, + "grad_norm": 0.39282515169147875, + "learning_rate": 1.3259171894186154e-05, + "loss": 0.815, + "num_tokens": 30720871254.0, + "step": 7350 + }, + { + "epoch": 0.8735591206179442, + "grad_norm": 0.39098988567053006, + "learning_rate": 1.3257495340875654e-05, + "loss": 0.83, + "num_tokens": 30725059560.0, + "step": 7351 + }, + { + "epoch": 0.8736779560308973, + "grad_norm": 0.37087361949039555, + "learning_rate": 1.3255818703962472e-05, + "loss": 0.8611, + "num_tokens": 30729229591.0, + "step": 7352 + }, + { + "epoch": 0.8737967914438503, + "grad_norm": 0.39795525461960013, + "learning_rate": 1.32541419835087e-05, + "loss": 0.8648, + "num_tokens": 30733416200.0, + "step": 7353 + }, + { + "epoch": 0.8739156268568034, + "grad_norm": 0.38535818633176716, + "learning_rate": 1.3252465179576426e-05, + "loss": 0.831, + "num_tokens": 30737594536.0, + "step": 7354 + }, + { + "epoch": 0.8740344622697563, + "grad_norm": 0.386151780075817, + "learning_rate": 1.3250788292227748e-05, + "loss": 0.8413, + "num_tokens": 30741772752.0, + "step": 7355 + }, + { + "epoch": 0.8741532976827094, + "grad_norm": 0.3856098090770195, + "learning_rate": 1.3249111321524772e-05, + "loss": 0.8394, + "num_tokens": 30745947891.0, + "step": 7356 + }, + { + "epoch": 0.8742721330956625, + "grad_norm": 0.35092308626234425, + "learning_rate": 1.32474342675296e-05, + "loss": 0.8404, + "num_tokens": 30750078916.0, + "step": 7357 + }, + { + "epoch": 0.8743909685086155, + "grad_norm": 0.3738633391047436, + "learning_rate": 1.3245757130304335e-05, + "loss": 0.8776, + "num_tokens": 30754246091.0, + "step": 7358 + }, + { + "epoch": 0.8745098039215686, + "grad_norm": 0.34604680836877394, + "learning_rate": 1.3244079909911095e-05, + "loss": 0.823, + "num_tokens": 30758435056.0, + "step": 7359 + }, + { + "epoch": 0.8746286393345217, + "grad_norm": 0.36864900458261907, + "learning_rate": 1.3242402606411987e-05, + "loss": 0.8309, + "num_tokens": 30762625586.0, + "step": 7360 + }, + { + "epoch": 0.8747474747474747, + "grad_norm": 0.3478644227205829, + "learning_rate": 1.3240725219869126e-05, + "loss": 0.8135, + "num_tokens": 30766789584.0, + "step": 7361 + }, + { + "epoch": 0.8748663101604278, + "grad_norm": 0.45368665716701284, + "learning_rate": 1.3239047750344638e-05, + "loss": 0.8386, + "num_tokens": 30770952703.0, + "step": 7362 + }, + { + "epoch": 0.8749851455733809, + "grad_norm": 0.38062983617944063, + "learning_rate": 1.3237370197900642e-05, + "loss": 0.8519, + "num_tokens": 30775142107.0, + "step": 7363 + }, + { + "epoch": 0.8751039809863339, + "grad_norm": 0.3550702177766055, + "learning_rate": 1.3235692562599256e-05, + "loss": 0.8455, + "num_tokens": 30779330972.0, + "step": 7364 + }, + { + "epoch": 0.875222816399287, + "grad_norm": 0.4420463393227032, + "learning_rate": 1.3234014844502623e-05, + "loss": 0.8505, + "num_tokens": 30783515740.0, + "step": 7365 + }, + { + "epoch": 0.87534165181224, + "grad_norm": 0.4247202527099171, + "learning_rate": 1.3232337043672863e-05, + "loss": 0.8343, + "num_tokens": 30787686357.0, + "step": 7366 + }, + { + "epoch": 0.8754604872251931, + "grad_norm": 0.5086492406432853, + "learning_rate": 1.3230659160172112e-05, + "loss": 0.8167, + "num_tokens": 30791853651.0, + "step": 7367 + }, + { + "epoch": 0.8755793226381462, + "grad_norm": 0.3662926018339588, + "learning_rate": 1.3228981194062515e-05, + "loss": 0.8524, + "num_tokens": 30796042812.0, + "step": 7368 + }, + { + "epoch": 0.8756981580510992, + "grad_norm": 0.5570379395934469, + "learning_rate": 1.3227303145406207e-05, + "loss": 0.8362, + "num_tokens": 30800227170.0, + "step": 7369 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 0.4145598306267957, + "learning_rate": 1.3225625014265333e-05, + "loss": 0.8901, + "num_tokens": 30804353084.0, + "step": 7370 + }, + { + "epoch": 0.8759358288770054, + "grad_norm": 0.4238179258028872, + "learning_rate": 1.3223946800702039e-05, + "loss": 0.8444, + "num_tokens": 30808540428.0, + "step": 7371 + }, + { + "epoch": 0.8760546642899584, + "grad_norm": 0.4000786023882116, + "learning_rate": 1.3222268504778476e-05, + "loss": 0.8266, + "num_tokens": 30812728454.0, + "step": 7372 + }, + { + "epoch": 0.8761734997029115, + "grad_norm": 0.4753385420140578, + "learning_rate": 1.3220590126556796e-05, + "loss": 0.8495, + "num_tokens": 30816917872.0, + "step": 7373 + }, + { + "epoch": 0.8762923351158646, + "grad_norm": 0.39540314164463713, + "learning_rate": 1.3218911666099157e-05, + "loss": 0.828, + "num_tokens": 30821107172.0, + "step": 7374 + }, + { + "epoch": 0.8764111705288176, + "grad_norm": 0.4155510986980302, + "learning_rate": 1.3217233123467715e-05, + "loss": 0.8127, + "num_tokens": 30825296419.0, + "step": 7375 + }, + { + "epoch": 0.8765300059417707, + "grad_norm": 0.4132673991768995, + "learning_rate": 1.3215554498724634e-05, + "loss": 0.849, + "num_tokens": 30829457246.0, + "step": 7376 + }, + { + "epoch": 0.8766488413547238, + "grad_norm": 0.43236128101925975, + "learning_rate": 1.3213875791932077e-05, + "loss": 0.8292, + "num_tokens": 30833637883.0, + "step": 7377 + }, + { + "epoch": 0.8767676767676768, + "grad_norm": 0.3676074362089087, + "learning_rate": 1.3212197003152218e-05, + "loss": 0.8139, + "num_tokens": 30837811536.0, + "step": 7378 + }, + { + "epoch": 0.8768865121806299, + "grad_norm": 0.4134475221844334, + "learning_rate": 1.3210518132447226e-05, + "loss": 0.802, + "num_tokens": 30842001109.0, + "step": 7379 + }, + { + "epoch": 0.8770053475935828, + "grad_norm": 0.3406029934249397, + "learning_rate": 1.3208839179879267e-05, + "loss": 0.8, + "num_tokens": 30846191417.0, + "step": 7380 + }, + { + "epoch": 0.8771241830065359, + "grad_norm": 0.4241078706771672, + "learning_rate": 1.320716014551053e-05, + "loss": 0.8359, + "num_tokens": 30850353803.0, + "step": 7381 + }, + { + "epoch": 0.877243018419489, + "grad_norm": 0.36563444490330765, + "learning_rate": 1.3205481029403189e-05, + "loss": 0.7989, + "num_tokens": 30854543857.0, + "step": 7382 + }, + { + "epoch": 0.877361853832442, + "grad_norm": 0.39722472610478227, + "learning_rate": 1.3203801831619428e-05, + "loss": 0.8187, + "num_tokens": 30858732890.0, + "step": 7383 + }, + { + "epoch": 0.8774806892453951, + "grad_norm": 0.404186104036526, + "learning_rate": 1.3202122552221433e-05, + "loss": 0.8232, + "num_tokens": 30862912727.0, + "step": 7384 + }, + { + "epoch": 0.8775995246583482, + "grad_norm": 0.39399811537853807, + "learning_rate": 1.3200443191271396e-05, + "loss": 0.8208, + "num_tokens": 30867102607.0, + "step": 7385 + }, + { + "epoch": 0.8777183600713012, + "grad_norm": 0.41061158461673775, + "learning_rate": 1.3198763748831507e-05, + "loss": 0.8283, + "num_tokens": 30871263657.0, + "step": 7386 + }, + { + "epoch": 0.8778371954842543, + "grad_norm": 0.4246626465788252, + "learning_rate": 1.3197084224963962e-05, + "loss": 0.8253, + "num_tokens": 30875452944.0, + "step": 7387 + }, + { + "epoch": 0.8779560308972074, + "grad_norm": 0.34698258041791613, + "learning_rate": 1.319540461973096e-05, + "loss": 0.8429, + "num_tokens": 30879618158.0, + "step": 7388 + }, + { + "epoch": 0.8780748663101604, + "grad_norm": 0.4243097464800633, + "learning_rate": 1.3193724933194703e-05, + "loss": 0.8467, + "num_tokens": 30883806587.0, + "step": 7389 + }, + { + "epoch": 0.8781937017231135, + "grad_norm": 0.417426641776758, + "learning_rate": 1.3192045165417393e-05, + "loss": 0.841, + "num_tokens": 30887996740.0, + "step": 7390 + }, + { + "epoch": 0.8783125371360665, + "grad_norm": 0.37393742753829584, + "learning_rate": 1.3190365316461242e-05, + "loss": 0.8163, + "num_tokens": 30892187092.0, + "step": 7391 + }, + { + "epoch": 0.8784313725490196, + "grad_norm": 0.36715886686457216, + "learning_rate": 1.3188685386388452e-05, + "loss": 0.846, + "num_tokens": 30896376298.0, + "step": 7392 + }, + { + "epoch": 0.8785502079619727, + "grad_norm": 0.3903756361497455, + "learning_rate": 1.3187005375261245e-05, + "loss": 0.8302, + "num_tokens": 30900537068.0, + "step": 7393 + }, + { + "epoch": 0.8786690433749257, + "grad_norm": 0.33357209242384633, + "learning_rate": 1.3185325283141836e-05, + "loss": 0.8328, + "num_tokens": 30904726169.0, + "step": 7394 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 0.42001150849585284, + "learning_rate": 1.318364511009244e-05, + "loss": 0.8036, + "num_tokens": 30908910754.0, + "step": 7395 + }, + { + "epoch": 0.8789067142008319, + "grad_norm": 0.40653169665823335, + "learning_rate": 1.3181964856175287e-05, + "loss": 0.8423, + "num_tokens": 30913099575.0, + "step": 7396 + }, + { + "epoch": 0.8790255496137849, + "grad_norm": 0.32363793897253923, + "learning_rate": 1.3180284521452596e-05, + "loss": 0.8753, + "num_tokens": 30917286677.0, + "step": 7397 + }, + { + "epoch": 0.879144385026738, + "grad_norm": 0.419983491861162, + "learning_rate": 1.3178604105986597e-05, + "loss": 0.8858, + "num_tokens": 30921444193.0, + "step": 7398 + }, + { + "epoch": 0.8792632204396911, + "grad_norm": 0.335478676689951, + "learning_rate": 1.3176923609839521e-05, + "loss": 0.8828, + "num_tokens": 30925633420.0, + "step": 7399 + }, + { + "epoch": 0.8793820558526441, + "grad_norm": 0.4181921723041249, + "learning_rate": 1.3175243033073604e-05, + "loss": 0.8675, + "num_tokens": 30929801108.0, + "step": 7400 + }, + { + "epoch": 0.8795008912655972, + "grad_norm": 0.36876309393863377, + "learning_rate": 1.3173562375751081e-05, + "loss": 0.8493, + "num_tokens": 30933984416.0, + "step": 7401 + }, + { + "epoch": 0.8796197266785503, + "grad_norm": 0.39940739245276946, + "learning_rate": 1.3171881637934197e-05, + "loss": 0.8461, + "num_tokens": 30938171573.0, + "step": 7402 + }, + { + "epoch": 0.8797385620915033, + "grad_norm": 0.3567553586127938, + "learning_rate": 1.317020081968519e-05, + "loss": 0.834, + "num_tokens": 30942329157.0, + "step": 7403 + }, + { + "epoch": 0.8798573975044564, + "grad_norm": 0.3784720783947035, + "learning_rate": 1.3168519921066317e-05, + "loss": 0.8287, + "num_tokens": 30946469284.0, + "step": 7404 + }, + { + "epoch": 0.8799762329174093, + "grad_norm": 0.4162264596619157, + "learning_rate": 1.3166838942139809e-05, + "loss": 0.8446, + "num_tokens": 30950636157.0, + "step": 7405 + }, + { + "epoch": 0.8800950683303624, + "grad_norm": 0.3615259340222073, + "learning_rate": 1.3165157882967932e-05, + "loss": 0.8705, + "num_tokens": 30954820720.0, + "step": 7406 + }, + { + "epoch": 0.8802139037433155, + "grad_norm": 0.3949895751750199, + "learning_rate": 1.3163476743612941e-05, + "loss": 0.7901, + "num_tokens": 30959010664.0, + "step": 7407 + }, + { + "epoch": 0.8803327391562685, + "grad_norm": 0.39927091050222097, + "learning_rate": 1.3161795524137086e-05, + "loss": 0.8308, + "num_tokens": 30963200502.0, + "step": 7408 + }, + { + "epoch": 0.8804515745692216, + "grad_norm": 0.3788069248572956, + "learning_rate": 1.316011422460264e-05, + "loss": 0.8242, + "num_tokens": 30967390970.0, + "step": 7409 + }, + { + "epoch": 0.8805704099821747, + "grad_norm": 0.36163229177441353, + "learning_rate": 1.3158432845071856e-05, + "loss": 0.7836, + "num_tokens": 30971580865.0, + "step": 7410 + }, + { + "epoch": 0.8806892453951277, + "grad_norm": 0.38992926765044905, + "learning_rate": 1.3156751385607008e-05, + "loss": 0.8612, + "num_tokens": 30975769119.0, + "step": 7411 + }, + { + "epoch": 0.8808080808080808, + "grad_norm": 0.47686539713175186, + "learning_rate": 1.3155069846270362e-05, + "loss": 0.8133, + "num_tokens": 30979934698.0, + "step": 7412 + }, + { + "epoch": 0.8809269162210339, + "grad_norm": 0.35967189701682034, + "learning_rate": 1.3153388227124196e-05, + "loss": 0.8436, + "num_tokens": 30984116893.0, + "step": 7413 + }, + { + "epoch": 0.8810457516339869, + "grad_norm": 0.46129894447603004, + "learning_rate": 1.315170652823078e-05, + "loss": 0.8233, + "num_tokens": 30988278424.0, + "step": 7414 + }, + { + "epoch": 0.88116458704694, + "grad_norm": 0.37951621478397723, + "learning_rate": 1.3150024749652399e-05, + "loss": 0.7806, + "num_tokens": 30992467368.0, + "step": 7415 + }, + { + "epoch": 0.881283422459893, + "grad_norm": 0.46013173384783773, + "learning_rate": 1.314834289145133e-05, + "loss": 0.8884, + "num_tokens": 30996654874.0, + "step": 7416 + }, + { + "epoch": 0.8814022578728461, + "grad_norm": 0.41595766454937144, + "learning_rate": 1.314666095368986e-05, + "loss": 0.8245, + "num_tokens": 31000812492.0, + "step": 7417 + }, + { + "epoch": 0.8815210932857992, + "grad_norm": 0.4412576945464536, + "learning_rate": 1.3144978936430279e-05, + "loss": 0.837, + "num_tokens": 31004988163.0, + "step": 7418 + }, + { + "epoch": 0.8816399286987522, + "grad_norm": 0.4039681280545361, + "learning_rate": 1.3143296839734873e-05, + "loss": 0.8704, + "num_tokens": 31009175858.0, + "step": 7419 + }, + { + "epoch": 0.8817587641117053, + "grad_norm": 0.3907437199645948, + "learning_rate": 1.3141614663665945e-05, + "loss": 0.844, + "num_tokens": 31013355827.0, + "step": 7420 + }, + { + "epoch": 0.8818775995246584, + "grad_norm": 0.4449304892144251, + "learning_rate": 1.3139932408285776e-05, + "loss": 0.804, + "num_tokens": 31017543757.0, + "step": 7421 + }, + { + "epoch": 0.8819964349376114, + "grad_norm": 0.3792785610909659, + "learning_rate": 1.3138250073656678e-05, + "loss": 0.825, + "num_tokens": 31021703878.0, + "step": 7422 + }, + { + "epoch": 0.8821152703505645, + "grad_norm": 0.3484858352686104, + "learning_rate": 1.3136567659840953e-05, + "loss": 0.8579, + "num_tokens": 31025891561.0, + "step": 7423 + }, + { + "epoch": 0.8822341057635176, + "grad_norm": 0.4489988076885948, + "learning_rate": 1.31348851669009e-05, + "loss": 0.8214, + "num_tokens": 31030081877.0, + "step": 7424 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.4378873697827591, + "learning_rate": 1.3133202594898832e-05, + "loss": 0.8287, + "num_tokens": 31034271801.0, + "step": 7425 + }, + { + "epoch": 0.8824717765894237, + "grad_norm": 0.33242691747294656, + "learning_rate": 1.313151994389706e-05, + "loss": 0.8473, + "num_tokens": 31038445956.0, + "step": 7426 + }, + { + "epoch": 0.8825906120023768, + "grad_norm": 0.5081579828387014, + "learning_rate": 1.3129837213957897e-05, + "loss": 0.8681, + "num_tokens": 31042596300.0, + "step": 7427 + }, + { + "epoch": 0.8827094474153298, + "grad_norm": 0.37262017238304546, + "learning_rate": 1.3128154405143663e-05, + "loss": 0.8241, + "num_tokens": 31046762094.0, + "step": 7428 + }, + { + "epoch": 0.8828282828282829, + "grad_norm": 0.4591310336994856, + "learning_rate": 1.3126471517516673e-05, + "loss": 0.8259, + "num_tokens": 31050951571.0, + "step": 7429 + }, + { + "epoch": 0.8829471182412358, + "grad_norm": 0.3331762533476557, + "learning_rate": 1.3124788551139254e-05, + "loss": 0.8445, + "num_tokens": 31055118466.0, + "step": 7430 + }, + { + "epoch": 0.8830659536541889, + "grad_norm": 0.39085945349349965, + "learning_rate": 1.312310550607373e-05, + "loss": 0.8934, + "num_tokens": 31059299326.0, + "step": 7431 + }, + { + "epoch": 0.883184789067142, + "grad_norm": 0.3896503257086487, + "learning_rate": 1.3121422382382434e-05, + "loss": 0.8176, + "num_tokens": 31063479986.0, + "step": 7432 + }, + { + "epoch": 0.883303624480095, + "grad_norm": 0.34631706746922347, + "learning_rate": 1.3119739180127693e-05, + "loss": 0.7986, + "num_tokens": 31067669933.0, + "step": 7433 + }, + { + "epoch": 0.8834224598930481, + "grad_norm": 0.3954354664352458, + "learning_rate": 1.311805589937184e-05, + "loss": 0.8306, + "num_tokens": 31071846238.0, + "step": 7434 + }, + { + "epoch": 0.8835412953060012, + "grad_norm": 0.3647500408752152, + "learning_rate": 1.3116372540177217e-05, + "loss": 0.8701, + "num_tokens": 31076031262.0, + "step": 7435 + }, + { + "epoch": 0.8836601307189542, + "grad_norm": 0.3294993982677196, + "learning_rate": 1.3114689102606166e-05, + "loss": 0.876, + "num_tokens": 31080195328.0, + "step": 7436 + }, + { + "epoch": 0.8837789661319073, + "grad_norm": 0.42253544125282066, + "learning_rate": 1.3113005586721023e-05, + "loss": 0.8443, + "num_tokens": 31084359104.0, + "step": 7437 + }, + { + "epoch": 0.8838978015448604, + "grad_norm": 0.33938168495177573, + "learning_rate": 1.3111321992584145e-05, + "loss": 0.8216, + "num_tokens": 31088549040.0, + "step": 7438 + }, + { + "epoch": 0.8840166369578134, + "grad_norm": 0.3765427231570401, + "learning_rate": 1.3109638320257871e-05, + "loss": 0.8432, + "num_tokens": 31092723547.0, + "step": 7439 + }, + { + "epoch": 0.8841354723707665, + "grad_norm": 0.3769622457086057, + "learning_rate": 1.3107954569804557e-05, + "loss": 0.8087, + "num_tokens": 31096914681.0, + "step": 7440 + }, + { + "epoch": 0.8842543077837195, + "grad_norm": 0.39059627572688677, + "learning_rate": 1.310627074128656e-05, + "loss": 0.8468, + "num_tokens": 31101103813.0, + "step": 7441 + }, + { + "epoch": 0.8843731431966726, + "grad_norm": 0.4039007705990416, + "learning_rate": 1.3104586834766235e-05, + "loss": 0.8599, + "num_tokens": 31105238282.0, + "step": 7442 + }, + { + "epoch": 0.8844919786096257, + "grad_norm": 0.38705287062586996, + "learning_rate": 1.3102902850305944e-05, + "loss": 0.8042, + "num_tokens": 31109428925.0, + "step": 7443 + }, + { + "epoch": 0.8846108140225787, + "grad_norm": 0.35384496813788413, + "learning_rate": 1.310121878796805e-05, + "loss": 0.8241, + "num_tokens": 31113565331.0, + "step": 7444 + }, + { + "epoch": 0.8847296494355318, + "grad_norm": 0.3791538436563501, + "learning_rate": 1.309953464781492e-05, + "loss": 0.8263, + "num_tokens": 31117755813.0, + "step": 7445 + }, + { + "epoch": 0.8848484848484849, + "grad_norm": 0.3495456391617128, + "learning_rate": 1.3097850429908925e-05, + "loss": 0.7969, + "num_tokens": 31121920721.0, + "step": 7446 + }, + { + "epoch": 0.8849673202614379, + "grad_norm": 0.3806785508122693, + "learning_rate": 1.3096166134312431e-05, + "loss": 0.8426, + "num_tokens": 31126110285.0, + "step": 7447 + }, + { + "epoch": 0.885086155674391, + "grad_norm": 0.3665524345278891, + "learning_rate": 1.309448176108782e-05, + "loss": 0.8468, + "num_tokens": 31130277729.0, + "step": 7448 + }, + { + "epoch": 0.8852049910873441, + "grad_norm": 0.35598229484317423, + "learning_rate": 1.3092797310297472e-05, + "loss": 0.8373, + "num_tokens": 31134465739.0, + "step": 7449 + }, + { + "epoch": 0.8853238265002971, + "grad_norm": 0.41998094579865686, + "learning_rate": 1.3091112782003756e-05, + "loss": 0.8253, + "num_tokens": 31138652159.0, + "step": 7450 + }, + { + "epoch": 0.8854426619132502, + "grad_norm": 0.38330395296082603, + "learning_rate": 1.308942817626907e-05, + "loss": 0.8101, + "num_tokens": 31142842559.0, + "step": 7451 + }, + { + "epoch": 0.8855614973262033, + "grad_norm": 0.4254634757586295, + "learning_rate": 1.3087743493155792e-05, + "loss": 0.8372, + "num_tokens": 31146975150.0, + "step": 7452 + }, + { + "epoch": 0.8856803327391563, + "grad_norm": 0.3291763590313561, + "learning_rate": 1.3086058732726312e-05, + "loss": 0.8331, + "num_tokens": 31151163259.0, + "step": 7453 + }, + { + "epoch": 0.8857991681521094, + "grad_norm": 0.3826701818704733, + "learning_rate": 1.3084373895043028e-05, + "loss": 0.8397, + "num_tokens": 31155326668.0, + "step": 7454 + }, + { + "epoch": 0.8859180035650623, + "grad_norm": 0.38532408071198265, + "learning_rate": 1.3082688980168327e-05, + "loss": 0.8156, + "num_tokens": 31159491920.0, + "step": 7455 + }, + { + "epoch": 0.8860368389780154, + "grad_norm": 0.3531489788510236, + "learning_rate": 1.3081003988164614e-05, + "loss": 0.8527, + "num_tokens": 31163677249.0, + "step": 7456 + }, + { + "epoch": 0.8861556743909685, + "grad_norm": 0.3520691327958492, + "learning_rate": 1.3079318919094287e-05, + "loss": 0.8366, + "num_tokens": 31167852844.0, + "step": 7457 + }, + { + "epoch": 0.8862745098039215, + "grad_norm": 0.35198013719938737, + "learning_rate": 1.3077633773019747e-05, + "loss": 0.8622, + "num_tokens": 31172030614.0, + "step": 7458 + }, + { + "epoch": 0.8863933452168746, + "grad_norm": 0.35591482553827786, + "learning_rate": 1.3075948550003405e-05, + "loss": 0.7951, + "num_tokens": 31176206389.0, + "step": 7459 + }, + { + "epoch": 0.8865121806298277, + "grad_norm": 0.3996209194404562, + "learning_rate": 1.307426325010767e-05, + "loss": 0.8312, + "num_tokens": 31180395712.0, + "step": 7460 + }, + { + "epoch": 0.8866310160427807, + "grad_norm": 0.35433746794876464, + "learning_rate": 1.3072577873394953e-05, + "loss": 0.8471, + "num_tokens": 31184564870.0, + "step": 7461 + }, + { + "epoch": 0.8867498514557338, + "grad_norm": 0.357391555815161, + "learning_rate": 1.307089241992767e-05, + "loss": 0.8469, + "num_tokens": 31188747643.0, + "step": 7462 + }, + { + "epoch": 0.8868686868686869, + "grad_norm": 0.36283893741841877, + "learning_rate": 1.3069206889768243e-05, + "loss": 0.8224, + "num_tokens": 31192908375.0, + "step": 7463 + }, + { + "epoch": 0.8869875222816399, + "grad_norm": 0.3811600653838004, + "learning_rate": 1.3067521282979085e-05, + "loss": 0.8618, + "num_tokens": 31197098186.0, + "step": 7464 + }, + { + "epoch": 0.887106357694593, + "grad_norm": 0.5290370909580473, + "learning_rate": 1.3065835599622621e-05, + "loss": 0.8869, + "num_tokens": 31201263592.0, + "step": 7465 + }, + { + "epoch": 0.887225193107546, + "grad_norm": 0.37397581195383045, + "learning_rate": 1.3064149839761283e-05, + "loss": 0.8263, + "num_tokens": 31205452800.0, + "step": 7466 + }, + { + "epoch": 0.8873440285204991, + "grad_norm": 0.46682382848814624, + "learning_rate": 1.3062464003457498e-05, + "loss": 0.8414, + "num_tokens": 31209639146.0, + "step": 7467 + }, + { + "epoch": 0.8874628639334522, + "grad_norm": 0.3542265903975275, + "learning_rate": 1.3060778090773699e-05, + "loss": 0.8254, + "num_tokens": 31213827848.0, + "step": 7468 + }, + { + "epoch": 0.8875816993464052, + "grad_norm": 0.4293417494513793, + "learning_rate": 1.3059092101772315e-05, + "loss": 0.8159, + "num_tokens": 31217999256.0, + "step": 7469 + }, + { + "epoch": 0.8877005347593583, + "grad_norm": 0.41812699244407087, + "learning_rate": 1.305740603651579e-05, + "loss": 0.8268, + "num_tokens": 31222178910.0, + "step": 7470 + }, + { + "epoch": 0.8878193701723114, + "grad_norm": 0.31888327570136243, + "learning_rate": 1.3055719895066566e-05, + "loss": 0.8327, + "num_tokens": 31226367966.0, + "step": 7471 + }, + { + "epoch": 0.8879382055852644, + "grad_norm": 0.5133991640507064, + "learning_rate": 1.305403367748708e-05, + "loss": 0.8218, + "num_tokens": 31230557823.0, + "step": 7472 + }, + { + "epoch": 0.8880570409982175, + "grad_norm": 0.4482849934076682, + "learning_rate": 1.3052347383839782e-05, + "loss": 0.797, + "num_tokens": 31234747122.0, + "step": 7473 + }, + { + "epoch": 0.8881758764111706, + "grad_norm": 0.3860867324809686, + "learning_rate": 1.3050661014187124e-05, + "loss": 0.8224, + "num_tokens": 31238921832.0, + "step": 7474 + }, + { + "epoch": 0.8882947118241236, + "grad_norm": 0.48519406923000735, + "learning_rate": 1.3048974568591554e-05, + "loss": 0.8387, + "num_tokens": 31243110942.0, + "step": 7475 + }, + { + "epoch": 0.8884135472370767, + "grad_norm": 0.3910244706150254, + "learning_rate": 1.3047288047115527e-05, + "loss": 0.8556, + "num_tokens": 31247299493.0, + "step": 7476 + }, + { + "epoch": 0.8885323826500298, + "grad_norm": 0.5015378864217426, + "learning_rate": 1.3045601449821504e-05, + "loss": 0.8368, + "num_tokens": 31251449089.0, + "step": 7477 + }, + { + "epoch": 0.8886512180629828, + "grad_norm": 0.36972844506088653, + "learning_rate": 1.3043914776771943e-05, + "loss": 0.8364, + "num_tokens": 31255604500.0, + "step": 7478 + }, + { + "epoch": 0.8887700534759359, + "grad_norm": 0.6184951927483142, + "learning_rate": 1.3042228028029305e-05, + "loss": 0.8739, + "num_tokens": 31259793428.0, + "step": 7479 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.4155781087064828, + "learning_rate": 1.304054120365606e-05, + "loss": 0.8104, + "num_tokens": 31263982136.0, + "step": 7480 + }, + { + "epoch": 0.8890077243018419, + "grad_norm": 0.5082681695546706, + "learning_rate": 1.3038854303714672e-05, + "loss": 0.7811, + "num_tokens": 31268151713.0, + "step": 7481 + }, + { + "epoch": 0.889126559714795, + "grad_norm": 0.3972860154761633, + "learning_rate": 1.303716732826762e-05, + "loss": 0.8465, + "num_tokens": 31272326630.0, + "step": 7482 + }, + { + "epoch": 0.889245395127748, + "grad_norm": 0.5214524952569595, + "learning_rate": 1.3035480277377372e-05, + "loss": 0.8135, + "num_tokens": 31276514858.0, + "step": 7483 + }, + { + "epoch": 0.8893642305407011, + "grad_norm": 0.39379737479980603, + "learning_rate": 1.3033793151106405e-05, + "loss": 0.8303, + "num_tokens": 31280703371.0, + "step": 7484 + }, + { + "epoch": 0.8894830659536542, + "grad_norm": 0.5252523199832206, + "learning_rate": 1.3032105949517204e-05, + "loss": 0.8314, + "num_tokens": 31284892398.0, + "step": 7485 + }, + { + "epoch": 0.8896019013666072, + "grad_norm": 0.41106983583667184, + "learning_rate": 1.3030418672672247e-05, + "loss": 0.8482, + "num_tokens": 31289059473.0, + "step": 7486 + }, + { + "epoch": 0.8897207367795603, + "grad_norm": 0.5224190724720446, + "learning_rate": 1.3028731320634023e-05, + "loss": 0.8301, + "num_tokens": 31293249512.0, + "step": 7487 + }, + { + "epoch": 0.8898395721925134, + "grad_norm": 0.409109294034334, + "learning_rate": 1.3027043893465017e-05, + "loss": 0.8502, + "num_tokens": 31297436775.0, + "step": 7488 + }, + { + "epoch": 0.8899584076054664, + "grad_norm": 0.5235093639352928, + "learning_rate": 1.3025356391227724e-05, + "loss": 0.8614, + "num_tokens": 31301626689.0, + "step": 7489 + }, + { + "epoch": 0.8900772430184195, + "grad_norm": 0.4031222945531992, + "learning_rate": 1.3023668813984635e-05, + "loss": 0.8023, + "num_tokens": 31305793053.0, + "step": 7490 + }, + { + "epoch": 0.8901960784313725, + "grad_norm": 0.4385872429936606, + "learning_rate": 1.3021981161798248e-05, + "loss": 0.8294, + "num_tokens": 31309981572.0, + "step": 7491 + }, + { + "epoch": 0.8903149138443256, + "grad_norm": 0.39117926200232017, + "learning_rate": 1.3020293434731062e-05, + "loss": 0.8615, + "num_tokens": 31314149197.0, + "step": 7492 + }, + { + "epoch": 0.8904337492572787, + "grad_norm": 0.3763385306694377, + "learning_rate": 1.3018605632845582e-05, + "loss": 0.8492, + "num_tokens": 31318338937.0, + "step": 7493 + }, + { + "epoch": 0.8905525846702317, + "grad_norm": 0.4637538679414311, + "learning_rate": 1.3016917756204305e-05, + "loss": 0.865, + "num_tokens": 31322501155.0, + "step": 7494 + }, + { + "epoch": 0.8906714200831848, + "grad_norm": 0.34825047742334975, + "learning_rate": 1.3015229804869747e-05, + "loss": 0.8099, + "num_tokens": 31326672351.0, + "step": 7495 + }, + { + "epoch": 0.8907902554961379, + "grad_norm": 0.4415927452194492, + "learning_rate": 1.3013541778904419e-05, + "loss": 0.8273, + "num_tokens": 31330815753.0, + "step": 7496 + }, + { + "epoch": 0.8909090909090909, + "grad_norm": 0.34782322273014327, + "learning_rate": 1.3011853678370823e-05, + "loss": 0.818, + "num_tokens": 31335004128.0, + "step": 7497 + }, + { + "epoch": 0.891027926322044, + "grad_norm": 0.4693564060080957, + "learning_rate": 1.3010165503331491e-05, + "loss": 0.8628, + "num_tokens": 31339192754.0, + "step": 7498 + }, + { + "epoch": 0.8911467617349971, + "grad_norm": 0.3754564352319871, + "learning_rate": 1.300847725384893e-05, + "loss": 0.8322, + "num_tokens": 31343379141.0, + "step": 7499 + }, + { + "epoch": 0.8912655971479501, + "grad_norm": 0.39518735207856326, + "learning_rate": 1.3006788929985663e-05, + "loss": 0.8645, + "num_tokens": 31347567504.0, + "step": 7500 + }, + { + "epoch": 0.8913844325609032, + "grad_norm": 0.36972354621171905, + "learning_rate": 1.3005100531804222e-05, + "loss": 0.8305, + "num_tokens": 31351756779.0, + "step": 7501 + }, + { + "epoch": 0.8915032679738563, + "grad_norm": 0.41766620150046935, + "learning_rate": 1.3003412059367126e-05, + "loss": 0.8677, + "num_tokens": 31355946068.0, + "step": 7502 + }, + { + "epoch": 0.8916221033868093, + "grad_norm": 0.42503806553875867, + "learning_rate": 1.3001723512736908e-05, + "loss": 0.7956, + "num_tokens": 31360113944.0, + "step": 7503 + }, + { + "epoch": 0.8917409387997624, + "grad_norm": 0.40789712485748064, + "learning_rate": 1.3000034891976103e-05, + "loss": 0.8486, + "num_tokens": 31364271407.0, + "step": 7504 + }, + { + "epoch": 0.8918597742127153, + "grad_norm": 0.39194811639899374, + "learning_rate": 1.299834619714724e-05, + "loss": 0.8318, + "num_tokens": 31368460228.0, + "step": 7505 + }, + { + "epoch": 0.8919786096256684, + "grad_norm": 0.34739256485876774, + "learning_rate": 1.2996657428312865e-05, + "loss": 0.8133, + "num_tokens": 31372649861.0, + "step": 7506 + }, + { + "epoch": 0.8920974450386215, + "grad_norm": 0.45444167650262723, + "learning_rate": 1.299496858553551e-05, + "loss": 0.8637, + "num_tokens": 31376810927.0, + "step": 7507 + }, + { + "epoch": 0.8922162804515745, + "grad_norm": 0.356735449740932, + "learning_rate": 1.2993279668877724e-05, + "loss": 0.8653, + "num_tokens": 31381000699.0, + "step": 7508 + }, + { + "epoch": 0.8923351158645276, + "grad_norm": 0.40574121336308916, + "learning_rate": 1.2991590678402056e-05, + "loss": 0.829, + "num_tokens": 31385188951.0, + "step": 7509 + }, + { + "epoch": 0.8924539512774807, + "grad_norm": 0.35359414042500226, + "learning_rate": 1.2989901614171046e-05, + "loss": 0.8596, + "num_tokens": 31389358628.0, + "step": 7510 + }, + { + "epoch": 0.8925727866904337, + "grad_norm": 0.40582182584639725, + "learning_rate": 1.2988212476247254e-05, + "loss": 0.8136, + "num_tokens": 31393503923.0, + "step": 7511 + }, + { + "epoch": 0.8926916221033868, + "grad_norm": 0.39972746413002214, + "learning_rate": 1.2986523264693234e-05, + "loss": 0.8526, + "num_tokens": 31397684715.0, + "step": 7512 + }, + { + "epoch": 0.8928104575163399, + "grad_norm": 0.3778375015940162, + "learning_rate": 1.2984833979571533e-05, + "loss": 0.823, + "num_tokens": 31401869180.0, + "step": 7513 + }, + { + "epoch": 0.8929292929292929, + "grad_norm": 0.39095842122716007, + "learning_rate": 1.2983144620944727e-05, + "loss": 0.8379, + "num_tokens": 31406047451.0, + "step": 7514 + }, + { + "epoch": 0.893048128342246, + "grad_norm": 0.340027321185095, + "learning_rate": 1.2981455188875369e-05, + "loss": 0.8447, + "num_tokens": 31410235843.0, + "step": 7515 + }, + { + "epoch": 0.893166963755199, + "grad_norm": 0.37869809516364605, + "learning_rate": 1.2979765683426024e-05, + "loss": 0.8262, + "num_tokens": 31414424935.0, + "step": 7516 + }, + { + "epoch": 0.8932857991681521, + "grad_norm": 0.3525567248182435, + "learning_rate": 1.2978076104659265e-05, + "loss": 0.8612, + "num_tokens": 31418615275.0, + "step": 7517 + }, + { + "epoch": 0.8934046345811052, + "grad_norm": 0.4504761494947001, + "learning_rate": 1.2976386452637658e-05, + "loss": 0.8394, + "num_tokens": 31422804807.0, + "step": 7518 + }, + { + "epoch": 0.8935234699940582, + "grad_norm": 0.3868690420944458, + "learning_rate": 1.297469672742378e-05, + "loss": 0.8095, + "num_tokens": 31426992772.0, + "step": 7519 + }, + { + "epoch": 0.8936423054070113, + "grad_norm": 0.3815374397313509, + "learning_rate": 1.2973006929080201e-05, + "loss": 0.8477, + "num_tokens": 31431132667.0, + "step": 7520 + }, + { + "epoch": 0.8937611408199644, + "grad_norm": 0.3289134524901052, + "learning_rate": 1.2971317057669505e-05, + "loss": 0.8643, + "num_tokens": 31435322498.0, + "step": 7521 + }, + { + "epoch": 0.8938799762329174, + "grad_norm": 0.3755388995666592, + "learning_rate": 1.2969627113254277e-05, + "loss": 0.8316, + "num_tokens": 31439510895.0, + "step": 7522 + }, + { + "epoch": 0.8939988116458705, + "grad_norm": 0.49186938961436477, + "learning_rate": 1.2967937095897096e-05, + "loss": 0.8033, + "num_tokens": 31443700271.0, + "step": 7523 + }, + { + "epoch": 0.8941176470588236, + "grad_norm": 0.37176706442592017, + "learning_rate": 1.296624700566055e-05, + "loss": 0.8257, + "num_tokens": 31447890848.0, + "step": 7524 + }, + { + "epoch": 0.8942364824717766, + "grad_norm": 0.4000585232200607, + "learning_rate": 1.296455684260723e-05, + "loss": 0.8858, + "num_tokens": 31452078010.0, + "step": 7525 + }, + { + "epoch": 0.8943553178847297, + "grad_norm": 0.33923143337432315, + "learning_rate": 1.2962866606799727e-05, + "loss": 0.8007, + "num_tokens": 31456268626.0, + "step": 7526 + }, + { + "epoch": 0.8944741532976828, + "grad_norm": 0.40376998592645286, + "learning_rate": 1.2961176298300636e-05, + "loss": 0.8294, + "num_tokens": 31460452467.0, + "step": 7527 + }, + { + "epoch": 0.8945929887106358, + "grad_norm": 0.444349401918613, + "learning_rate": 1.295948591717256e-05, + "loss": 0.8082, + "num_tokens": 31464633785.0, + "step": 7528 + }, + { + "epoch": 0.8947118241235889, + "grad_norm": 0.3375742000598805, + "learning_rate": 1.2957795463478092e-05, + "loss": 0.8314, + "num_tokens": 31468824773.0, + "step": 7529 + }, + { + "epoch": 0.8948306595365418, + "grad_norm": 0.41101299325908003, + "learning_rate": 1.295610493727984e-05, + "loss": 0.8792, + "num_tokens": 31473013222.0, + "step": 7530 + }, + { + "epoch": 0.8949494949494949, + "grad_norm": 0.38011389367121556, + "learning_rate": 1.2954414338640407e-05, + "loss": 0.8702, + "num_tokens": 31477195339.0, + "step": 7531 + }, + { + "epoch": 0.895068330362448, + "grad_norm": 0.5071609302743609, + "learning_rate": 1.2952723667622402e-05, + "loss": 0.8469, + "num_tokens": 31481385101.0, + "step": 7532 + }, + { + "epoch": 0.895187165775401, + "grad_norm": 0.33912394969315535, + "learning_rate": 1.295103292428844e-05, + "loss": 0.8223, + "num_tokens": 31485556153.0, + "step": 7533 + }, + { + "epoch": 0.8953060011883541, + "grad_norm": 0.5113512721508456, + "learning_rate": 1.2949342108701129e-05, + "loss": 0.8701, + "num_tokens": 31489744236.0, + "step": 7534 + }, + { + "epoch": 0.8954248366013072, + "grad_norm": 0.4448214438165788, + "learning_rate": 1.2947651220923092e-05, + "loss": 0.8261, + "num_tokens": 31493916792.0, + "step": 7535 + }, + { + "epoch": 0.8955436720142602, + "grad_norm": 0.453082742820516, + "learning_rate": 1.2945960261016943e-05, + "loss": 0.8182, + "num_tokens": 31498086412.0, + "step": 7536 + }, + { + "epoch": 0.8956625074272133, + "grad_norm": 0.4122079935936538, + "learning_rate": 1.2944269229045308e-05, + "loss": 0.8698, + "num_tokens": 31502274343.0, + "step": 7537 + }, + { + "epoch": 0.8957813428401664, + "grad_norm": 0.5480942699285466, + "learning_rate": 1.294257812507081e-05, + "loss": 0.8415, + "num_tokens": 31506463150.0, + "step": 7538 + }, + { + "epoch": 0.8959001782531194, + "grad_norm": 0.37835455959046876, + "learning_rate": 1.2940886949156073e-05, + "loss": 0.8703, + "num_tokens": 31510635180.0, + "step": 7539 + }, + { + "epoch": 0.8960190136660725, + "grad_norm": 0.5777261199368988, + "learning_rate": 1.2939195701363734e-05, + "loss": 0.8346, + "num_tokens": 31514824625.0, + "step": 7540 + }, + { + "epoch": 0.8961378490790255, + "grad_norm": 0.5083174535427067, + "learning_rate": 1.2937504381756418e-05, + "loss": 0.8358, + "num_tokens": 31519003784.0, + "step": 7541 + }, + { + "epoch": 0.8962566844919786, + "grad_norm": 0.45256426714081965, + "learning_rate": 1.2935812990396765e-05, + "loss": 0.8505, + "num_tokens": 31523192262.0, + "step": 7542 + }, + { + "epoch": 0.8963755199049317, + "grad_norm": 0.572716723920881, + "learning_rate": 1.2934121527347412e-05, + "loss": 0.8075, + "num_tokens": 31527346184.0, + "step": 7543 + }, + { + "epoch": 0.8964943553178847, + "grad_norm": 0.3755694099126516, + "learning_rate": 1.2932429992671e-05, + "loss": 0.872, + "num_tokens": 31531504291.0, + "step": 7544 + }, + { + "epoch": 0.8966131907308378, + "grad_norm": 0.5396473220834314, + "learning_rate": 1.2930738386430173e-05, + "loss": 0.8275, + "num_tokens": 31535672882.0, + "step": 7545 + }, + { + "epoch": 0.8967320261437909, + "grad_norm": 0.44836678901568494, + "learning_rate": 1.2929046708687571e-05, + "loss": 0.8736, + "num_tokens": 31539837913.0, + "step": 7546 + }, + { + "epoch": 0.8968508615567439, + "grad_norm": 0.3607644121344064, + "learning_rate": 1.2927354959505851e-05, + "loss": 0.8254, + "num_tokens": 31543997996.0, + "step": 7547 + }, + { + "epoch": 0.896969696969697, + "grad_norm": 0.49138998977158677, + "learning_rate": 1.2925663138947657e-05, + "loss": 0.8227, + "num_tokens": 31548186587.0, + "step": 7548 + }, + { + "epoch": 0.8970885323826501, + "grad_norm": 0.4724238956714443, + "learning_rate": 1.2923971247075651e-05, + "loss": 0.8335, + "num_tokens": 31552347792.0, + "step": 7549 + }, + { + "epoch": 0.8972073677956031, + "grad_norm": 0.4155040463886367, + "learning_rate": 1.2922279283952482e-05, + "loss": 0.8336, + "num_tokens": 31556530733.0, + "step": 7550 + }, + { + "epoch": 0.8973262032085562, + "grad_norm": 0.5619254353343512, + "learning_rate": 1.292058724964081e-05, + "loss": 0.8234, + "num_tokens": 31560716743.0, + "step": 7551 + }, + { + "epoch": 0.8974450386215093, + "grad_norm": 0.4062240218288755, + "learning_rate": 1.2918895144203301e-05, + "loss": 0.799, + "num_tokens": 31564887297.0, + "step": 7552 + }, + { + "epoch": 0.8975638740344623, + "grad_norm": 0.4873786742865021, + "learning_rate": 1.2917202967702616e-05, + "loss": 0.8393, + "num_tokens": 31569077660.0, + "step": 7553 + }, + { + "epoch": 0.8976827094474154, + "grad_norm": 0.5301237726068105, + "learning_rate": 1.2915510720201423e-05, + "loss": 0.8597, + "num_tokens": 31573266532.0, + "step": 7554 + }, + { + "epoch": 0.8978015448603683, + "grad_norm": 0.40272814437818605, + "learning_rate": 1.291381840176239e-05, + "loss": 0.8125, + "num_tokens": 31577456097.0, + "step": 7555 + }, + { + "epoch": 0.8979203802733214, + "grad_norm": 0.5188422452909555, + "learning_rate": 1.2912126012448193e-05, + "loss": 0.8404, + "num_tokens": 31581644919.0, + "step": 7556 + }, + { + "epoch": 0.8980392156862745, + "grad_norm": 0.4775710448688098, + "learning_rate": 1.2910433552321506e-05, + "loss": 0.8346, + "num_tokens": 31585832171.0, + "step": 7557 + }, + { + "epoch": 0.8981580510992275, + "grad_norm": 0.42290228844591216, + "learning_rate": 1.2908741021445002e-05, + "loss": 0.8231, + "num_tokens": 31589998927.0, + "step": 7558 + }, + { + "epoch": 0.8982768865121806, + "grad_norm": 0.5266398812938251, + "learning_rate": 1.2907048419881367e-05, + "loss": 0.8802, + "num_tokens": 31594188628.0, + "step": 7559 + }, + { + "epoch": 0.8983957219251337, + "grad_norm": 0.367714664987697, + "learning_rate": 1.2905355747693278e-05, + "loss": 0.8687, + "num_tokens": 31598377856.0, + "step": 7560 + }, + { + "epoch": 0.8985145573380867, + "grad_norm": 0.6219698251884932, + "learning_rate": 1.2903663004943425e-05, + "loss": 0.8266, + "num_tokens": 31602567843.0, + "step": 7561 + }, + { + "epoch": 0.8986333927510398, + "grad_norm": 0.4112244605963302, + "learning_rate": 1.2901970191694495e-05, + "loss": 0.8348, + "num_tokens": 31606756550.0, + "step": 7562 + }, + { + "epoch": 0.8987522281639929, + "grad_norm": 0.6804576505818594, + "learning_rate": 1.2900277308009177e-05, + "loss": 0.8097, + "num_tokens": 31610944376.0, + "step": 7563 + }, + { + "epoch": 0.8988710635769459, + "grad_norm": 0.4926753881995214, + "learning_rate": 1.2898584353950168e-05, + "loss": 0.846, + "num_tokens": 31615133594.0, + "step": 7564 + }, + { + "epoch": 0.898989898989899, + "grad_norm": 0.6879986351471568, + "learning_rate": 1.2896891329580156e-05, + "loss": 0.8906, + "num_tokens": 31619296154.0, + "step": 7565 + }, + { + "epoch": 0.899108734402852, + "grad_norm": 0.5332178052178312, + "learning_rate": 1.2895198234961852e-05, + "loss": 0.8585, + "num_tokens": 31623484302.0, + "step": 7566 + }, + { + "epoch": 0.8992275698158051, + "grad_norm": 0.6145575830695656, + "learning_rate": 1.2893505070157941e-05, + "loss": 0.8239, + "num_tokens": 31627672125.0, + "step": 7567 + }, + { + "epoch": 0.8993464052287582, + "grad_norm": 0.526861689806997, + "learning_rate": 1.289181183523114e-05, + "loss": 0.8275, + "num_tokens": 31631836372.0, + "step": 7568 + }, + { + "epoch": 0.8994652406417112, + "grad_norm": 0.60569282937288, + "learning_rate": 1.2890118530244153e-05, + "loss": 0.7916, + "num_tokens": 31636024464.0, + "step": 7569 + }, + { + "epoch": 0.8995840760546643, + "grad_norm": 0.5014290248701025, + "learning_rate": 1.2888425155259679e-05, + "loss": 0.8166, + "num_tokens": 31640204621.0, + "step": 7570 + }, + { + "epoch": 0.8997029114676174, + "grad_norm": 0.642192193322983, + "learning_rate": 1.2886731710340444e-05, + "loss": 0.8472, + "num_tokens": 31644393641.0, + "step": 7571 + }, + { + "epoch": 0.8998217468805704, + "grad_norm": 0.5558282526207705, + "learning_rate": 1.2885038195549152e-05, + "loss": 0.8283, + "num_tokens": 31648582616.0, + "step": 7572 + }, + { + "epoch": 0.8999405822935235, + "grad_norm": 0.5890129342160186, + "learning_rate": 1.288334461094852e-05, + "loss": 0.8284, + "num_tokens": 31652771317.0, + "step": 7573 + }, + { + "epoch": 0.9000594177064766, + "grad_norm": 0.5475185730146623, + "learning_rate": 1.288165095660127e-05, + "loss": 0.8322, + "num_tokens": 31656906466.0, + "step": 7574 + }, + { + "epoch": 0.9001782531194296, + "grad_norm": 0.5841710115967144, + "learning_rate": 1.2879957232570124e-05, + "loss": 0.8496, + "num_tokens": 31661086201.0, + "step": 7575 + }, + { + "epoch": 0.9002970885323827, + "grad_norm": 0.5151605282070709, + "learning_rate": 1.2878263438917805e-05, + "loss": 0.8491, + "num_tokens": 31665274803.0, + "step": 7576 + }, + { + "epoch": 0.9004159239453358, + "grad_norm": 0.5848341897026633, + "learning_rate": 1.287656957570704e-05, + "loss": 0.8351, + "num_tokens": 31669452769.0, + "step": 7577 + }, + { + "epoch": 0.9005347593582887, + "grad_norm": 0.5080387958096797, + "learning_rate": 1.2874875643000557e-05, + "loss": 0.864, + "num_tokens": 31673641171.0, + "step": 7578 + }, + { + "epoch": 0.9006535947712418, + "grad_norm": 0.6010209593698826, + "learning_rate": 1.2873181640861095e-05, + "loss": 0.8203, + "num_tokens": 31677818373.0, + "step": 7579 + }, + { + "epoch": 0.9007724301841948, + "grad_norm": 0.5093259294391557, + "learning_rate": 1.287148756935138e-05, + "loss": 0.8279, + "num_tokens": 31682007435.0, + "step": 7580 + }, + { + "epoch": 0.9008912655971479, + "grad_norm": 0.5739084212565316, + "learning_rate": 1.286979342853415e-05, + "loss": 0.8472, + "num_tokens": 31686185169.0, + "step": 7581 + }, + { + "epoch": 0.901010101010101, + "grad_norm": 0.5261940290686945, + "learning_rate": 1.2868099218472154e-05, + "loss": 0.828, + "num_tokens": 31690372654.0, + "step": 7582 + }, + { + "epoch": 0.901128936423054, + "grad_norm": 0.5451050499212198, + "learning_rate": 1.2866404939228123e-05, + "loss": 0.8458, + "num_tokens": 31694539936.0, + "step": 7583 + }, + { + "epoch": 0.9012477718360071, + "grad_norm": 0.5156796040365199, + "learning_rate": 1.2864710590864808e-05, + "loss": 0.8705, + "num_tokens": 31698729234.0, + "step": 7584 + }, + { + "epoch": 0.9013666072489602, + "grad_norm": 0.5150908780218417, + "learning_rate": 1.2863016173444955e-05, + "loss": 0.817, + "num_tokens": 31702917608.0, + "step": 7585 + }, + { + "epoch": 0.9014854426619132, + "grad_norm": 0.46577950286173825, + "learning_rate": 1.2861321687031314e-05, + "loss": 0.8221, + "num_tokens": 31707107389.0, + "step": 7586 + }, + { + "epoch": 0.9016042780748663, + "grad_norm": 0.5472569785276873, + "learning_rate": 1.2859627131686639e-05, + "loss": 0.8311, + "num_tokens": 31711296618.0, + "step": 7587 + }, + { + "epoch": 0.9017231134878194, + "grad_norm": 0.44490245229345465, + "learning_rate": 1.285793250747368e-05, + "loss": 0.8161, + "num_tokens": 31715485926.0, + "step": 7588 + }, + { + "epoch": 0.9018419489007724, + "grad_norm": 0.639472629890147, + "learning_rate": 1.2856237814455201e-05, + "loss": 0.8333, + "num_tokens": 31719676098.0, + "step": 7589 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 0.5132995415299139, + "learning_rate": 1.285454305269396e-05, + "loss": 0.8216, + "num_tokens": 31723863602.0, + "step": 7590 + }, + { + "epoch": 0.9020796197266786, + "grad_norm": 0.6120174459021487, + "learning_rate": 1.285284822225272e-05, + "loss": 0.838, + "num_tokens": 31728051594.0, + "step": 7591 + }, + { + "epoch": 0.9021984551396316, + "grad_norm": 0.5516191055559767, + "learning_rate": 1.2851153323194245e-05, + "loss": 0.8737, + "num_tokens": 31732239042.0, + "step": 7592 + }, + { + "epoch": 0.9023172905525847, + "grad_norm": 0.5420005900543977, + "learning_rate": 1.2849458355581302e-05, + "loss": 0.8763, + "num_tokens": 31736427133.0, + "step": 7593 + }, + { + "epoch": 0.9024361259655377, + "grad_norm": 0.4789058119369723, + "learning_rate": 1.2847763319476663e-05, + "loss": 0.8804, + "num_tokens": 31740609592.0, + "step": 7594 + }, + { + "epoch": 0.9025549613784908, + "grad_norm": 0.4773923081271288, + "learning_rate": 1.2846068214943104e-05, + "loss": 0.8637, + "num_tokens": 31744796858.0, + "step": 7595 + }, + { + "epoch": 0.9026737967914439, + "grad_norm": 0.4059164504093472, + "learning_rate": 1.2844373042043395e-05, + "loss": 0.8093, + "num_tokens": 31748914237.0, + "step": 7596 + }, + { + "epoch": 0.9027926322043969, + "grad_norm": 0.5608095637189666, + "learning_rate": 1.2842677800840317e-05, + "loss": 0.8238, + "num_tokens": 31753095061.0, + "step": 7597 + }, + { + "epoch": 0.90291146761735, + "grad_norm": 0.4248669962294854, + "learning_rate": 1.2840982491396649e-05, + "loss": 0.8336, + "num_tokens": 31757285058.0, + "step": 7598 + }, + { + "epoch": 0.9030303030303031, + "grad_norm": 0.5984409100895876, + "learning_rate": 1.2839287113775173e-05, + "loss": 0.8149, + "num_tokens": 31761475246.0, + "step": 7599 + }, + { + "epoch": 0.9031491384432561, + "grad_norm": 0.50625755642289, + "learning_rate": 1.2837591668038681e-05, + "loss": 0.8115, + "num_tokens": 31765664756.0, + "step": 7600 + }, + { + "epoch": 0.9032679738562092, + "grad_norm": 0.5334942648712153, + "learning_rate": 1.2835896154249954e-05, + "loss": 0.83, + "num_tokens": 31769853936.0, + "step": 7601 + }, + { + "epoch": 0.9033868092691623, + "grad_norm": 0.5211564413480048, + "learning_rate": 1.2834200572471784e-05, + "loss": 0.8341, + "num_tokens": 31774014931.0, + "step": 7602 + }, + { + "epoch": 0.9035056446821152, + "grad_norm": 0.477678585234529, + "learning_rate": 1.2832504922766968e-05, + "loss": 0.8318, + "num_tokens": 31778160357.0, + "step": 7603 + }, + { + "epoch": 0.9036244800950683, + "grad_norm": 0.44977351623250306, + "learning_rate": 1.2830809205198296e-05, + "loss": 0.7988, + "num_tokens": 31782338849.0, + "step": 7604 + }, + { + "epoch": 0.9037433155080213, + "grad_norm": 0.46398412135416456, + "learning_rate": 1.282911341982857e-05, + "loss": 0.8414, + "num_tokens": 31786504409.0, + "step": 7605 + }, + { + "epoch": 0.9038621509209744, + "grad_norm": 0.40304491782198104, + "learning_rate": 1.282741756672059e-05, + "loss": 0.8145, + "num_tokens": 31790693792.0, + "step": 7606 + }, + { + "epoch": 0.9039809863339275, + "grad_norm": 0.5507176090063863, + "learning_rate": 1.2825721645937157e-05, + "loss": 0.8133, + "num_tokens": 31794855087.0, + "step": 7607 + }, + { + "epoch": 0.9040998217468805, + "grad_norm": 0.4119047864868533, + "learning_rate": 1.2824025657541084e-05, + "loss": 0.8231, + "num_tokens": 31799038195.0, + "step": 7608 + }, + { + "epoch": 0.9042186571598336, + "grad_norm": 0.5615664330182738, + "learning_rate": 1.2822329601595168e-05, + "loss": 0.8244, + "num_tokens": 31803217736.0, + "step": 7609 + }, + { + "epoch": 0.9043374925727867, + "grad_norm": 0.4687091283558556, + "learning_rate": 1.2820633478162227e-05, + "loss": 0.8388, + "num_tokens": 31807407744.0, + "step": 7610 + }, + { + "epoch": 0.9044563279857397, + "grad_norm": 0.5177864088096097, + "learning_rate": 1.2818937287305073e-05, + "loss": 0.8285, + "num_tokens": 31811587342.0, + "step": 7611 + }, + { + "epoch": 0.9045751633986928, + "grad_norm": 0.5021820974704885, + "learning_rate": 1.2817241029086522e-05, + "loss": 0.8174, + "num_tokens": 31815769209.0, + "step": 7612 + }, + { + "epoch": 0.9046939988116459, + "grad_norm": 0.4600477726351104, + "learning_rate": 1.2815544703569391e-05, + "loss": 0.8388, + "num_tokens": 31819940564.0, + "step": 7613 + }, + { + "epoch": 0.9048128342245989, + "grad_norm": 0.44258441857634384, + "learning_rate": 1.2813848310816501e-05, + "loss": 0.8308, + "num_tokens": 31824129107.0, + "step": 7614 + }, + { + "epoch": 0.904931669637552, + "grad_norm": 0.47524834024856266, + "learning_rate": 1.2812151850890676e-05, + "loss": 0.8217, + "num_tokens": 31828274039.0, + "step": 7615 + }, + { + "epoch": 0.9050505050505051, + "grad_norm": 0.4176281919286104, + "learning_rate": 1.281045532385474e-05, + "loss": 0.8323, + "num_tokens": 31832464488.0, + "step": 7616 + }, + { + "epoch": 0.9051693404634581, + "grad_norm": 0.4672892446845184, + "learning_rate": 1.2808758729771522e-05, + "loss": 0.8321, + "num_tokens": 31836653517.0, + "step": 7617 + }, + { + "epoch": 0.9052881758764112, + "grad_norm": 0.41140465226496126, + "learning_rate": 1.2807062068703853e-05, + "loss": 0.8506, + "num_tokens": 31840842446.0, + "step": 7618 + }, + { + "epoch": 0.9054070112893642, + "grad_norm": 0.48863853410069946, + "learning_rate": 1.2805365340714563e-05, + "loss": 0.8043, + "num_tokens": 31845031460.0, + "step": 7619 + }, + { + "epoch": 0.9055258467023173, + "grad_norm": 0.43110924995683514, + "learning_rate": 1.2803668545866495e-05, + "loss": 0.8833, + "num_tokens": 31849219244.0, + "step": 7620 + }, + { + "epoch": 0.9056446821152704, + "grad_norm": 0.46311536005443193, + "learning_rate": 1.280197168422248e-05, + "loss": 0.8413, + "num_tokens": 31853409365.0, + "step": 7621 + }, + { + "epoch": 0.9057635175282234, + "grad_norm": 0.4563119118197377, + "learning_rate": 1.2800274755845362e-05, + "loss": 0.8314, + "num_tokens": 31857599970.0, + "step": 7622 + }, + { + "epoch": 0.9058823529411765, + "grad_norm": 0.45033858752428235, + "learning_rate": 1.2798577760797982e-05, + "loss": 0.8341, + "num_tokens": 31861779586.0, + "step": 7623 + }, + { + "epoch": 0.9060011883541296, + "grad_norm": 0.4168531863972011, + "learning_rate": 1.2796880699143186e-05, + "loss": 0.7976, + "num_tokens": 31865956833.0, + "step": 7624 + }, + { + "epoch": 0.9061200237670826, + "grad_norm": 0.45092320232694616, + "learning_rate": 1.2795183570943823e-05, + "loss": 0.8665, + "num_tokens": 31870132075.0, + "step": 7625 + }, + { + "epoch": 0.9062388591800357, + "grad_norm": 0.40093010144619773, + "learning_rate": 1.2793486376262742e-05, + "loss": 0.8363, + "num_tokens": 31874294709.0, + "step": 7626 + }, + { + "epoch": 0.9063576945929888, + "grad_norm": 0.4732907501888024, + "learning_rate": 1.2791789115162798e-05, + "loss": 0.808, + "num_tokens": 31878435816.0, + "step": 7627 + }, + { + "epoch": 0.9064765300059417, + "grad_norm": 0.42409469214847123, + "learning_rate": 1.2790091787706844e-05, + "loss": 0.8388, + "num_tokens": 31882625297.0, + "step": 7628 + }, + { + "epoch": 0.9065953654188948, + "grad_norm": 0.490703274261906, + "learning_rate": 1.2788394393957739e-05, + "loss": 0.8452, + "num_tokens": 31886813439.0, + "step": 7629 + }, + { + "epoch": 0.9067142008318478, + "grad_norm": 0.4108318094368155, + "learning_rate": 1.2786696933978341e-05, + "loss": 0.844, + "num_tokens": 31890997140.0, + "step": 7630 + }, + { + "epoch": 0.9068330362448009, + "grad_norm": 0.4409840781967004, + "learning_rate": 1.2784999407831518e-05, + "loss": 0.8449, + "num_tokens": 31895185380.0, + "step": 7631 + }, + { + "epoch": 0.906951871657754, + "grad_norm": 0.3837872347196086, + "learning_rate": 1.2783301815580132e-05, + "loss": 0.8101, + "num_tokens": 31899375047.0, + "step": 7632 + }, + { + "epoch": 0.907070707070707, + "grad_norm": 0.5118810877244029, + "learning_rate": 1.2781604157287046e-05, + "loss": 0.8713, + "num_tokens": 31903505063.0, + "step": 7633 + }, + { + "epoch": 0.9071895424836601, + "grad_norm": 0.3903106785342059, + "learning_rate": 1.2779906433015138e-05, + "loss": 0.8339, + "num_tokens": 31907656631.0, + "step": 7634 + }, + { + "epoch": 0.9073083778966132, + "grad_norm": 0.5187820645736018, + "learning_rate": 1.2778208642827274e-05, + "loss": 0.8117, + "num_tokens": 31911846097.0, + "step": 7635 + }, + { + "epoch": 0.9074272133095662, + "grad_norm": 0.49178376807728896, + "learning_rate": 1.2776510786786334e-05, + "loss": 0.881, + "num_tokens": 31916035193.0, + "step": 7636 + }, + { + "epoch": 0.9075460487225193, + "grad_norm": 0.42210889129343754, + "learning_rate": 1.277481286495519e-05, + "loss": 0.8576, + "num_tokens": 31920223744.0, + "step": 7637 + }, + { + "epoch": 0.9076648841354724, + "grad_norm": 0.4685855162156219, + "learning_rate": 1.2773114877396727e-05, + "loss": 0.8064, + "num_tokens": 31924402685.0, + "step": 7638 + }, + { + "epoch": 0.9077837195484254, + "grad_norm": 0.3625917439240674, + "learning_rate": 1.2771416824173828e-05, + "loss": 0.8274, + "num_tokens": 31928542680.0, + "step": 7639 + }, + { + "epoch": 0.9079025549613785, + "grad_norm": 0.3909469328150005, + "learning_rate": 1.276971870534937e-05, + "loss": 0.8858, + "num_tokens": 31932731382.0, + "step": 7640 + }, + { + "epoch": 0.9080213903743316, + "grad_norm": 0.41981128793555095, + "learning_rate": 1.2768020520986244e-05, + "loss": 0.8214, + "num_tokens": 31936915648.0, + "step": 7641 + }, + { + "epoch": 0.9081402257872846, + "grad_norm": 0.3133646309612084, + "learning_rate": 1.2766322271147347e-05, + "loss": 0.8535, + "num_tokens": 31941102072.0, + "step": 7642 + }, + { + "epoch": 0.9082590612002377, + "grad_norm": 0.4526395201872963, + "learning_rate": 1.2764623955895559e-05, + "loss": 0.845, + "num_tokens": 31945278308.0, + "step": 7643 + }, + { + "epoch": 0.9083778966131907, + "grad_norm": 0.37575093270158705, + "learning_rate": 1.2762925575293781e-05, + "loss": 0.8175, + "num_tokens": 31949469316.0, + "step": 7644 + }, + { + "epoch": 0.9084967320261438, + "grad_norm": 0.4236323826944014, + "learning_rate": 1.2761227129404908e-05, + "loss": 0.8371, + "num_tokens": 31953626169.0, + "step": 7645 + }, + { + "epoch": 0.9086155674390969, + "grad_norm": 0.36443244592131213, + "learning_rate": 1.2759528618291836e-05, + "loss": 0.8429, + "num_tokens": 31957743067.0, + "step": 7646 + }, + { + "epoch": 0.9087344028520499, + "grad_norm": 0.4820003114345199, + "learning_rate": 1.2757830042017476e-05, + "loss": 0.8333, + "num_tokens": 31961932617.0, + "step": 7647 + }, + { + "epoch": 0.908853238265003, + "grad_norm": 0.3382817438696264, + "learning_rate": 1.2756131400644722e-05, + "loss": 0.867, + "num_tokens": 31966089401.0, + "step": 7648 + }, + { + "epoch": 0.9089720736779561, + "grad_norm": 0.5129339716218483, + "learning_rate": 1.2754432694236484e-05, + "loss": 0.8346, + "num_tokens": 31970254173.0, + "step": 7649 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.42361175704414045, + "learning_rate": 1.2752733922855679e-05, + "loss": 0.8178, + "num_tokens": 31974424068.0, + "step": 7650 + }, + { + "epoch": 0.9092097445038622, + "grad_norm": 0.45138337226645087, + "learning_rate": 1.2751035086565201e-05, + "loss": 0.8937, + "num_tokens": 31978613240.0, + "step": 7651 + }, + { + "epoch": 0.9093285799168153, + "grad_norm": 0.43486245511432936, + "learning_rate": 1.274933618542798e-05, + "loss": 0.8044, + "num_tokens": 31982798479.0, + "step": 7652 + }, + { + "epoch": 0.9094474153297682, + "grad_norm": 0.4673050731761314, + "learning_rate": 1.274763721950692e-05, + "loss": 0.8529, + "num_tokens": 31986954907.0, + "step": 7653 + }, + { + "epoch": 0.9095662507427213, + "grad_norm": 0.519095083143703, + "learning_rate": 1.274593818886495e-05, + "loss": 0.7728, + "num_tokens": 31991145228.0, + "step": 7654 + }, + { + "epoch": 0.9096850861556743, + "grad_norm": 0.3457885571415099, + "learning_rate": 1.2744239093564985e-05, + "loss": 0.8357, + "num_tokens": 31995322352.0, + "step": 7655 + }, + { + "epoch": 0.9098039215686274, + "grad_norm": 0.5732159195663699, + "learning_rate": 1.2742539933669946e-05, + "loss": 0.8691, + "num_tokens": 31999509442.0, + "step": 7656 + }, + { + "epoch": 0.9099227569815805, + "grad_norm": 0.33530540589204744, + "learning_rate": 1.2740840709242765e-05, + "loss": 0.8215, + "num_tokens": 32003698968.0, + "step": 7657 + }, + { + "epoch": 0.9100415923945335, + "grad_norm": 0.5513934413416833, + "learning_rate": 1.2739141420346367e-05, + "loss": 0.855, + "num_tokens": 32007889134.0, + "step": 7658 + }, + { + "epoch": 0.9101604278074866, + "grad_norm": 0.40179638960203723, + "learning_rate": 1.2737442067043677e-05, + "loss": 0.8288, + "num_tokens": 32012077422.0, + "step": 7659 + }, + { + "epoch": 0.9102792632204397, + "grad_norm": 0.5082775903983567, + "learning_rate": 1.2735742649397635e-05, + "loss": 0.7841, + "num_tokens": 32016246118.0, + "step": 7660 + }, + { + "epoch": 0.9103980986333927, + "grad_norm": 0.38267218241620693, + "learning_rate": 1.2734043167471175e-05, + "loss": 0.8421, + "num_tokens": 32020434284.0, + "step": 7661 + }, + { + "epoch": 0.9105169340463458, + "grad_norm": 0.4855907302961242, + "learning_rate": 1.2732343621327235e-05, + "loss": 0.8539, + "num_tokens": 32024624090.0, + "step": 7662 + }, + { + "epoch": 0.9106357694592989, + "grad_norm": 0.4012734941538126, + "learning_rate": 1.273064401102875e-05, + "loss": 0.8591, + "num_tokens": 32028813630.0, + "step": 7663 + }, + { + "epoch": 0.9107546048722519, + "grad_norm": 0.4994028990350747, + "learning_rate": 1.2728944336638669e-05, + "loss": 0.85, + "num_tokens": 32033004278.0, + "step": 7664 + }, + { + "epoch": 0.910873440285205, + "grad_norm": 0.3978099023754079, + "learning_rate": 1.2727244598219932e-05, + "loss": 0.8287, + "num_tokens": 32037191961.0, + "step": 7665 + }, + { + "epoch": 0.9109922756981581, + "grad_norm": 0.4329685605935335, + "learning_rate": 1.2725544795835487e-05, + "loss": 0.8194, + "num_tokens": 32041381710.0, + "step": 7666 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.42669071046397405, + "learning_rate": 1.2723844929548285e-05, + "loss": 0.8263, + "num_tokens": 32045572151.0, + "step": 7667 + }, + { + "epoch": 0.9112299465240642, + "grad_norm": 0.4140247374139669, + "learning_rate": 1.272214499942128e-05, + "loss": 0.8625, + "num_tokens": 32049761392.0, + "step": 7668 + }, + { + "epoch": 0.9113487819370172, + "grad_norm": 0.38555200996364813, + "learning_rate": 1.2720445005517414e-05, + "loss": 0.8188, + "num_tokens": 32053934456.0, + "step": 7669 + }, + { + "epoch": 0.9114676173499703, + "grad_norm": 0.387217199024434, + "learning_rate": 1.271874494789966e-05, + "loss": 0.8554, + "num_tokens": 32058122618.0, + "step": 7670 + }, + { + "epoch": 0.9115864527629234, + "grad_norm": 0.37508617422965673, + "learning_rate": 1.2717044826630969e-05, + "loss": 0.854, + "num_tokens": 32062283189.0, + "step": 7671 + }, + { + "epoch": 0.9117052881758764, + "grad_norm": 0.47101406701545023, + "learning_rate": 1.2715344641774301e-05, + "loss": 0.8163, + "num_tokens": 32066466043.0, + "step": 7672 + }, + { + "epoch": 0.9118241235888295, + "grad_norm": 0.3895116547993666, + "learning_rate": 1.2713644393392621e-05, + "loss": 0.8707, + "num_tokens": 32070617211.0, + "step": 7673 + }, + { + "epoch": 0.9119429590017826, + "grad_norm": 0.4422460097192029, + "learning_rate": 1.2711944081548896e-05, + "loss": 0.8505, + "num_tokens": 32074807340.0, + "step": 7674 + }, + { + "epoch": 0.9120617944147356, + "grad_norm": 0.3585858959439449, + "learning_rate": 1.2710243706306089e-05, + "loss": 0.8769, + "num_tokens": 32078996792.0, + "step": 7675 + }, + { + "epoch": 0.9121806298276887, + "grad_norm": 0.39683078924442483, + "learning_rate": 1.2708543267727182e-05, + "loss": 0.8138, + "num_tokens": 32083169603.0, + "step": 7676 + }, + { + "epoch": 0.9122994652406418, + "grad_norm": 0.4693462566469864, + "learning_rate": 1.2706842765875138e-05, + "loss": 0.8372, + "num_tokens": 32087342929.0, + "step": 7677 + }, + { + "epoch": 0.9124183006535947, + "grad_norm": 0.3507408976106317, + "learning_rate": 1.2705142200812934e-05, + "loss": 0.8222, + "num_tokens": 32091531904.0, + "step": 7678 + }, + { + "epoch": 0.9125371360665478, + "grad_norm": 0.4770790442446549, + "learning_rate": 1.270344157260355e-05, + "loss": 0.8079, + "num_tokens": 32095693850.0, + "step": 7679 + }, + { + "epoch": 0.9126559714795008, + "grad_norm": 0.3379413454623833, + "learning_rate": 1.2701740881309967e-05, + "loss": 0.8053, + "num_tokens": 32099883244.0, + "step": 7680 + }, + { + "epoch": 0.9127748068924539, + "grad_norm": 0.4351207393358649, + "learning_rate": 1.2700040126995165e-05, + "loss": 0.8135, + "num_tokens": 32104072193.0, + "step": 7681 + }, + { + "epoch": 0.912893642305407, + "grad_norm": 0.4103414390824685, + "learning_rate": 1.2698339309722127e-05, + "loss": 0.8546, + "num_tokens": 32108248929.0, + "step": 7682 + }, + { + "epoch": 0.91301247771836, + "grad_norm": 0.34271964614459427, + "learning_rate": 1.2696638429553845e-05, + "loss": 0.7878, + "num_tokens": 32112436716.0, + "step": 7683 + }, + { + "epoch": 0.9131313131313131, + "grad_norm": 0.4393009130514157, + "learning_rate": 1.2694937486553303e-05, + "loss": 0.8316, + "num_tokens": 32116606277.0, + "step": 7684 + }, + { + "epoch": 0.9132501485442662, + "grad_norm": 0.36694871043882427, + "learning_rate": 1.2693236480783495e-05, + "loss": 0.8554, + "num_tokens": 32120795853.0, + "step": 7685 + }, + { + "epoch": 0.9133689839572192, + "grad_norm": 0.40730680921705786, + "learning_rate": 1.2691535412307418e-05, + "loss": 0.8625, + "num_tokens": 32124984572.0, + "step": 7686 + }, + { + "epoch": 0.9134878193701723, + "grad_norm": 0.45071624337364274, + "learning_rate": 1.2689834281188062e-05, + "loss": 0.806, + "num_tokens": 32129144425.0, + "step": 7687 + }, + { + "epoch": 0.9136066547831254, + "grad_norm": 0.35254989084601335, + "learning_rate": 1.268813308748843e-05, + "loss": 0.8476, + "num_tokens": 32133334493.0, + "step": 7688 + }, + { + "epoch": 0.9137254901960784, + "grad_norm": 0.4625014710023566, + "learning_rate": 1.2686431831271523e-05, + "loss": 0.8551, + "num_tokens": 32137523162.0, + "step": 7689 + }, + { + "epoch": 0.9138443256090315, + "grad_norm": 0.38977204246452934, + "learning_rate": 1.2684730512600343e-05, + "loss": 0.8729, + "num_tokens": 32141713611.0, + "step": 7690 + }, + { + "epoch": 0.9139631610219846, + "grad_norm": 0.4573204562989212, + "learning_rate": 1.2683029131537896e-05, + "loss": 0.8507, + "num_tokens": 32145901554.0, + "step": 7691 + }, + { + "epoch": 0.9140819964349376, + "grad_norm": 0.3674909660993918, + "learning_rate": 1.2681327688147189e-05, + "loss": 0.8706, + "num_tokens": 32150079547.0, + "step": 7692 + }, + { + "epoch": 0.9142008318478907, + "grad_norm": 0.4111020533374487, + "learning_rate": 1.2679626182491233e-05, + "loss": 0.8601, + "num_tokens": 32154268338.0, + "step": 7693 + }, + { + "epoch": 0.9143196672608437, + "grad_norm": 0.4326831676010145, + "learning_rate": 1.2677924614633041e-05, + "loss": 0.8275, + "num_tokens": 32158458282.0, + "step": 7694 + }, + { + "epoch": 0.9144385026737968, + "grad_norm": 0.3905459044505746, + "learning_rate": 1.2676222984635629e-05, + "loss": 0.8461, + "num_tokens": 32162647598.0, + "step": 7695 + }, + { + "epoch": 0.9145573380867499, + "grad_norm": 0.41641668133876614, + "learning_rate": 1.2674521292562012e-05, + "loss": 0.857, + "num_tokens": 32166837013.0, + "step": 7696 + }, + { + "epoch": 0.9146761734997029, + "grad_norm": 0.3569616899845023, + "learning_rate": 1.267281953847521e-05, + "loss": 0.8346, + "num_tokens": 32171027827.0, + "step": 7697 + }, + { + "epoch": 0.914795008912656, + "grad_norm": 0.34666668888425606, + "learning_rate": 1.2671117722438239e-05, + "loss": 0.8329, + "num_tokens": 32175187147.0, + "step": 7698 + }, + { + "epoch": 0.9149138443256091, + "grad_norm": 0.40908988798276724, + "learning_rate": 1.2669415844514138e-05, + "loss": 0.8643, + "num_tokens": 32179377409.0, + "step": 7699 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 0.3691926019954228, + "learning_rate": 1.2667713904765916e-05, + "loss": 0.851, + "num_tokens": 32183565034.0, + "step": 7700 + }, + { + "epoch": 0.9151515151515152, + "grad_norm": 0.3731756664302854, + "learning_rate": 1.266601190325661e-05, + "loss": 0.8412, + "num_tokens": 32187753128.0, + "step": 7701 + }, + { + "epoch": 0.9152703505644683, + "grad_norm": 0.3687064251076404, + "learning_rate": 1.2664309840049255e-05, + "loss": 0.849, + "num_tokens": 32191942306.0, + "step": 7702 + }, + { + "epoch": 0.9153891859774212, + "grad_norm": 0.4337966025988624, + "learning_rate": 1.2662607715206876e-05, + "loss": 0.859, + "num_tokens": 32196131347.0, + "step": 7703 + }, + { + "epoch": 0.9155080213903743, + "grad_norm": 0.3392112248113842, + "learning_rate": 1.2660905528792516e-05, + "loss": 0.8547, + "num_tokens": 32200320219.0, + "step": 7704 + }, + { + "epoch": 0.9156268568033273, + "grad_norm": 0.46270399601158463, + "learning_rate": 1.2659203280869204e-05, + "loss": 0.8708, + "num_tokens": 32204510118.0, + "step": 7705 + }, + { + "epoch": 0.9157456922162804, + "grad_norm": 0.4066328372991764, + "learning_rate": 1.2657500971499986e-05, + "loss": 0.8581, + "num_tokens": 32208669434.0, + "step": 7706 + }, + { + "epoch": 0.9158645276292335, + "grad_norm": 0.37757366156841055, + "learning_rate": 1.2655798600747904e-05, + "loss": 0.7922, + "num_tokens": 32212795146.0, + "step": 7707 + }, + { + "epoch": 0.9159833630421865, + "grad_norm": 0.4501027164423708, + "learning_rate": 1.2654096168675999e-05, + "loss": 0.8301, + "num_tokens": 32216986230.0, + "step": 7708 + }, + { + "epoch": 0.9161021984551396, + "grad_norm": 0.4050994533692004, + "learning_rate": 1.2652393675347321e-05, + "loss": 0.8525, + "num_tokens": 32221171331.0, + "step": 7709 + }, + { + "epoch": 0.9162210338680927, + "grad_norm": 0.455823238280721, + "learning_rate": 1.2650691120824918e-05, + "loss": 0.8269, + "num_tokens": 32225354781.0, + "step": 7710 + }, + { + "epoch": 0.9163398692810457, + "grad_norm": 0.3826509844139426, + "learning_rate": 1.2648988505171845e-05, + "loss": 0.8218, + "num_tokens": 32229520491.0, + "step": 7711 + }, + { + "epoch": 0.9164587046939988, + "grad_norm": 0.467836329696951, + "learning_rate": 1.264728582845115e-05, + "loss": 0.8495, + "num_tokens": 32233698303.0, + "step": 7712 + }, + { + "epoch": 0.9165775401069519, + "grad_norm": 0.39964003028034517, + "learning_rate": 1.264558309072589e-05, + "loss": 0.8665, + "num_tokens": 32237869977.0, + "step": 7713 + }, + { + "epoch": 0.9166963755199049, + "grad_norm": 0.42203468608840183, + "learning_rate": 1.2643880292059123e-05, + "loss": 0.8137, + "num_tokens": 32242059461.0, + "step": 7714 + }, + { + "epoch": 0.916815210932858, + "grad_norm": 0.3768520867296843, + "learning_rate": 1.2642177432513916e-05, + "loss": 0.8143, + "num_tokens": 32246247348.0, + "step": 7715 + }, + { + "epoch": 0.9169340463458111, + "grad_norm": 0.3850511070617202, + "learning_rate": 1.2640474512153324e-05, + "loss": 0.8486, + "num_tokens": 32250434652.0, + "step": 7716 + }, + { + "epoch": 0.9170528817587641, + "grad_norm": 0.37348314856787657, + "learning_rate": 1.2638771531040414e-05, + "loss": 0.7958, + "num_tokens": 32254625338.0, + "step": 7717 + }, + { + "epoch": 0.9171717171717172, + "grad_norm": 0.3492470882826307, + "learning_rate": 1.2637068489238253e-05, + "loss": 0.8526, + "num_tokens": 32258814960.0, + "step": 7718 + }, + { + "epoch": 0.9172905525846702, + "grad_norm": 0.41623725812812834, + "learning_rate": 1.263536538680991e-05, + "loss": 0.8325, + "num_tokens": 32263004820.0, + "step": 7719 + }, + { + "epoch": 0.9174093879976233, + "grad_norm": 0.42334332729717683, + "learning_rate": 1.2633662223818456e-05, + "loss": 0.8506, + "num_tokens": 32267147129.0, + "step": 7720 + }, + { + "epoch": 0.9175282234105764, + "grad_norm": 0.39319086599682074, + "learning_rate": 1.263195900032697e-05, + "loss": 0.8451, + "num_tokens": 32271336587.0, + "step": 7721 + }, + { + "epoch": 0.9176470588235294, + "grad_norm": 0.39884362218954805, + "learning_rate": 1.2630255716398525e-05, + "loss": 0.8431, + "num_tokens": 32275496368.0, + "step": 7722 + }, + { + "epoch": 0.9177658942364825, + "grad_norm": 0.32941421180795044, + "learning_rate": 1.2628552372096195e-05, + "loss": 0.8563, + "num_tokens": 32279683935.0, + "step": 7723 + }, + { + "epoch": 0.9178847296494356, + "grad_norm": 0.4291359411609367, + "learning_rate": 1.2626848967483066e-05, + "loss": 0.794, + "num_tokens": 32283864717.0, + "step": 7724 + }, + { + "epoch": 0.9180035650623886, + "grad_norm": 0.45342631077734324, + "learning_rate": 1.262514550262222e-05, + "loss": 0.8241, + "num_tokens": 32288052690.0, + "step": 7725 + }, + { + "epoch": 0.9181224004753417, + "grad_norm": 0.34141202599232084, + "learning_rate": 1.262344197757674e-05, + "loss": 0.8535, + "num_tokens": 32292240220.0, + "step": 7726 + }, + { + "epoch": 0.9182412358882948, + "grad_norm": 0.4062101452518002, + "learning_rate": 1.2621738392409714e-05, + "loss": 0.8051, + "num_tokens": 32296420354.0, + "step": 7727 + }, + { + "epoch": 0.9183600713012477, + "grad_norm": 0.4359721155798638, + "learning_rate": 1.2620034747184235e-05, + "loss": 0.8494, + "num_tokens": 32300588365.0, + "step": 7728 + }, + { + "epoch": 0.9184789067142008, + "grad_norm": 0.4271491886529011, + "learning_rate": 1.261833104196339e-05, + "loss": 0.8409, + "num_tokens": 32304777430.0, + "step": 7729 + }, + { + "epoch": 0.9185977421271538, + "grad_norm": 0.36603599754576904, + "learning_rate": 1.2616627276810273e-05, + "loss": 0.8249, + "num_tokens": 32308965091.0, + "step": 7730 + }, + { + "epoch": 0.9187165775401069, + "grad_norm": 0.3507460409794824, + "learning_rate": 1.2614923451787985e-05, + "loss": 0.8238, + "num_tokens": 32313154564.0, + "step": 7731 + }, + { + "epoch": 0.91883541295306, + "grad_norm": 0.3538437121465753, + "learning_rate": 1.2613219566959623e-05, + "loss": 0.8223, + "num_tokens": 32317343276.0, + "step": 7732 + }, + { + "epoch": 0.918954248366013, + "grad_norm": 0.42027506523466346, + "learning_rate": 1.2611515622388284e-05, + "loss": 0.8232, + "num_tokens": 32321531750.0, + "step": 7733 + }, + { + "epoch": 0.9190730837789661, + "grad_norm": 0.35700122949065227, + "learning_rate": 1.2609811618137071e-05, + "loss": 0.8151, + "num_tokens": 32325722308.0, + "step": 7734 + }, + { + "epoch": 0.9191919191919192, + "grad_norm": 0.30498226861282346, + "learning_rate": 1.2608107554269094e-05, + "loss": 0.888, + "num_tokens": 32329893038.0, + "step": 7735 + }, + { + "epoch": 0.9193107546048722, + "grad_norm": 0.35319971717446125, + "learning_rate": 1.2606403430847454e-05, + "loss": 0.801, + "num_tokens": 32334080876.0, + "step": 7736 + }, + { + "epoch": 0.9194295900178253, + "grad_norm": 0.42121038314169734, + "learning_rate": 1.2604699247935263e-05, + "loss": 0.8271, + "num_tokens": 32338271367.0, + "step": 7737 + }, + { + "epoch": 0.9195484254307784, + "grad_norm": 0.3531012768394158, + "learning_rate": 1.2602995005595638e-05, + "loss": 0.8106, + "num_tokens": 32342414083.0, + "step": 7738 + }, + { + "epoch": 0.9196672608437314, + "grad_norm": 0.34122934338450867, + "learning_rate": 1.2601290703891682e-05, + "loss": 0.8267, + "num_tokens": 32346602024.0, + "step": 7739 + }, + { + "epoch": 0.9197860962566845, + "grad_norm": 0.37392096672743796, + "learning_rate": 1.259958634288652e-05, + "loss": 0.8241, + "num_tokens": 32350790414.0, + "step": 7740 + }, + { + "epoch": 0.9199049316696376, + "grad_norm": 0.44165483101173514, + "learning_rate": 1.2597881922643269e-05, + "loss": 0.8478, + "num_tokens": 32354934333.0, + "step": 7741 + }, + { + "epoch": 0.9200237670825906, + "grad_norm": 0.4074933878200259, + "learning_rate": 1.2596177443225047e-05, + "loss": 0.8354, + "num_tokens": 32359099858.0, + "step": 7742 + }, + { + "epoch": 0.9201426024955437, + "grad_norm": 0.3546110634687104, + "learning_rate": 1.2594472904694977e-05, + "loss": 0.871, + "num_tokens": 32363287067.0, + "step": 7743 + }, + { + "epoch": 0.9202614379084967, + "grad_norm": 0.35439371492087757, + "learning_rate": 1.2592768307116186e-05, + "loss": 0.8471, + "num_tokens": 32367433922.0, + "step": 7744 + }, + { + "epoch": 0.9203802733214498, + "grad_norm": 0.31722196531755265, + "learning_rate": 1.2591063650551796e-05, + "loss": 0.8397, + "num_tokens": 32371620937.0, + "step": 7745 + }, + { + "epoch": 0.9204991087344029, + "grad_norm": 0.3629758900812914, + "learning_rate": 1.2589358935064941e-05, + "loss": 0.8467, + "num_tokens": 32375785572.0, + "step": 7746 + }, + { + "epoch": 0.9206179441473559, + "grad_norm": 0.33846565931003836, + "learning_rate": 1.2587654160718753e-05, + "loss": 0.8336, + "num_tokens": 32379952318.0, + "step": 7747 + }, + { + "epoch": 0.920736779560309, + "grad_norm": 0.3070884921823505, + "learning_rate": 1.2585949327576358e-05, + "loss": 0.8851, + "num_tokens": 32384140972.0, + "step": 7748 + }, + { + "epoch": 0.9208556149732621, + "grad_norm": 0.38127842886119295, + "learning_rate": 1.2584244435700903e-05, + "loss": 0.844, + "num_tokens": 32388332006.0, + "step": 7749 + }, + { + "epoch": 0.9209744503862151, + "grad_norm": 0.33619997739322627, + "learning_rate": 1.2582539485155519e-05, + "loss": 0.8405, + "num_tokens": 32392520956.0, + "step": 7750 + }, + { + "epoch": 0.9210932857991682, + "grad_norm": 0.4138830124003199, + "learning_rate": 1.2580834476003342e-05, + "loss": 0.8459, + "num_tokens": 32396709575.0, + "step": 7751 + }, + { + "epoch": 0.9212121212121213, + "grad_norm": 0.36097309906757974, + "learning_rate": 1.2579129408307525e-05, + "loss": 0.8377, + "num_tokens": 32400898597.0, + "step": 7752 + }, + { + "epoch": 0.9213309566250742, + "grad_norm": 0.3606061413556863, + "learning_rate": 1.2577424282131206e-05, + "loss": 0.8447, + "num_tokens": 32405087363.0, + "step": 7753 + }, + { + "epoch": 0.9214497920380273, + "grad_norm": 0.3865817820964968, + "learning_rate": 1.2575719097537534e-05, + "loss": 0.8493, + "num_tokens": 32409261193.0, + "step": 7754 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 0.3415400965952112, + "learning_rate": 1.2574013854589651e-05, + "loss": 0.8493, + "num_tokens": 32413450315.0, + "step": 7755 + }, + { + "epoch": 0.9216874628639334, + "grad_norm": 0.40363415631788124, + "learning_rate": 1.2572308553350716e-05, + "loss": 0.8, + "num_tokens": 32417640747.0, + "step": 7756 + }, + { + "epoch": 0.9218062982768865, + "grad_norm": 0.3665147505419871, + "learning_rate": 1.257060319388388e-05, + "loss": 0.8566, + "num_tokens": 32421828394.0, + "step": 7757 + }, + { + "epoch": 0.9219251336898395, + "grad_norm": 0.34037077170585495, + "learning_rate": 1.2568897776252295e-05, + "loss": 0.8335, + "num_tokens": 32425988141.0, + "step": 7758 + }, + { + "epoch": 0.9220439691027926, + "grad_norm": 0.38654491444204087, + "learning_rate": 1.2567192300519125e-05, + "loss": 0.8153, + "num_tokens": 32430176986.0, + "step": 7759 + }, + { + "epoch": 0.9221628045157457, + "grad_norm": 0.44626804860129793, + "learning_rate": 1.256548676674752e-05, + "loss": 0.8668, + "num_tokens": 32434364456.0, + "step": 7760 + }, + { + "epoch": 0.9222816399286987, + "grad_norm": 0.351026325843626, + "learning_rate": 1.2563781175000649e-05, + "loss": 0.8661, + "num_tokens": 32438534222.0, + "step": 7761 + }, + { + "epoch": 0.9224004753416518, + "grad_norm": 0.36566933885357256, + "learning_rate": 1.2562075525341677e-05, + "loss": 0.8119, + "num_tokens": 32442724907.0, + "step": 7762 + }, + { + "epoch": 0.9225193107546049, + "grad_norm": 0.357092564309822, + "learning_rate": 1.2560369817833764e-05, + "loss": 0.8411, + "num_tokens": 32446903977.0, + "step": 7763 + }, + { + "epoch": 0.9226381461675579, + "grad_norm": 0.384618898043033, + "learning_rate": 1.2558664052540079e-05, + "loss": 0.8638, + "num_tokens": 32451085931.0, + "step": 7764 + }, + { + "epoch": 0.922756981580511, + "grad_norm": 0.3615056587303177, + "learning_rate": 1.2556958229523796e-05, + "loss": 0.8347, + "num_tokens": 32455269256.0, + "step": 7765 + }, + { + "epoch": 0.9228758169934641, + "grad_norm": 0.3740586775241253, + "learning_rate": 1.2555252348848085e-05, + "loss": 0.8169, + "num_tokens": 32459457385.0, + "step": 7766 + }, + { + "epoch": 0.9229946524064171, + "grad_norm": 0.33271033136134337, + "learning_rate": 1.2553546410576124e-05, + "loss": 0.8033, + "num_tokens": 32463632270.0, + "step": 7767 + }, + { + "epoch": 0.9231134878193702, + "grad_norm": 0.36505187419255464, + "learning_rate": 1.2551840414771086e-05, + "loss": 0.8599, + "num_tokens": 32467821786.0, + "step": 7768 + }, + { + "epoch": 0.9232323232323232, + "grad_norm": 0.4209285135429004, + "learning_rate": 1.2550134361496147e-05, + "loss": 0.8212, + "num_tokens": 32471969494.0, + "step": 7769 + }, + { + "epoch": 0.9233511586452763, + "grad_norm": 0.3590876978373824, + "learning_rate": 1.2548428250814497e-05, + "loss": 0.8714, + "num_tokens": 32476137215.0, + "step": 7770 + }, + { + "epoch": 0.9234699940582294, + "grad_norm": 0.3980316230188345, + "learning_rate": 1.2546722082789307e-05, + "loss": 0.8344, + "num_tokens": 32480326375.0, + "step": 7771 + }, + { + "epoch": 0.9235888294711824, + "grad_norm": 0.3764469972692457, + "learning_rate": 1.2545015857483774e-05, + "loss": 0.8246, + "num_tokens": 32484508731.0, + "step": 7772 + }, + { + "epoch": 0.9237076648841355, + "grad_norm": 0.329364428671719, + "learning_rate": 1.2543309574961078e-05, + "loss": 0.858, + "num_tokens": 32488670761.0, + "step": 7773 + }, + { + "epoch": 0.9238265002970886, + "grad_norm": 0.3872068440158676, + "learning_rate": 1.2541603235284408e-05, + "loss": 0.8376, + "num_tokens": 32492842906.0, + "step": 7774 + }, + { + "epoch": 0.9239453357100416, + "grad_norm": 0.34469290872581637, + "learning_rate": 1.2539896838516963e-05, + "loss": 0.8336, + "num_tokens": 32497032949.0, + "step": 7775 + }, + { + "epoch": 0.9240641711229947, + "grad_norm": 0.4454392853707732, + "learning_rate": 1.2538190384721927e-05, + "loss": 0.8421, + "num_tokens": 32501159776.0, + "step": 7776 + }, + { + "epoch": 0.9241830065359478, + "grad_norm": 0.3853090117264582, + "learning_rate": 1.2536483873962501e-05, + "loss": 0.8334, + "num_tokens": 32505348612.0, + "step": 7777 + }, + { + "epoch": 0.9243018419489007, + "grad_norm": 0.32785105254704977, + "learning_rate": 1.2534777306301884e-05, + "loss": 0.802, + "num_tokens": 32509538012.0, + "step": 7778 + }, + { + "epoch": 0.9244206773618538, + "grad_norm": 0.4100317065173323, + "learning_rate": 1.2533070681803272e-05, + "loss": 0.8621, + "num_tokens": 32513726842.0, + "step": 7779 + }, + { + "epoch": 0.9245395127748068, + "grad_norm": 0.3538950730878341, + "learning_rate": 1.2531364000529871e-05, + "loss": 0.853, + "num_tokens": 32517915999.0, + "step": 7780 + }, + { + "epoch": 0.9246583481877599, + "grad_norm": 0.3724071893049779, + "learning_rate": 1.2529657262544883e-05, + "loss": 0.8499, + "num_tokens": 32522104604.0, + "step": 7781 + }, + { + "epoch": 0.924777183600713, + "grad_norm": 0.3645686054752375, + "learning_rate": 1.2527950467911514e-05, + "loss": 0.8589, + "num_tokens": 32526226216.0, + "step": 7782 + }, + { + "epoch": 0.924896019013666, + "grad_norm": 0.3449776263720506, + "learning_rate": 1.2526243616692974e-05, + "loss": 0.8436, + "num_tokens": 32530414493.0, + "step": 7783 + }, + { + "epoch": 0.9250148544266191, + "grad_norm": 0.40335810256603116, + "learning_rate": 1.2524536708952472e-05, + "loss": 0.8442, + "num_tokens": 32534578733.0, + "step": 7784 + }, + { + "epoch": 0.9251336898395722, + "grad_norm": 0.383114094640954, + "learning_rate": 1.2522829744753225e-05, + "loss": 0.8323, + "num_tokens": 32538768417.0, + "step": 7785 + }, + { + "epoch": 0.9252525252525252, + "grad_norm": 0.3932024384456477, + "learning_rate": 1.252112272415844e-05, + "loss": 0.8386, + "num_tokens": 32542925962.0, + "step": 7786 + }, + { + "epoch": 0.9253713606654783, + "grad_norm": 0.4135757563713567, + "learning_rate": 1.2519415647231334e-05, + "loss": 0.8267, + "num_tokens": 32547085820.0, + "step": 7787 + }, + { + "epoch": 0.9254901960784314, + "grad_norm": 0.3541473175759635, + "learning_rate": 1.2517708514035135e-05, + "loss": 0.8545, + "num_tokens": 32551266006.0, + "step": 7788 + }, + { + "epoch": 0.9256090314913844, + "grad_norm": 0.35618073782508064, + "learning_rate": 1.2516001324633056e-05, + "loss": 0.8348, + "num_tokens": 32555454553.0, + "step": 7789 + }, + { + "epoch": 0.9257278669043375, + "grad_norm": 0.33819753971887584, + "learning_rate": 1.251429407908832e-05, + "loss": 0.874, + "num_tokens": 32559643269.0, + "step": 7790 + }, + { + "epoch": 0.9258467023172906, + "grad_norm": 0.4120922501319685, + "learning_rate": 1.2512586777464157e-05, + "loss": 0.8917, + "num_tokens": 32563833489.0, + "step": 7791 + }, + { + "epoch": 0.9259655377302436, + "grad_norm": 0.3559921787862432, + "learning_rate": 1.2510879419823791e-05, + "loss": 0.8127, + "num_tokens": 32567998025.0, + "step": 7792 + }, + { + "epoch": 0.9260843731431967, + "grad_norm": 0.357734590380604, + "learning_rate": 1.2509172006230453e-05, + "loss": 0.8211, + "num_tokens": 32572187438.0, + "step": 7793 + }, + { + "epoch": 0.9262032085561497, + "grad_norm": 0.32861902776228724, + "learning_rate": 1.2507464536747372e-05, + "loss": 0.8066, + "num_tokens": 32576375610.0, + "step": 7794 + }, + { + "epoch": 0.9263220439691028, + "grad_norm": 0.3368903748903555, + "learning_rate": 1.250575701143778e-05, + "loss": 0.8213, + "num_tokens": 32580537894.0, + "step": 7795 + }, + { + "epoch": 0.9264408793820559, + "grad_norm": 0.34069744819509984, + "learning_rate": 1.2504049430364918e-05, + "loss": 0.8134, + "num_tokens": 32584726966.0, + "step": 7796 + }, + { + "epoch": 0.9265597147950089, + "grad_norm": 0.36561353356829884, + "learning_rate": 1.2502341793592018e-05, + "loss": 0.837, + "num_tokens": 32588902235.0, + "step": 7797 + }, + { + "epoch": 0.926678550207962, + "grad_norm": 0.3572316952722316, + "learning_rate": 1.2500634101182322e-05, + "loss": 0.7944, + "num_tokens": 32593091770.0, + "step": 7798 + }, + { + "epoch": 0.9267973856209151, + "grad_norm": 0.3652453650114771, + "learning_rate": 1.2498926353199072e-05, + "loss": 0.8662, + "num_tokens": 32597267677.0, + "step": 7799 + }, + { + "epoch": 0.9269162210338681, + "grad_norm": 0.3575616132248062, + "learning_rate": 1.2497218549705511e-05, + "loss": 0.8088, + "num_tokens": 32601458704.0, + "step": 7800 + }, + { + "epoch": 0.9270350564468212, + "grad_norm": 0.31731349326428204, + "learning_rate": 1.2495510690764889e-05, + "loss": 0.8228, + "num_tokens": 32605648955.0, + "step": 7801 + }, + { + "epoch": 0.9271538918597743, + "grad_norm": 0.42795978200849205, + "learning_rate": 1.2493802776440447e-05, + "loss": 0.8368, + "num_tokens": 32609837063.0, + "step": 7802 + }, + { + "epoch": 0.9272727272727272, + "grad_norm": 0.3683980552781556, + "learning_rate": 1.2492094806795434e-05, + "loss": 0.8277, + "num_tokens": 32613994744.0, + "step": 7803 + }, + { + "epoch": 0.9273915626856803, + "grad_norm": 0.3533232561848744, + "learning_rate": 1.2490386781893108e-05, + "loss": 0.8164, + "num_tokens": 32618183712.0, + "step": 7804 + }, + { + "epoch": 0.9275103980986333, + "grad_norm": 0.36297612781269967, + "learning_rate": 1.2488678701796722e-05, + "loss": 0.8441, + "num_tokens": 32622372383.0, + "step": 7805 + }, + { + "epoch": 0.9276292335115864, + "grad_norm": 0.33700480348213824, + "learning_rate": 1.2486970566569527e-05, + "loss": 0.8326, + "num_tokens": 32626559993.0, + "step": 7806 + }, + { + "epoch": 0.9277480689245395, + "grad_norm": 0.36283599359029656, + "learning_rate": 1.2485262376274786e-05, + "loss": 0.8193, + "num_tokens": 32630728171.0, + "step": 7807 + }, + { + "epoch": 0.9278669043374925, + "grad_norm": 0.39299893957760457, + "learning_rate": 1.2483554130975759e-05, + "loss": 0.8488, + "num_tokens": 32634916850.0, + "step": 7808 + }, + { + "epoch": 0.9279857397504456, + "grad_norm": 0.37250821814350366, + "learning_rate": 1.2481845830735704e-05, + "loss": 0.8071, + "num_tokens": 32639062351.0, + "step": 7809 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 0.3662061776731456, + "learning_rate": 1.248013747561789e-05, + "loss": 0.8342, + "num_tokens": 32643242524.0, + "step": 7810 + }, + { + "epoch": 0.9282234105763517, + "grad_norm": 0.40784548392329917, + "learning_rate": 1.2478429065685577e-05, + "loss": 0.8492, + "num_tokens": 32647425642.0, + "step": 7811 + }, + { + "epoch": 0.9283422459893048, + "grad_norm": 0.33952777660370276, + "learning_rate": 1.2476720601002041e-05, + "loss": 0.8426, + "num_tokens": 32651613796.0, + "step": 7812 + }, + { + "epoch": 0.9284610814022579, + "grad_norm": 0.34627570006600783, + "learning_rate": 1.2475012081630547e-05, + "loss": 0.868, + "num_tokens": 32655804495.0, + "step": 7813 + }, + { + "epoch": 0.9285799168152109, + "grad_norm": 0.4267626371998006, + "learning_rate": 1.2473303507634368e-05, + "loss": 0.8276, + "num_tokens": 32659992291.0, + "step": 7814 + }, + { + "epoch": 0.928698752228164, + "grad_norm": 0.3528506135108431, + "learning_rate": 1.2471594879076778e-05, + "loss": 0.8403, + "num_tokens": 32664159550.0, + "step": 7815 + }, + { + "epoch": 0.9288175876411171, + "grad_norm": 0.3134094484779221, + "learning_rate": 1.2469886196021055e-05, + "loss": 0.8554, + "num_tokens": 32668349169.0, + "step": 7816 + }, + { + "epoch": 0.9289364230540701, + "grad_norm": 0.4150986707918478, + "learning_rate": 1.2468177458530477e-05, + "loss": 0.8543, + "num_tokens": 32672534609.0, + "step": 7817 + }, + { + "epoch": 0.9290552584670232, + "grad_norm": 0.4364314421310403, + "learning_rate": 1.2466468666668324e-05, + "loss": 0.8267, + "num_tokens": 32676693586.0, + "step": 7818 + }, + { + "epoch": 0.9291740938799762, + "grad_norm": 0.33785487669621095, + "learning_rate": 1.2464759820497878e-05, + "loss": 0.8596, + "num_tokens": 32680883194.0, + "step": 7819 + }, + { + "epoch": 0.9292929292929293, + "grad_norm": 0.36804982473428627, + "learning_rate": 1.2463050920082423e-05, + "loss": 0.8024, + "num_tokens": 32685072633.0, + "step": 7820 + }, + { + "epoch": 0.9294117647058824, + "grad_norm": 0.4196519328878517, + "learning_rate": 1.2461341965485242e-05, + "loss": 0.8357, + "num_tokens": 32689249331.0, + "step": 7821 + }, + { + "epoch": 0.9295306001188354, + "grad_norm": 0.3458606327787502, + "learning_rate": 1.2459632956769631e-05, + "loss": 0.8697, + "num_tokens": 32693438719.0, + "step": 7822 + }, + { + "epoch": 0.9296494355317885, + "grad_norm": 0.42051118924301023, + "learning_rate": 1.2457923893998875e-05, + "loss": 0.8112, + "num_tokens": 32697628094.0, + "step": 7823 + }, + { + "epoch": 0.9297682709447416, + "grad_norm": 0.33421268475922505, + "learning_rate": 1.2456214777236268e-05, + "loss": 0.8264, + "num_tokens": 32701797528.0, + "step": 7824 + }, + { + "epoch": 0.9298871063576946, + "grad_norm": 0.386426723956796, + "learning_rate": 1.2454505606545106e-05, + "loss": 0.8459, + "num_tokens": 32705986487.0, + "step": 7825 + }, + { + "epoch": 0.9300059417706477, + "grad_norm": 0.4560712091070031, + "learning_rate": 1.2452796381988684e-05, + "loss": 0.8844, + "num_tokens": 32710175064.0, + "step": 7826 + }, + { + "epoch": 0.9301247771836008, + "grad_norm": 0.37229632777413524, + "learning_rate": 1.24510871036303e-05, + "loss": 0.8614, + "num_tokens": 32714365004.0, + "step": 7827 + }, + { + "epoch": 0.9302436125965537, + "grad_norm": 0.47060170144224517, + "learning_rate": 1.2449377771533248e-05, + "loss": 0.8243, + "num_tokens": 32718552975.0, + "step": 7828 + }, + { + "epoch": 0.9303624480095068, + "grad_norm": 0.33344951917460763, + "learning_rate": 1.2447668385760843e-05, + "loss": 0.837, + "num_tokens": 32722742949.0, + "step": 7829 + }, + { + "epoch": 0.93048128342246, + "grad_norm": 0.51913669076779, + "learning_rate": 1.2445958946376382e-05, + "loss": 0.8716, + "num_tokens": 32726930970.0, + "step": 7830 + }, + { + "epoch": 0.9306001188354129, + "grad_norm": 0.3699221498438434, + "learning_rate": 1.2444249453443172e-05, + "loss": 0.8412, + "num_tokens": 32731120106.0, + "step": 7831 + }, + { + "epoch": 0.930718954248366, + "grad_norm": 0.5252940732436284, + "learning_rate": 1.2442539907024522e-05, + "loss": 0.8158, + "num_tokens": 32735308187.0, + "step": 7832 + }, + { + "epoch": 0.930837789661319, + "grad_norm": 0.46908657657398356, + "learning_rate": 1.244083030718374e-05, + "loss": 0.8336, + "num_tokens": 32739497291.0, + "step": 7833 + }, + { + "epoch": 0.9309566250742721, + "grad_norm": 0.4172958706016612, + "learning_rate": 1.243912065398414e-05, + "loss": 0.856, + "num_tokens": 32743686691.0, + "step": 7834 + }, + { + "epoch": 0.9310754604872252, + "grad_norm": 0.412298163103864, + "learning_rate": 1.2437410947489038e-05, + "loss": 0.8483, + "num_tokens": 32747875952.0, + "step": 7835 + }, + { + "epoch": 0.9311942959001782, + "grad_norm": 0.44492825371658734, + "learning_rate": 1.2435701187761749e-05, + "loss": 0.8446, + "num_tokens": 32752021094.0, + "step": 7836 + }, + { + "epoch": 0.9313131313131313, + "grad_norm": 0.3705744904384059, + "learning_rate": 1.243399137486559e-05, + "loss": 0.8754, + "num_tokens": 32756209841.0, + "step": 7837 + }, + { + "epoch": 0.9314319667260844, + "grad_norm": 0.42163389871631307, + "learning_rate": 1.2432281508863882e-05, + "loss": 0.8055, + "num_tokens": 32760397704.0, + "step": 7838 + }, + { + "epoch": 0.9315508021390374, + "grad_norm": 0.38161422951575225, + "learning_rate": 1.243057158981995e-05, + "loss": 0.8555, + "num_tokens": 32764547659.0, + "step": 7839 + }, + { + "epoch": 0.9316696375519905, + "grad_norm": 0.4772053969976859, + "learning_rate": 1.2428861617797113e-05, + "loss": 0.8459, + "num_tokens": 32768735495.0, + "step": 7840 + }, + { + "epoch": 0.9317884729649436, + "grad_norm": 0.35095559913528707, + "learning_rate": 1.2427151592858695e-05, + "loss": 0.7865, + "num_tokens": 32772924935.0, + "step": 7841 + }, + { + "epoch": 0.9319073083778966, + "grad_norm": 0.5144144712232095, + "learning_rate": 1.2425441515068032e-05, + "loss": 0.8485, + "num_tokens": 32777098168.0, + "step": 7842 + }, + { + "epoch": 0.9320261437908497, + "grad_norm": 0.4177758483039187, + "learning_rate": 1.242373138448845e-05, + "loss": 0.8534, + "num_tokens": 32781288755.0, + "step": 7843 + }, + { + "epoch": 0.9321449792038027, + "grad_norm": 0.47649128687154235, + "learning_rate": 1.2422021201183281e-05, + "loss": 0.8608, + "num_tokens": 32785447970.0, + "step": 7844 + }, + { + "epoch": 0.9322638146167558, + "grad_norm": 0.4829320137986398, + "learning_rate": 1.242031096521586e-05, + "loss": 0.8331, + "num_tokens": 32789635302.0, + "step": 7845 + }, + { + "epoch": 0.9323826500297089, + "grad_norm": 0.38892170706868945, + "learning_rate": 1.2418600676649519e-05, + "loss": 0.812, + "num_tokens": 32793806761.0, + "step": 7846 + }, + { + "epoch": 0.9325014854426619, + "grad_norm": 0.44467227507221113, + "learning_rate": 1.24168903355476e-05, + "loss": 0.7974, + "num_tokens": 32797995972.0, + "step": 7847 + }, + { + "epoch": 0.932620320855615, + "grad_norm": 0.3817333279443348, + "learning_rate": 1.2415179941973443e-05, + "loss": 0.867, + "num_tokens": 32802184601.0, + "step": 7848 + }, + { + "epoch": 0.9327391562685681, + "grad_norm": 0.3849507063719888, + "learning_rate": 1.2413469495990387e-05, + "loss": 0.8361, + "num_tokens": 32806374259.0, + "step": 7849 + }, + { + "epoch": 0.9328579916815211, + "grad_norm": 0.39878155919308994, + "learning_rate": 1.2411758997661774e-05, + "loss": 0.8241, + "num_tokens": 32810562251.0, + "step": 7850 + }, + { + "epoch": 0.9329768270944742, + "grad_norm": 0.3824627608216024, + "learning_rate": 1.2410048447050955e-05, + "loss": 0.8304, + "num_tokens": 32814749990.0, + "step": 7851 + }, + { + "epoch": 0.9330956625074273, + "grad_norm": 0.3422398152492545, + "learning_rate": 1.2408337844221273e-05, + "loss": 0.8557, + "num_tokens": 32818937081.0, + "step": 7852 + }, + { + "epoch": 0.9332144979203802, + "grad_norm": 0.4386482124824625, + "learning_rate": 1.240662718923608e-05, + "loss": 0.8283, + "num_tokens": 32823125742.0, + "step": 7853 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.3538465380817252, + "learning_rate": 1.2404916482158727e-05, + "loss": 0.8688, + "num_tokens": 32827314840.0, + "step": 7854 + }, + { + "epoch": 0.9334521687462864, + "grad_norm": 0.475522831810853, + "learning_rate": 1.2403205723052562e-05, + "loss": 0.8195, + "num_tokens": 32831503761.0, + "step": 7855 + }, + { + "epoch": 0.9335710041592394, + "grad_norm": 0.3720973724706178, + "learning_rate": 1.2401494911980952e-05, + "loss": 0.805, + "num_tokens": 32835676069.0, + "step": 7856 + }, + { + "epoch": 0.9336898395721925, + "grad_norm": 0.4489735939139804, + "learning_rate": 1.239978404900724e-05, + "loss": 0.813, + "num_tokens": 32839866428.0, + "step": 7857 + }, + { + "epoch": 0.9338086749851455, + "grad_norm": 0.4075854565483332, + "learning_rate": 1.2398073134194793e-05, + "loss": 0.8926, + "num_tokens": 32844028713.0, + "step": 7858 + }, + { + "epoch": 0.9339275103980986, + "grad_norm": 0.3834191268789549, + "learning_rate": 1.2396362167606975e-05, + "loss": 0.8521, + "num_tokens": 32848197612.0, + "step": 7859 + }, + { + "epoch": 0.9340463458110517, + "grad_norm": 0.4013474583104661, + "learning_rate": 1.2394651149307139e-05, + "loss": 0.8088, + "num_tokens": 32852369289.0, + "step": 7860 + }, + { + "epoch": 0.9341651812240047, + "grad_norm": 0.3731658106990679, + "learning_rate": 1.2392940079358661e-05, + "loss": 0.7998, + "num_tokens": 32856558743.0, + "step": 7861 + }, + { + "epoch": 0.9342840166369578, + "grad_norm": 0.3816421518041463, + "learning_rate": 1.2391228957824902e-05, + "loss": 0.8438, + "num_tokens": 32860736187.0, + "step": 7862 + }, + { + "epoch": 0.9344028520499109, + "grad_norm": 0.4025781446276455, + "learning_rate": 1.2389517784769224e-05, + "loss": 0.8341, + "num_tokens": 32864924982.0, + "step": 7863 + }, + { + "epoch": 0.9345216874628639, + "grad_norm": 0.3693223910324257, + "learning_rate": 1.2387806560255011e-05, + "loss": 0.8459, + "num_tokens": 32869096262.0, + "step": 7864 + }, + { + "epoch": 0.934640522875817, + "grad_norm": 0.37566922191895524, + "learning_rate": 1.2386095284345628e-05, + "loss": 0.829, + "num_tokens": 32873286352.0, + "step": 7865 + }, + { + "epoch": 0.9347593582887701, + "grad_norm": 0.35341806942168974, + "learning_rate": 1.2384383957104448e-05, + "loss": 0.8143, + "num_tokens": 32877456366.0, + "step": 7866 + }, + { + "epoch": 0.9348781937017231, + "grad_norm": 0.45330918381968366, + "learning_rate": 1.2382672578594851e-05, + "loss": 0.8376, + "num_tokens": 32881646198.0, + "step": 7867 + }, + { + "epoch": 0.9349970291146762, + "grad_norm": 0.3592390857415854, + "learning_rate": 1.2380961148880214e-05, + "loss": 0.8591, + "num_tokens": 32885814079.0, + "step": 7868 + }, + { + "epoch": 0.9351158645276292, + "grad_norm": 0.38881842497050084, + "learning_rate": 1.2379249668023914e-05, + "loss": 0.8144, + "num_tokens": 32890002313.0, + "step": 7869 + }, + { + "epoch": 0.9352346999405823, + "grad_norm": 0.44158394012034885, + "learning_rate": 1.2377538136089338e-05, + "loss": 0.86, + "num_tokens": 32894193290.0, + "step": 7870 + }, + { + "epoch": 0.9353535353535354, + "grad_norm": 0.38426654631241863, + "learning_rate": 1.2375826553139864e-05, + "loss": 0.8485, + "num_tokens": 32898355824.0, + "step": 7871 + }, + { + "epoch": 0.9354723707664884, + "grad_norm": 0.3742895695131504, + "learning_rate": 1.2374114919238885e-05, + "loss": 0.8071, + "num_tokens": 32902545104.0, + "step": 7872 + }, + { + "epoch": 0.9355912061794415, + "grad_norm": 0.34953076263489974, + "learning_rate": 1.2372403234449782e-05, + "loss": 0.8282, + "num_tokens": 32906714015.0, + "step": 7873 + }, + { + "epoch": 0.9357100415923946, + "grad_norm": 0.38268490471204775, + "learning_rate": 1.237069149883595e-05, + "loss": 0.8016, + "num_tokens": 32910888113.0, + "step": 7874 + }, + { + "epoch": 0.9358288770053476, + "grad_norm": 0.44011857947680566, + "learning_rate": 1.2368979712460773e-05, + "loss": 0.8567, + "num_tokens": 32915052225.0, + "step": 7875 + }, + { + "epoch": 0.9359477124183007, + "grad_norm": 0.37872909565191415, + "learning_rate": 1.236726787538765e-05, + "loss": 0.8449, + "num_tokens": 32919240075.0, + "step": 7876 + }, + { + "epoch": 0.9360665478312538, + "grad_norm": 0.33824988588151494, + "learning_rate": 1.2365555987679976e-05, + "loss": 0.8311, + "num_tokens": 32923429473.0, + "step": 7877 + }, + { + "epoch": 0.9361853832442067, + "grad_norm": 0.4725851592561851, + "learning_rate": 1.2363844049401146e-05, + "loss": 0.8012, + "num_tokens": 32927620157.0, + "step": 7878 + }, + { + "epoch": 0.9363042186571598, + "grad_norm": 0.4178885168861803, + "learning_rate": 1.2362132060614558e-05, + "loss": 0.8166, + "num_tokens": 32931808510.0, + "step": 7879 + }, + { + "epoch": 0.936423054070113, + "grad_norm": 0.3469620730372931, + "learning_rate": 1.2360420021383617e-05, + "loss": 0.8823, + "num_tokens": 32935997493.0, + "step": 7880 + }, + { + "epoch": 0.9365418894830659, + "grad_norm": 0.3848828124883269, + "learning_rate": 1.2358707931771722e-05, + "loss": 0.8282, + "num_tokens": 32940186430.0, + "step": 7881 + }, + { + "epoch": 0.936660724896019, + "grad_norm": 0.3769212084141663, + "learning_rate": 1.2356995791842276e-05, + "loss": 0.848, + "num_tokens": 32944376236.0, + "step": 7882 + }, + { + "epoch": 0.936779560308972, + "grad_norm": 0.3682676271048527, + "learning_rate": 1.2355283601658692e-05, + "loss": 0.8596, + "num_tokens": 32948552862.0, + "step": 7883 + }, + { + "epoch": 0.9368983957219251, + "grad_norm": 0.3294959358596366, + "learning_rate": 1.2353571361284374e-05, + "loss": 0.8243, + "num_tokens": 32952731248.0, + "step": 7884 + }, + { + "epoch": 0.9370172311348782, + "grad_norm": 0.33621876240495957, + "learning_rate": 1.2351859070782727e-05, + "loss": 0.8181, + "num_tokens": 32956921739.0, + "step": 7885 + }, + { + "epoch": 0.9371360665478312, + "grad_norm": 0.31741648874078926, + "learning_rate": 1.235014673021717e-05, + "loss": 0.8302, + "num_tokens": 32961110364.0, + "step": 7886 + }, + { + "epoch": 0.9372549019607843, + "grad_norm": 0.4131010414048087, + "learning_rate": 1.2348434339651118e-05, + "loss": 0.8441, + "num_tokens": 32965257280.0, + "step": 7887 + }, + { + "epoch": 0.9373737373737374, + "grad_norm": 0.3402756992255832, + "learning_rate": 1.234672189914798e-05, + "loss": 0.8618, + "num_tokens": 32969395617.0, + "step": 7888 + }, + { + "epoch": 0.9374925727866904, + "grad_norm": 0.40579928741258664, + "learning_rate": 1.2345009408771176e-05, + "loss": 0.8359, + "num_tokens": 32973583566.0, + "step": 7889 + }, + { + "epoch": 0.9376114081996435, + "grad_norm": 0.34154269645607843, + "learning_rate": 1.2343296868584128e-05, + "loss": 0.8409, + "num_tokens": 32977761396.0, + "step": 7890 + }, + { + "epoch": 0.9377302436125966, + "grad_norm": 0.38604423950506644, + "learning_rate": 1.2341584278650257e-05, + "loss": 0.8125, + "num_tokens": 32981951520.0, + "step": 7891 + }, + { + "epoch": 0.9378490790255496, + "grad_norm": 0.33466880329780596, + "learning_rate": 1.2339871639032978e-05, + "loss": 0.8116, + "num_tokens": 32986140866.0, + "step": 7892 + }, + { + "epoch": 0.9379679144385027, + "grad_norm": 0.4637985323188642, + "learning_rate": 1.2338158949795726e-05, + "loss": 0.8209, + "num_tokens": 32990321508.0, + "step": 7893 + }, + { + "epoch": 0.9380867498514557, + "grad_norm": 0.34354024985486425, + "learning_rate": 1.2336446211001925e-05, + "loss": 0.8308, + "num_tokens": 32994510935.0, + "step": 7894 + }, + { + "epoch": 0.9382055852644088, + "grad_norm": 0.511267530767596, + "learning_rate": 1.2334733422715e-05, + "loss": 0.849, + "num_tokens": 32998685042.0, + "step": 7895 + }, + { + "epoch": 0.9383244206773619, + "grad_norm": 0.39621304164815063, + "learning_rate": 1.2333020584998385e-05, + "loss": 0.8454, + "num_tokens": 33002874737.0, + "step": 7896 + }, + { + "epoch": 0.9384432560903149, + "grad_norm": 0.44946323722354214, + "learning_rate": 1.2331307697915508e-05, + "loss": 0.7993, + "num_tokens": 33007063964.0, + "step": 7897 + }, + { + "epoch": 0.938562091503268, + "grad_norm": 0.38318567216467847, + "learning_rate": 1.2329594761529806e-05, + "loss": 0.8634, + "num_tokens": 33011253376.0, + "step": 7898 + }, + { + "epoch": 0.9386809269162211, + "grad_norm": 0.4608878146579966, + "learning_rate": 1.2327881775904715e-05, + "loss": 0.8342, + "num_tokens": 33015442825.0, + "step": 7899 + }, + { + "epoch": 0.9387997623291741, + "grad_norm": 0.41249521249262366, + "learning_rate": 1.2326168741103678e-05, + "loss": 0.8265, + "num_tokens": 33019631151.0, + "step": 7900 + }, + { + "epoch": 0.9389185977421272, + "grad_norm": 0.4320998649202333, + "learning_rate": 1.2324455657190122e-05, + "loss": 0.8472, + "num_tokens": 33023820711.0, + "step": 7901 + }, + { + "epoch": 0.9390374331550803, + "grad_norm": 0.43639091258302315, + "learning_rate": 1.2322742524227493e-05, + "loss": 0.7993, + "num_tokens": 33028011574.0, + "step": 7902 + }, + { + "epoch": 0.9391562685680332, + "grad_norm": 0.35961779481448763, + "learning_rate": 1.2321029342279245e-05, + "loss": 0.8293, + "num_tokens": 33032177091.0, + "step": 7903 + }, + { + "epoch": 0.9392751039809863, + "grad_norm": 0.47119798184560985, + "learning_rate": 1.2319316111408805e-05, + "loss": 0.8085, + "num_tokens": 33036352896.0, + "step": 7904 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 0.3912689887265081, + "learning_rate": 1.2317602831679634e-05, + "loss": 0.8378, + "num_tokens": 33040532220.0, + "step": 7905 + }, + { + "epoch": 0.9395127748068924, + "grad_norm": 0.36320136891435195, + "learning_rate": 1.2315889503155174e-05, + "loss": 0.7938, + "num_tokens": 33044707718.0, + "step": 7906 + }, + { + "epoch": 0.9396316102198455, + "grad_norm": 0.44478128085369173, + "learning_rate": 1.2314176125898873e-05, + "loss": 0.8391, + "num_tokens": 33048894432.0, + "step": 7907 + }, + { + "epoch": 0.9397504456327985, + "grad_norm": 0.3954619441885887, + "learning_rate": 1.2312462699974193e-05, + "loss": 0.8122, + "num_tokens": 33053084536.0, + "step": 7908 + }, + { + "epoch": 0.9398692810457516, + "grad_norm": 0.35685264815670137, + "learning_rate": 1.2310749225444576e-05, + "loss": 0.8338, + "num_tokens": 33057248771.0, + "step": 7909 + }, + { + "epoch": 0.9399881164587047, + "grad_norm": 0.4075814236148147, + "learning_rate": 1.2309035702373487e-05, + "loss": 0.8554, + "num_tokens": 33061436296.0, + "step": 7910 + }, + { + "epoch": 0.9401069518716577, + "grad_norm": 0.4217503571689731, + "learning_rate": 1.230732213082438e-05, + "loss": 0.8782, + "num_tokens": 33065624313.0, + "step": 7911 + }, + { + "epoch": 0.9402257872846108, + "grad_norm": 0.3539158455485565, + "learning_rate": 1.2305608510860712e-05, + "loss": 0.8326, + "num_tokens": 33069814528.0, + "step": 7912 + }, + { + "epoch": 0.9403446226975639, + "grad_norm": 0.3774310733860779, + "learning_rate": 1.2303894842545947e-05, + "loss": 0.8212, + "num_tokens": 33074004655.0, + "step": 7913 + }, + { + "epoch": 0.9404634581105169, + "grad_norm": 0.3900027154066055, + "learning_rate": 1.2302181125943549e-05, + "loss": 0.7921, + "num_tokens": 33078193777.0, + "step": 7914 + }, + { + "epoch": 0.94058229352347, + "grad_norm": 0.38982385182983154, + "learning_rate": 1.230046736111698e-05, + "loss": 0.8506, + "num_tokens": 33082380065.0, + "step": 7915 + }, + { + "epoch": 0.9407011289364231, + "grad_norm": 0.3412165323363467, + "learning_rate": 1.2298753548129707e-05, + "loss": 0.8388, + "num_tokens": 33086569214.0, + "step": 7916 + }, + { + "epoch": 0.9408199643493761, + "grad_norm": 0.3753095719344461, + "learning_rate": 1.2297039687045198e-05, + "loss": 0.812, + "num_tokens": 33090750241.0, + "step": 7917 + }, + { + "epoch": 0.9409387997623292, + "grad_norm": 0.3631704948091497, + "learning_rate": 1.2295325777926928e-05, + "loss": 0.855, + "num_tokens": 33094939741.0, + "step": 7918 + }, + { + "epoch": 0.9410576351752822, + "grad_norm": 0.32511016809384874, + "learning_rate": 1.2293611820838362e-05, + "loss": 0.8299, + "num_tokens": 33099129650.0, + "step": 7919 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.3268858996832466, + "learning_rate": 1.2291897815842977e-05, + "loss": 0.816, + "num_tokens": 33103301906.0, + "step": 7920 + }, + { + "epoch": 0.9412953060011884, + "grad_norm": 0.3448510300559979, + "learning_rate": 1.2290183763004251e-05, + "loss": 0.8309, + "num_tokens": 33107491420.0, + "step": 7921 + }, + { + "epoch": 0.9414141414141414, + "grad_norm": 0.3208679264471439, + "learning_rate": 1.2288469662385651e-05, + "loss": 0.8181, + "num_tokens": 33111674179.0, + "step": 7922 + }, + { + "epoch": 0.9415329768270945, + "grad_norm": 0.3256549972004873, + "learning_rate": 1.228675551405067e-05, + "loss": 0.8387, + "num_tokens": 33115842427.0, + "step": 7923 + }, + { + "epoch": 0.9416518122400476, + "grad_norm": 0.37001243982844795, + "learning_rate": 1.2285041318062778e-05, + "loss": 0.8248, + "num_tokens": 33120032672.0, + "step": 7924 + }, + { + "epoch": 0.9417706476530006, + "grad_norm": 0.4408592308103963, + "learning_rate": 1.228332707448546e-05, + "loss": 0.7982, + "num_tokens": 33124221995.0, + "step": 7925 + }, + { + "epoch": 0.9418894830659537, + "grad_norm": 0.37511947472853246, + "learning_rate": 1.2281612783382205e-05, + "loss": 0.8191, + "num_tokens": 33128394330.0, + "step": 7926 + }, + { + "epoch": 0.9420083184789068, + "grad_norm": 0.3591559907145654, + "learning_rate": 1.2279898444816491e-05, + "loss": 0.8017, + "num_tokens": 33132564677.0, + "step": 7927 + }, + { + "epoch": 0.9421271538918597, + "grad_norm": 0.3822940548943705, + "learning_rate": 1.2278184058851814e-05, + "loss": 0.7837, + "num_tokens": 33136755338.0, + "step": 7928 + }, + { + "epoch": 0.9422459893048128, + "grad_norm": 0.41928797973304566, + "learning_rate": 1.2276469625551659e-05, + "loss": 0.7833, + "num_tokens": 33140944994.0, + "step": 7929 + }, + { + "epoch": 0.942364824717766, + "grad_norm": 0.3532730813291959, + "learning_rate": 1.2274755144979518e-05, + "loss": 0.8427, + "num_tokens": 33145134632.0, + "step": 7930 + }, + { + "epoch": 0.9424836601307189, + "grad_norm": 0.29559299685329204, + "learning_rate": 1.2273040617198882e-05, + "loss": 0.7832, + "num_tokens": 33149324334.0, + "step": 7931 + }, + { + "epoch": 0.942602495543672, + "grad_norm": 0.344149885936526, + "learning_rate": 1.2271326042273251e-05, + "loss": 0.8313, + "num_tokens": 33153514664.0, + "step": 7932 + }, + { + "epoch": 0.942721330956625, + "grad_norm": 0.35781871877008564, + "learning_rate": 1.2269611420266114e-05, + "loss": 0.8476, + "num_tokens": 33157664702.0, + "step": 7933 + }, + { + "epoch": 0.9428401663695781, + "grad_norm": 0.33702750384923424, + "learning_rate": 1.2267896751240977e-05, + "loss": 0.8673, + "num_tokens": 33161852872.0, + "step": 7934 + }, + { + "epoch": 0.9429590017825312, + "grad_norm": 0.3409766483294766, + "learning_rate": 1.2266182035261333e-05, + "loss": 0.8597, + "num_tokens": 33166004616.0, + "step": 7935 + }, + { + "epoch": 0.9430778371954842, + "grad_norm": 0.3781312778161886, + "learning_rate": 1.226446727239069e-05, + "loss": 0.7848, + "num_tokens": 33170194314.0, + "step": 7936 + }, + { + "epoch": 0.9431966726084373, + "grad_norm": 0.34126601792871786, + "learning_rate": 1.2262752462692551e-05, + "loss": 0.8139, + "num_tokens": 33174383380.0, + "step": 7937 + }, + { + "epoch": 0.9433155080213904, + "grad_norm": 0.3670023145734416, + "learning_rate": 1.2261037606230414e-05, + "loss": 0.8382, + "num_tokens": 33178573785.0, + "step": 7938 + }, + { + "epoch": 0.9434343434343434, + "grad_norm": 0.3658693373807194, + "learning_rate": 1.225932270306779e-05, + "loss": 0.8475, + "num_tokens": 33182763203.0, + "step": 7939 + }, + { + "epoch": 0.9435531788472965, + "grad_norm": 0.3542967084280183, + "learning_rate": 1.2257607753268192e-05, + "loss": 0.8618, + "num_tokens": 33186953738.0, + "step": 7940 + }, + { + "epoch": 0.9436720142602496, + "grad_norm": 0.36099661348323403, + "learning_rate": 1.2255892756895125e-05, + "loss": 0.8063, + "num_tokens": 33191144280.0, + "step": 7941 + }, + { + "epoch": 0.9437908496732026, + "grad_norm": 0.3308086059081248, + "learning_rate": 1.2254177714012106e-05, + "loss": 0.8394, + "num_tokens": 33195334119.0, + "step": 7942 + }, + { + "epoch": 0.9439096850861557, + "grad_norm": 0.3690931115251753, + "learning_rate": 1.2252462624682642e-05, + "loss": 0.9013, + "num_tokens": 33199501842.0, + "step": 7943 + }, + { + "epoch": 0.9440285204991087, + "grad_norm": 0.3692929852810271, + "learning_rate": 1.2250747488970253e-05, + "loss": 0.8151, + "num_tokens": 33203690471.0, + "step": 7944 + }, + { + "epoch": 0.9441473559120618, + "grad_norm": 0.3667941219408378, + "learning_rate": 1.2249032306938458e-05, + "loss": 0.8386, + "num_tokens": 33207837574.0, + "step": 7945 + }, + { + "epoch": 0.9442661913250149, + "grad_norm": 0.33181455321487596, + "learning_rate": 1.2247317078650772e-05, + "loss": 0.8362, + "num_tokens": 33212026369.0, + "step": 7946 + }, + { + "epoch": 0.9443850267379679, + "grad_norm": 0.37773486806011247, + "learning_rate": 1.224560180417072e-05, + "loss": 0.8324, + "num_tokens": 33216217237.0, + "step": 7947 + }, + { + "epoch": 0.944503862150921, + "grad_norm": 0.3981552200777054, + "learning_rate": 1.2243886483561816e-05, + "loss": 0.8452, + "num_tokens": 33220405494.0, + "step": 7948 + }, + { + "epoch": 0.9446226975638741, + "grad_norm": 0.39088921030479346, + "learning_rate": 1.224217111688759e-05, + "loss": 0.8712, + "num_tokens": 33224594172.0, + "step": 7949 + }, + { + "epoch": 0.9447415329768271, + "grad_norm": 0.339579886941103, + "learning_rate": 1.2240455704211574e-05, + "loss": 0.8023, + "num_tokens": 33228784865.0, + "step": 7950 + }, + { + "epoch": 0.9448603683897802, + "grad_norm": 0.3823899747083176, + "learning_rate": 1.2238740245597284e-05, + "loss": 0.8326, + "num_tokens": 33232974997.0, + "step": 7951 + }, + { + "epoch": 0.9449792038027333, + "grad_norm": 0.36507675765169856, + "learning_rate": 1.2237024741108257e-05, + "loss": 0.8504, + "num_tokens": 33237166218.0, + "step": 7952 + }, + { + "epoch": 0.9450980392156862, + "grad_norm": 0.40916628475696676, + "learning_rate": 1.223530919080802e-05, + "loss": 0.8009, + "num_tokens": 33241328436.0, + "step": 7953 + }, + { + "epoch": 0.9452168746286393, + "grad_norm": 0.375706444349388, + "learning_rate": 1.223359359476011e-05, + "loss": 0.8472, + "num_tokens": 33245504529.0, + "step": 7954 + }, + { + "epoch": 0.9453357100415924, + "grad_norm": 0.38590837687517243, + "learning_rate": 1.2231877953028053e-05, + "loss": 0.8712, + "num_tokens": 33249692915.0, + "step": 7955 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 0.41796208794430423, + "learning_rate": 1.2230162265675394e-05, + "loss": 0.8351, + "num_tokens": 33253868562.0, + "step": 7956 + }, + { + "epoch": 0.9455733808674985, + "grad_norm": 0.4477678341441592, + "learning_rate": 1.2228446532765666e-05, + "loss": 0.8711, + "num_tokens": 33258057432.0, + "step": 7957 + }, + { + "epoch": 0.9456922162804515, + "grad_norm": 0.3341626299194725, + "learning_rate": 1.2226730754362408e-05, + "loss": 0.8395, + "num_tokens": 33262246637.0, + "step": 7958 + }, + { + "epoch": 0.9458110516934046, + "grad_norm": 0.502580124459121, + "learning_rate": 1.2225014930529166e-05, + "loss": 0.8061, + "num_tokens": 33266415686.0, + "step": 7959 + }, + { + "epoch": 0.9459298871063577, + "grad_norm": 0.3910459480993723, + "learning_rate": 1.2223299061329479e-05, + "loss": 0.8197, + "num_tokens": 33270604267.0, + "step": 7960 + }, + { + "epoch": 0.9460487225193107, + "grad_norm": 0.3322900051381508, + "learning_rate": 1.2221583146826887e-05, + "loss": 0.811, + "num_tokens": 33274768905.0, + "step": 7961 + }, + { + "epoch": 0.9461675579322638, + "grad_norm": 0.4506385067775462, + "learning_rate": 1.2219867187084942e-05, + "loss": 0.8505, + "num_tokens": 33278940522.0, + "step": 7962 + }, + { + "epoch": 0.9462863933452169, + "grad_norm": 0.4334539373340245, + "learning_rate": 1.2218151182167195e-05, + "loss": 0.8303, + "num_tokens": 33283129290.0, + "step": 7963 + }, + { + "epoch": 0.9464052287581699, + "grad_norm": 0.34571855757979464, + "learning_rate": 1.2216435132137188e-05, + "loss": 0.7986, + "num_tokens": 33287311131.0, + "step": 7964 + }, + { + "epoch": 0.946524064171123, + "grad_norm": 0.4426882151684644, + "learning_rate": 1.2214719037058476e-05, + "loss": 0.833, + "num_tokens": 33291499017.0, + "step": 7965 + }, + { + "epoch": 0.9466428995840761, + "grad_norm": 0.45158238450889715, + "learning_rate": 1.221300289699461e-05, + "loss": 0.8311, + "num_tokens": 33295662570.0, + "step": 7966 + }, + { + "epoch": 0.9467617349970291, + "grad_norm": 0.3417098523731305, + "learning_rate": 1.2211286712009147e-05, + "loss": 0.8413, + "num_tokens": 33299824027.0, + "step": 7967 + }, + { + "epoch": 0.9468805704099822, + "grad_norm": 0.41375332436047474, + "learning_rate": 1.2209570482165638e-05, + "loss": 0.826, + "num_tokens": 33304000969.0, + "step": 7968 + }, + { + "epoch": 0.9469994058229352, + "grad_norm": 0.39154976034784206, + "learning_rate": 1.2207854207527647e-05, + "loss": 0.8215, + "num_tokens": 33308190845.0, + "step": 7969 + }, + { + "epoch": 0.9471182412358883, + "grad_norm": 0.41893678969640635, + "learning_rate": 1.2206137888158734e-05, + "loss": 0.8264, + "num_tokens": 33312380444.0, + "step": 7970 + }, + { + "epoch": 0.9472370766488414, + "grad_norm": 0.33805404347888307, + "learning_rate": 1.2204421524122451e-05, + "loss": 0.855, + "num_tokens": 33316563594.0, + "step": 7971 + }, + { + "epoch": 0.9473559120617944, + "grad_norm": 0.3540966680375435, + "learning_rate": 1.220270511548237e-05, + "loss": 0.8276, + "num_tokens": 33320753571.0, + "step": 7972 + }, + { + "epoch": 0.9474747474747475, + "grad_norm": 0.4047087300178636, + "learning_rate": 1.2200988662302054e-05, + "loss": 0.8039, + "num_tokens": 33324917429.0, + "step": 7973 + }, + { + "epoch": 0.9475935828877006, + "grad_norm": 0.38713893975359204, + "learning_rate": 1.2199272164645064e-05, + "loss": 0.8172, + "num_tokens": 33329105490.0, + "step": 7974 + }, + { + "epoch": 0.9477124183006536, + "grad_norm": 0.32668836774808746, + "learning_rate": 1.2197555622574971e-05, + "loss": 0.8416, + "num_tokens": 33333294208.0, + "step": 7975 + }, + { + "epoch": 0.9478312537136067, + "grad_norm": 0.3293404438689731, + "learning_rate": 1.2195839036155347e-05, + "loss": 0.826, + "num_tokens": 33337484208.0, + "step": 7976 + }, + { + "epoch": 0.9479500891265598, + "grad_norm": 0.36987045892438025, + "learning_rate": 1.2194122405449755e-05, + "loss": 0.8332, + "num_tokens": 33341673469.0, + "step": 7977 + }, + { + "epoch": 0.9480689245395127, + "grad_norm": 0.3448270161480491, + "learning_rate": 1.2192405730521777e-05, + "loss": 0.8321, + "num_tokens": 33345864293.0, + "step": 7978 + }, + { + "epoch": 0.9481877599524658, + "grad_norm": 0.3472869675228122, + "learning_rate": 1.2190689011434984e-05, + "loss": 0.8007, + "num_tokens": 33350029332.0, + "step": 7979 + }, + { + "epoch": 0.948306595365419, + "grad_norm": 0.3921340957198878, + "learning_rate": 1.2188972248252948e-05, + "loss": 0.8161, + "num_tokens": 33354218391.0, + "step": 7980 + }, + { + "epoch": 0.9484254307783719, + "grad_norm": 0.3955973898394632, + "learning_rate": 1.2187255441039252e-05, + "loss": 0.8248, + "num_tokens": 33358392300.0, + "step": 7981 + }, + { + "epoch": 0.948544266191325, + "grad_norm": 0.32361330199446775, + "learning_rate": 1.218553858985747e-05, + "loss": 0.8092, + "num_tokens": 33362570630.0, + "step": 7982 + }, + { + "epoch": 0.948663101604278, + "grad_norm": 0.32186461020950247, + "learning_rate": 1.218382169477119e-05, + "loss": 0.8464, + "num_tokens": 33366759590.0, + "step": 7983 + }, + { + "epoch": 0.9487819370172311, + "grad_norm": 0.41471164263433286, + "learning_rate": 1.2182104755843986e-05, + "loss": 0.7946, + "num_tokens": 33370915622.0, + "step": 7984 + }, + { + "epoch": 0.9489007724301842, + "grad_norm": 0.32340341062502925, + "learning_rate": 1.2180387773139449e-05, + "loss": 0.8193, + "num_tokens": 33375105470.0, + "step": 7985 + }, + { + "epoch": 0.9490196078431372, + "grad_norm": 0.3391574317674489, + "learning_rate": 1.217867074672116e-05, + "loss": 0.7908, + "num_tokens": 33379294269.0, + "step": 7986 + }, + { + "epoch": 0.9491384432560903, + "grad_norm": 0.3468183754994801, + "learning_rate": 1.2176953676652707e-05, + "loss": 0.8746, + "num_tokens": 33383483727.0, + "step": 7987 + }, + { + "epoch": 0.9492572786690434, + "grad_norm": 0.359279696183436, + "learning_rate": 1.217523656299768e-05, + "loss": 0.8478, + "num_tokens": 33387673837.0, + "step": 7988 + }, + { + "epoch": 0.9493761140819964, + "grad_norm": 0.37419671965651047, + "learning_rate": 1.2173519405819673e-05, + "loss": 0.8274, + "num_tokens": 33391864352.0, + "step": 7989 + }, + { + "epoch": 0.9494949494949495, + "grad_norm": 0.35298176600514974, + "learning_rate": 1.2171802205182273e-05, + "loss": 0.8313, + "num_tokens": 33396054361.0, + "step": 7990 + }, + { + "epoch": 0.9496137849079026, + "grad_norm": 0.38580573432604975, + "learning_rate": 1.2170084961149073e-05, + "loss": 0.8235, + "num_tokens": 33400243646.0, + "step": 7991 + }, + { + "epoch": 0.9497326203208556, + "grad_norm": 0.29825351351492196, + "learning_rate": 1.2168367673783676e-05, + "loss": 0.8146, + "num_tokens": 33404432240.0, + "step": 7992 + }, + { + "epoch": 0.9498514557338087, + "grad_norm": 0.3742072218016743, + "learning_rate": 1.2166650343149669e-05, + "loss": 0.8308, + "num_tokens": 33408606976.0, + "step": 7993 + }, + { + "epoch": 0.9499702911467617, + "grad_norm": 0.35407178923499105, + "learning_rate": 1.216493296931066e-05, + "loss": 0.865, + "num_tokens": 33412762115.0, + "step": 7994 + }, + { + "epoch": 0.9500891265597148, + "grad_norm": 0.370209660365124, + "learning_rate": 1.2163215552330239e-05, + "loss": 0.802, + "num_tokens": 33416950348.0, + "step": 7995 + }, + { + "epoch": 0.9502079619726679, + "grad_norm": 0.4550170715048509, + "learning_rate": 1.2161498092272013e-05, + "loss": 0.8423, + "num_tokens": 33421140810.0, + "step": 7996 + }, + { + "epoch": 0.9503267973856209, + "grad_norm": 0.32471164113772716, + "learning_rate": 1.215978058919959e-05, + "loss": 0.8564, + "num_tokens": 33425329919.0, + "step": 7997 + }, + { + "epoch": 0.950445632798574, + "grad_norm": 0.4976105321003154, + "learning_rate": 1.215806304317657e-05, + "loss": 0.8558, + "num_tokens": 33429518073.0, + "step": 7998 + }, + { + "epoch": 0.9505644682115271, + "grad_norm": 0.3942992174854459, + "learning_rate": 1.2156345454266555e-05, + "loss": 0.819, + "num_tokens": 33433697450.0, + "step": 7999 + }, + { + "epoch": 0.9506833036244801, + "grad_norm": 0.42895673011716506, + "learning_rate": 1.2154627822533162e-05, + "loss": 0.8556, + "num_tokens": 33437871812.0, + "step": 8000 + }, + { + "epoch": 0.9508021390374332, + "grad_norm": 0.3512389001961724, + "learning_rate": 1.2152910148039996e-05, + "loss": 0.8435, + "num_tokens": 33442057930.0, + "step": 8001 + }, + { + "epoch": 0.9509209744503863, + "grad_norm": 0.4077375159878342, + "learning_rate": 1.215119243085067e-05, + "loss": 0.8418, + "num_tokens": 33446227385.0, + "step": 8002 + }, + { + "epoch": 0.9510398098633392, + "grad_norm": 0.3244280232939263, + "learning_rate": 1.2149474671028796e-05, + "loss": 0.8176, + "num_tokens": 33450417099.0, + "step": 8003 + }, + { + "epoch": 0.9511586452762923, + "grad_norm": 0.4233883619816518, + "learning_rate": 1.2147756868637985e-05, + "loss": 0.8192, + "num_tokens": 33454599982.0, + "step": 8004 + }, + { + "epoch": 0.9512774806892454, + "grad_norm": 0.31870651284753054, + "learning_rate": 1.2146039023741863e-05, + "loss": 0.8353, + "num_tokens": 33458752547.0, + "step": 8005 + }, + { + "epoch": 0.9513963161021984, + "grad_norm": 0.4381666250967849, + "learning_rate": 1.2144321136404035e-05, + "loss": 0.8316, + "num_tokens": 33462940010.0, + "step": 8006 + }, + { + "epoch": 0.9515151515151515, + "grad_norm": 0.35966857160059773, + "learning_rate": 1.214260320668813e-05, + "loss": 0.8218, + "num_tokens": 33467127529.0, + "step": 8007 + }, + { + "epoch": 0.9516339869281045, + "grad_norm": 0.42024984037144736, + "learning_rate": 1.2140885234657764e-05, + "loss": 0.8251, + "num_tokens": 33471317128.0, + "step": 8008 + }, + { + "epoch": 0.9517528223410576, + "grad_norm": 0.3784569267382392, + "learning_rate": 1.2139167220376557e-05, + "loss": 0.8194, + "num_tokens": 33475507311.0, + "step": 8009 + }, + { + "epoch": 0.9518716577540107, + "grad_norm": 0.40636916346999385, + "learning_rate": 1.2137449163908144e-05, + "loss": 0.8082, + "num_tokens": 33479697773.0, + "step": 8010 + }, + { + "epoch": 0.9519904931669637, + "grad_norm": 0.37522274930132093, + "learning_rate": 1.2135731065316136e-05, + "loss": 0.8245, + "num_tokens": 33483887990.0, + "step": 8011 + }, + { + "epoch": 0.9521093285799168, + "grad_norm": 0.4076604421153674, + "learning_rate": 1.213401292466417e-05, + "loss": 0.8518, + "num_tokens": 33488053990.0, + "step": 8012 + }, + { + "epoch": 0.9522281639928699, + "grad_norm": 0.4225930243725632, + "learning_rate": 1.2132294742015869e-05, + "loss": 0.8109, + "num_tokens": 33492237916.0, + "step": 8013 + }, + { + "epoch": 0.9523469994058229, + "grad_norm": 0.32227689954756134, + "learning_rate": 1.2130576517434867e-05, + "loss": 0.8537, + "num_tokens": 33496427674.0, + "step": 8014 + }, + { + "epoch": 0.952465834818776, + "grad_norm": 0.45748440388824385, + "learning_rate": 1.2128858250984794e-05, + "loss": 0.8773, + "num_tokens": 33500598636.0, + "step": 8015 + }, + { + "epoch": 0.9525846702317291, + "grad_norm": 0.3590535144861884, + "learning_rate": 1.212713994272928e-05, + "loss": 0.8128, + "num_tokens": 33504783894.0, + "step": 8016 + }, + { + "epoch": 0.9527035056446821, + "grad_norm": 0.42888250646352916, + "learning_rate": 1.2125421592731966e-05, + "loss": 0.8196, + "num_tokens": 33508935828.0, + "step": 8017 + }, + { + "epoch": 0.9528223410576352, + "grad_norm": 0.38390662843445267, + "learning_rate": 1.2123703201056487e-05, + "loss": 0.8486, + "num_tokens": 33513124509.0, + "step": 8018 + }, + { + "epoch": 0.9529411764705882, + "grad_norm": 0.41229946493894576, + "learning_rate": 1.2121984767766476e-05, + "loss": 0.8689, + "num_tokens": 33517293259.0, + "step": 8019 + }, + { + "epoch": 0.9530600118835413, + "grad_norm": 0.40447497839021546, + "learning_rate": 1.2120266292925576e-05, + "loss": 0.8212, + "num_tokens": 33521483653.0, + "step": 8020 + }, + { + "epoch": 0.9531788472964944, + "grad_norm": 0.3948088713381373, + "learning_rate": 1.2118547776597429e-05, + "loss": 0.817, + "num_tokens": 33525672551.0, + "step": 8021 + }, + { + "epoch": 0.9532976827094474, + "grad_norm": 0.33766309471642664, + "learning_rate": 1.2116829218845671e-05, + "loss": 0.8007, + "num_tokens": 33529838576.0, + "step": 8022 + }, + { + "epoch": 0.9534165181224005, + "grad_norm": 0.40864454815953993, + "learning_rate": 1.2115110619733956e-05, + "loss": 0.8251, + "num_tokens": 33534011603.0, + "step": 8023 + }, + { + "epoch": 0.9535353535353536, + "grad_norm": 0.34878897481527876, + "learning_rate": 1.2113391979325923e-05, + "loss": 0.8429, + "num_tokens": 33538201573.0, + "step": 8024 + }, + { + "epoch": 0.9536541889483066, + "grad_norm": 0.39049543254361424, + "learning_rate": 1.2111673297685222e-05, + "loss": 0.8091, + "num_tokens": 33542390361.0, + "step": 8025 + }, + { + "epoch": 0.9537730243612597, + "grad_norm": 0.3481582259903871, + "learning_rate": 1.2109954574875498e-05, + "loss": 0.821, + "num_tokens": 33546574411.0, + "step": 8026 + }, + { + "epoch": 0.9538918597742128, + "grad_norm": 0.3442617509271707, + "learning_rate": 1.2108235810960405e-05, + "loss": 0.8255, + "num_tokens": 33550764667.0, + "step": 8027 + }, + { + "epoch": 0.9540106951871657, + "grad_norm": 0.3686500193714432, + "learning_rate": 1.2106517006003592e-05, + "loss": 0.8365, + "num_tokens": 33554934314.0, + "step": 8028 + }, + { + "epoch": 0.9541295306001188, + "grad_norm": 0.41505189943914794, + "learning_rate": 1.2104798160068711e-05, + "loss": 0.7833, + "num_tokens": 33559066022.0, + "step": 8029 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 0.4213273037239, + "learning_rate": 1.2103079273219422e-05, + "loss": 0.82, + "num_tokens": 33563237753.0, + "step": 8030 + }, + { + "epoch": 0.9543672014260249, + "grad_norm": 0.35296330954975985, + "learning_rate": 1.2101360345519376e-05, + "loss": 0.845, + "num_tokens": 33567427651.0, + "step": 8031 + }, + { + "epoch": 0.954486036838978, + "grad_norm": 0.4113763146636815, + "learning_rate": 1.2099641377032231e-05, + "loss": 0.7847, + "num_tokens": 33571603009.0, + "step": 8032 + }, + { + "epoch": 0.954604872251931, + "grad_norm": 0.49351451414683667, + "learning_rate": 1.2097922367821655e-05, + "loss": 0.8071, + "num_tokens": 33575791892.0, + "step": 8033 + }, + { + "epoch": 0.9547237076648841, + "grad_norm": 0.35176145333546566, + "learning_rate": 1.2096203317951295e-05, + "loss": 0.8643, + "num_tokens": 33579981208.0, + "step": 8034 + }, + { + "epoch": 0.9548425430778372, + "grad_norm": 0.45822535833271183, + "learning_rate": 1.209448422748482e-05, + "loss": 0.8233, + "num_tokens": 33584171658.0, + "step": 8035 + }, + { + "epoch": 0.9549613784907902, + "grad_norm": 0.40508440195320783, + "learning_rate": 1.2092765096485897e-05, + "loss": 0.8205, + "num_tokens": 33588360823.0, + "step": 8036 + }, + { + "epoch": 0.9550802139037433, + "grad_norm": 0.45071661961079806, + "learning_rate": 1.2091045925018183e-05, + "loss": 0.8654, + "num_tokens": 33592507847.0, + "step": 8037 + }, + { + "epoch": 0.9551990493166964, + "grad_norm": 0.3423900399870553, + "learning_rate": 1.2089326713145354e-05, + "loss": 0.82, + "num_tokens": 33596650540.0, + "step": 8038 + }, + { + "epoch": 0.9553178847296494, + "grad_norm": 0.4968656925240591, + "learning_rate": 1.208760746093107e-05, + "loss": 0.8295, + "num_tokens": 33600840755.0, + "step": 8039 + }, + { + "epoch": 0.9554367201426025, + "grad_norm": 0.3723197751912764, + "learning_rate": 1.2085888168439008e-05, + "loss": 0.8689, + "num_tokens": 33605022563.0, + "step": 8040 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.45469947948270234, + "learning_rate": 1.2084168835732833e-05, + "loss": 0.8236, + "num_tokens": 33609210268.0, + "step": 8041 + }, + { + "epoch": 0.9556743909685086, + "grad_norm": 0.39071884062357143, + "learning_rate": 1.208244946287622e-05, + "loss": 0.8462, + "num_tokens": 33613398624.0, + "step": 8042 + }, + { + "epoch": 0.9557932263814617, + "grad_norm": 0.47453374373984236, + "learning_rate": 1.2080730049932848e-05, + "loss": 0.8555, + "num_tokens": 33617587874.0, + "step": 8043 + }, + { + "epoch": 0.9559120617944147, + "grad_norm": 0.35454860729794374, + "learning_rate": 1.2079010596966383e-05, + "loss": 0.857, + "num_tokens": 33621765876.0, + "step": 8044 + }, + { + "epoch": 0.9560308972073678, + "grad_norm": 0.5496095441679152, + "learning_rate": 1.2077291104040509e-05, + "loss": 0.8467, + "num_tokens": 33625948355.0, + "step": 8045 + }, + { + "epoch": 0.9561497326203209, + "grad_norm": 0.4356442491445977, + "learning_rate": 1.2075571571218905e-05, + "loss": 0.8301, + "num_tokens": 33630135323.0, + "step": 8046 + }, + { + "epoch": 0.9562685680332739, + "grad_norm": 0.5196971973569643, + "learning_rate": 1.2073851998565247e-05, + "loss": 0.8021, + "num_tokens": 33634323442.0, + "step": 8047 + }, + { + "epoch": 0.956387403446227, + "grad_norm": 0.5629710310773578, + "learning_rate": 1.2072132386143218e-05, + "loss": 0.8146, + "num_tokens": 33638485342.0, + "step": 8048 + }, + { + "epoch": 0.9565062388591801, + "grad_norm": 0.3893073592100301, + "learning_rate": 1.2070412734016504e-05, + "loss": 0.8069, + "num_tokens": 33642648681.0, + "step": 8049 + }, + { + "epoch": 0.9566250742721331, + "grad_norm": 0.627427505359201, + "learning_rate": 1.2068693042248786e-05, + "loss": 0.8605, + "num_tokens": 33646837924.0, + "step": 8050 + }, + { + "epoch": 0.9567439096850862, + "grad_norm": 0.4304154886015156, + "learning_rate": 1.206697331090375e-05, + "loss": 0.8728, + "num_tokens": 33651026801.0, + "step": 8051 + }, + { + "epoch": 0.9568627450980393, + "grad_norm": 0.6115591294218871, + "learning_rate": 1.2065253540045088e-05, + "loss": 0.8329, + "num_tokens": 33655217188.0, + "step": 8052 + }, + { + "epoch": 0.9569815805109922, + "grad_norm": 0.49192338942292535, + "learning_rate": 1.2063533729736485e-05, + "loss": 0.8281, + "num_tokens": 33659379689.0, + "step": 8053 + }, + { + "epoch": 0.9571004159239453, + "grad_norm": 0.5562259679977384, + "learning_rate": 1.206181388004163e-05, + "loss": 0.8356, + "num_tokens": 33663566548.0, + "step": 8054 + }, + { + "epoch": 0.9572192513368984, + "grad_norm": 0.45938473335121405, + "learning_rate": 1.2060093991024217e-05, + "loss": 0.7908, + "num_tokens": 33667756194.0, + "step": 8055 + }, + { + "epoch": 0.9573380867498514, + "grad_norm": 0.47402987478272246, + "learning_rate": 1.2058374062747942e-05, + "loss": 0.8062, + "num_tokens": 33671945445.0, + "step": 8056 + }, + { + "epoch": 0.9574569221628045, + "grad_norm": 0.47041687497025836, + "learning_rate": 1.2056654095276494e-05, + "loss": 0.8289, + "num_tokens": 33676133360.0, + "step": 8057 + }, + { + "epoch": 0.9575757575757575, + "grad_norm": 0.3611471859744453, + "learning_rate": 1.2054934088673575e-05, + "loss": 0.8263, + "num_tokens": 33680316419.0, + "step": 8058 + }, + { + "epoch": 0.9576945929887106, + "grad_norm": 0.5008579469512788, + "learning_rate": 1.2053214043002879e-05, + "loss": 0.8691, + "num_tokens": 33684506855.0, + "step": 8059 + }, + { + "epoch": 0.9578134284016637, + "grad_norm": 0.4065100430795767, + "learning_rate": 1.2051493958328105e-05, + "loss": 0.8552, + "num_tokens": 33688695881.0, + "step": 8060 + }, + { + "epoch": 0.9579322638146167, + "grad_norm": 0.42674025915068514, + "learning_rate": 1.2049773834712955e-05, + "loss": 0.8521, + "num_tokens": 33692858894.0, + "step": 8061 + }, + { + "epoch": 0.9580510992275698, + "grad_norm": 0.5057009757556253, + "learning_rate": 1.2048053672221131e-05, + "loss": 0.8761, + "num_tokens": 33697032598.0, + "step": 8062 + }, + { + "epoch": 0.9581699346405229, + "grad_norm": 0.36670588216126393, + "learning_rate": 1.2046333470916336e-05, + "loss": 0.8559, + "num_tokens": 33701215949.0, + "step": 8063 + }, + { + "epoch": 0.9582887700534759, + "grad_norm": 0.5137837787246458, + "learning_rate": 1.2044613230862274e-05, + "loss": 0.8078, + "num_tokens": 33705393300.0, + "step": 8064 + }, + { + "epoch": 0.958407605466429, + "grad_norm": 0.3767977998442062, + "learning_rate": 1.2042892952122654e-05, + "loss": 0.8568, + "num_tokens": 33709581360.0, + "step": 8065 + }, + { + "epoch": 0.9585264408793821, + "grad_norm": 0.5319258681501826, + "learning_rate": 1.2041172634761176e-05, + "loss": 0.8622, + "num_tokens": 33713769682.0, + "step": 8066 + }, + { + "epoch": 0.9586452762923351, + "grad_norm": 0.4190371426188594, + "learning_rate": 1.2039452278841563e-05, + "loss": 0.7978, + "num_tokens": 33717955122.0, + "step": 8067 + }, + { + "epoch": 0.9587641117052882, + "grad_norm": 0.5263290147919415, + "learning_rate": 1.2037731884427512e-05, + "loss": 0.8367, + "num_tokens": 33722128464.0, + "step": 8068 + }, + { + "epoch": 0.9588829471182413, + "grad_norm": 0.46860940522594263, + "learning_rate": 1.2036011451582742e-05, + "loss": 0.8279, + "num_tokens": 33726294531.0, + "step": 8069 + }, + { + "epoch": 0.9590017825311943, + "grad_norm": 0.395852189361107, + "learning_rate": 1.2034290980370968e-05, + "loss": 0.8355, + "num_tokens": 33730449174.0, + "step": 8070 + }, + { + "epoch": 0.9591206179441474, + "grad_norm": 0.5900277321638727, + "learning_rate": 1.20325704708559e-05, + "loss": 0.8412, + "num_tokens": 33734633244.0, + "step": 8071 + }, + { + "epoch": 0.9592394533571004, + "grad_norm": 0.4272081010648375, + "learning_rate": 1.2030849923101255e-05, + "loss": 0.8305, + "num_tokens": 33738821450.0, + "step": 8072 + }, + { + "epoch": 0.9593582887700535, + "grad_norm": 0.6609904661916466, + "learning_rate": 1.2029129337170752e-05, + "loss": 0.8068, + "num_tokens": 33743008959.0, + "step": 8073 + }, + { + "epoch": 0.9594771241830066, + "grad_norm": 0.5657195252292928, + "learning_rate": 1.2027408713128112e-05, + "loss": 0.8438, + "num_tokens": 33747196803.0, + "step": 8074 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 0.586717103002205, + "learning_rate": 1.2025688051037055e-05, + "loss": 0.8413, + "num_tokens": 33751341594.0, + "step": 8075 + }, + { + "epoch": 0.9597147950089127, + "grad_norm": 0.5046518418266055, + "learning_rate": 1.2023967350961298e-05, + "loss": 0.8619, + "num_tokens": 33755514034.0, + "step": 8076 + }, + { + "epoch": 0.9598336304218658, + "grad_norm": 0.6451136201002886, + "learning_rate": 1.202224661296457e-05, + "loss": 0.8207, + "num_tokens": 33759696620.0, + "step": 8077 + }, + { + "epoch": 0.9599524658348187, + "grad_norm": 0.49473686784997783, + "learning_rate": 1.2020525837110593e-05, + "loss": 0.8174, + "num_tokens": 33763886213.0, + "step": 8078 + }, + { + "epoch": 0.9600713012477718, + "grad_norm": 0.6513573135587358, + "learning_rate": 1.2018805023463092e-05, + "loss": 0.805, + "num_tokens": 33768074086.0, + "step": 8079 + }, + { + "epoch": 0.960190136660725, + "grad_norm": 0.6104683900573078, + "learning_rate": 1.2017084172085797e-05, + "loss": 0.8296, + "num_tokens": 33772264361.0, + "step": 8080 + }, + { + "epoch": 0.9603089720736779, + "grad_norm": 0.5069583333464035, + "learning_rate": 1.201536328304244e-05, + "loss": 0.829, + "num_tokens": 33776423941.0, + "step": 8081 + }, + { + "epoch": 0.960427807486631, + "grad_norm": 0.5129475157519653, + "learning_rate": 1.2013642356396742e-05, + "loss": 0.8742, + "num_tokens": 33780609850.0, + "step": 8082 + }, + { + "epoch": 0.960546642899584, + "grad_norm": 0.5146173635145824, + "learning_rate": 1.2011921392212445e-05, + "loss": 0.8485, + "num_tokens": 33784799127.0, + "step": 8083 + }, + { + "epoch": 0.9606654783125371, + "grad_norm": 0.44311855268478695, + "learning_rate": 1.2010200390553276e-05, + "loss": 0.8181, + "num_tokens": 33788964214.0, + "step": 8084 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 0.5584704534048963, + "learning_rate": 1.2008479351482968e-05, + "loss": 0.8371, + "num_tokens": 33793154489.0, + "step": 8085 + }, + { + "epoch": 0.9609031491384432, + "grad_norm": 0.4063178031232992, + "learning_rate": 1.2006758275065263e-05, + "loss": 0.821, + "num_tokens": 33797342835.0, + "step": 8086 + }, + { + "epoch": 0.9610219845513963, + "grad_norm": 0.5731935121010305, + "learning_rate": 1.2005037161363895e-05, + "loss": 0.841, + "num_tokens": 33801531425.0, + "step": 8087 + }, + { + "epoch": 0.9611408199643494, + "grad_norm": 0.43448219076224226, + "learning_rate": 1.2003316010442605e-05, + "loss": 0.8224, + "num_tokens": 33805692096.0, + "step": 8088 + }, + { + "epoch": 0.9612596553773024, + "grad_norm": 0.6706617104830905, + "learning_rate": 1.2001594822365126e-05, + "loss": 0.8142, + "num_tokens": 33809844203.0, + "step": 8089 + }, + { + "epoch": 0.9613784907902555, + "grad_norm": 0.5360833951676116, + "learning_rate": 1.1999873597195205e-05, + "loss": 0.7932, + "num_tokens": 33814032541.0, + "step": 8090 + }, + { + "epoch": 0.9614973262032086, + "grad_norm": 0.5962901674960086, + "learning_rate": 1.1998152334996587e-05, + "loss": 0.8435, + "num_tokens": 33818222383.0, + "step": 8091 + }, + { + "epoch": 0.9616161616161616, + "grad_norm": 0.5746507139615252, + "learning_rate": 1.199643103583301e-05, + "loss": 0.7984, + "num_tokens": 33822383074.0, + "step": 8092 + }, + { + "epoch": 0.9617349970291147, + "grad_norm": 0.5669578689804998, + "learning_rate": 1.1994709699768228e-05, + "loss": 0.848, + "num_tokens": 33826571060.0, + "step": 8093 + }, + { + "epoch": 0.9618538324420678, + "grad_norm": 0.524315939151954, + "learning_rate": 1.1992988326865978e-05, + "loss": 0.793, + "num_tokens": 33830734552.0, + "step": 8094 + }, + { + "epoch": 0.9619726678550208, + "grad_norm": 0.518175034647317, + "learning_rate": 1.1991266917190012e-05, + "loss": 0.8266, + "num_tokens": 33834921980.0, + "step": 8095 + }, + { + "epoch": 0.9620915032679739, + "grad_norm": 0.44891563583643135, + "learning_rate": 1.1989545470804083e-05, + "loss": 0.8587, + "num_tokens": 33839087653.0, + "step": 8096 + }, + { + "epoch": 0.9622103386809269, + "grad_norm": 0.5526757374292359, + "learning_rate": 1.1987823987771939e-05, + "loss": 0.8514, + "num_tokens": 33843276876.0, + "step": 8097 + }, + { + "epoch": 0.96232917409388, + "grad_norm": 0.49601310168023605, + "learning_rate": 1.1986102468157332e-05, + "loss": 0.8483, + "num_tokens": 33847464889.0, + "step": 8098 + }, + { + "epoch": 0.9624480095068331, + "grad_norm": 0.5127365401894594, + "learning_rate": 1.1984380912024019e-05, + "loss": 0.83, + "num_tokens": 33851652884.0, + "step": 8099 + }, + { + "epoch": 0.9625668449197861, + "grad_norm": 0.42088207967856545, + "learning_rate": 1.198265931943575e-05, + "loss": 0.8289, + "num_tokens": 33855808711.0, + "step": 8100 + }, + { + "epoch": 0.9626856803327392, + "grad_norm": 0.5230841527180834, + "learning_rate": 1.1980937690456285e-05, + "loss": 0.8092, + "num_tokens": 33859993098.0, + "step": 8101 + }, + { + "epoch": 0.9628045157456923, + "grad_norm": 0.39893560596544664, + "learning_rate": 1.1979216025149378e-05, + "loss": 0.827, + "num_tokens": 33864179703.0, + "step": 8102 + }, + { + "epoch": 0.9629233511586452, + "grad_norm": 0.573025655562228, + "learning_rate": 1.1977494323578795e-05, + "loss": 0.8211, + "num_tokens": 33868341789.0, + "step": 8103 + }, + { + "epoch": 0.9630421865715983, + "grad_norm": 0.4851170176160624, + "learning_rate": 1.1975772585808291e-05, + "loss": 0.8147, + "num_tokens": 33872515095.0, + "step": 8104 + }, + { + "epoch": 0.9631610219845514, + "grad_norm": 0.5941954836463573, + "learning_rate": 1.1974050811901628e-05, + "loss": 0.8356, + "num_tokens": 33876682577.0, + "step": 8105 + }, + { + "epoch": 0.9632798573975044, + "grad_norm": 0.5546885734068938, + "learning_rate": 1.197232900192257e-05, + "loss": 0.8499, + "num_tokens": 33880834596.0, + "step": 8106 + }, + { + "epoch": 0.9633986928104575, + "grad_norm": 0.5020811861190175, + "learning_rate": 1.1970607155934884e-05, + "loss": 0.8321, + "num_tokens": 33884929382.0, + "step": 8107 + }, + { + "epoch": 0.9635175282234105, + "grad_norm": 0.5503592179390346, + "learning_rate": 1.1968885274002328e-05, + "loss": 0.8326, + "num_tokens": 33889118662.0, + "step": 8108 + }, + { + "epoch": 0.9636363636363636, + "grad_norm": 0.4457067048990028, + "learning_rate": 1.196716335618868e-05, + "loss": 0.8375, + "num_tokens": 33893306586.0, + "step": 8109 + }, + { + "epoch": 0.9637551990493167, + "grad_norm": 0.4495160508911224, + "learning_rate": 1.19654414025577e-05, + "loss": 0.8061, + "num_tokens": 33897496470.0, + "step": 8110 + }, + { + "epoch": 0.9638740344622697, + "grad_norm": 0.4493659307938797, + "learning_rate": 1.196371941317316e-05, + "loss": 0.8085, + "num_tokens": 33901675664.0, + "step": 8111 + }, + { + "epoch": 0.9639928698752228, + "grad_norm": 0.41131835971786107, + "learning_rate": 1.1961997388098836e-05, + "loss": 0.834, + "num_tokens": 33905866351.0, + "step": 8112 + }, + { + "epoch": 0.9641117052881759, + "grad_norm": 0.5235714420022815, + "learning_rate": 1.1960275327398491e-05, + "loss": 0.7909, + "num_tokens": 33910034002.0, + "step": 8113 + }, + { + "epoch": 0.9642305407011289, + "grad_norm": 0.3940356793920035, + "learning_rate": 1.1958553231135907e-05, + "loss": 0.8503, + "num_tokens": 33914223426.0, + "step": 8114 + }, + { + "epoch": 0.964349376114082, + "grad_norm": 0.6232748511127719, + "learning_rate": 1.1956831099374854e-05, + "loss": 0.8523, + "num_tokens": 33918385946.0, + "step": 8115 + }, + { + "epoch": 0.9644682115270351, + "grad_norm": 0.5138255620351715, + "learning_rate": 1.1955108932179112e-05, + "loss": 0.8704, + "num_tokens": 33922550095.0, + "step": 8116 + }, + { + "epoch": 0.9645870469399881, + "grad_norm": 0.643616536998717, + "learning_rate": 1.1953386729612456e-05, + "loss": 0.8335, + "num_tokens": 33926729025.0, + "step": 8117 + }, + { + "epoch": 0.9647058823529412, + "grad_norm": 0.6114606013357162, + "learning_rate": 1.1951664491738666e-05, + "loss": 0.8179, + "num_tokens": 33930918133.0, + "step": 8118 + }, + { + "epoch": 0.9648247177658943, + "grad_norm": 0.4497142428593021, + "learning_rate": 1.194994221862152e-05, + "loss": 0.8028, + "num_tokens": 33935105093.0, + "step": 8119 + }, + { + "epoch": 0.9649435531788473, + "grad_norm": 0.5332842186371833, + "learning_rate": 1.1948219910324805e-05, + "loss": 0.821, + "num_tokens": 33939295122.0, + "step": 8120 + }, + { + "epoch": 0.9650623885918004, + "grad_norm": 0.39010119090902745, + "learning_rate": 1.19464975669123e-05, + "loss": 0.7998, + "num_tokens": 33943485163.0, + "step": 8121 + }, + { + "epoch": 0.9651812240047534, + "grad_norm": 0.5547142635171451, + "learning_rate": 1.194477518844779e-05, + "loss": 0.8533, + "num_tokens": 33947657501.0, + "step": 8122 + }, + { + "epoch": 0.9653000594177065, + "grad_norm": 0.3303660023005126, + "learning_rate": 1.1943052774995061e-05, + "loss": 0.8581, + "num_tokens": 33951846753.0, + "step": 8123 + }, + { + "epoch": 0.9654188948306596, + "grad_norm": 0.5553575837256226, + "learning_rate": 1.1941330326617894e-05, + "loss": 0.833, + "num_tokens": 33956003273.0, + "step": 8124 + }, + { + "epoch": 0.9655377302436126, + "grad_norm": 0.37568213401258954, + "learning_rate": 1.1939607843380091e-05, + "loss": 0.8717, + "num_tokens": 33960191400.0, + "step": 8125 + }, + { + "epoch": 0.9656565656565657, + "grad_norm": 0.5151040357913886, + "learning_rate": 1.1937885325345426e-05, + "loss": 0.8843, + "num_tokens": 33964375121.0, + "step": 8126 + }, + { + "epoch": 0.9657754010695188, + "grad_norm": 0.41852955441179884, + "learning_rate": 1.19361627725777e-05, + "loss": 0.8355, + "num_tokens": 33968536748.0, + "step": 8127 + }, + { + "epoch": 0.9658942364824717, + "grad_norm": 0.4665344631182229, + "learning_rate": 1.1934440185140699e-05, + "loss": 0.7993, + "num_tokens": 33972725877.0, + "step": 8128 + }, + { + "epoch": 0.9660130718954248, + "grad_norm": 0.4175163075129138, + "learning_rate": 1.193271756309822e-05, + "loss": 0.7896, + "num_tokens": 33976915101.0, + "step": 8129 + }, + { + "epoch": 0.966131907308378, + "grad_norm": 0.4191551947185073, + "learning_rate": 1.1930994906514056e-05, + "loss": 0.824, + "num_tokens": 33981104905.0, + "step": 8130 + }, + { + "epoch": 0.9662507427213309, + "grad_norm": 0.3930528050182967, + "learning_rate": 1.1929272215452001e-05, + "loss": 0.8354, + "num_tokens": 33985262269.0, + "step": 8131 + }, + { + "epoch": 0.966369578134284, + "grad_norm": 0.3736265976830028, + "learning_rate": 1.1927549489975856e-05, + "loss": 0.8307, + "num_tokens": 33989429880.0, + "step": 8132 + }, + { + "epoch": 0.966488413547237, + "grad_norm": 0.45242979789283505, + "learning_rate": 1.1925826730149415e-05, + "loss": 0.8366, + "num_tokens": 33993618641.0, + "step": 8133 + }, + { + "epoch": 0.9666072489601901, + "grad_norm": 0.39092623847243707, + "learning_rate": 1.1924103936036482e-05, + "loss": 0.8011, + "num_tokens": 33997791949.0, + "step": 8134 + }, + { + "epoch": 0.9667260843731432, + "grad_norm": 0.4810485672390467, + "learning_rate": 1.1922381107700855e-05, + "loss": 0.8512, + "num_tokens": 34001981174.0, + "step": 8135 + }, + { + "epoch": 0.9668449197860962, + "grad_norm": 0.36740341231916435, + "learning_rate": 1.1920658245206337e-05, + "loss": 0.7953, + "num_tokens": 34006169841.0, + "step": 8136 + }, + { + "epoch": 0.9669637551990493, + "grad_norm": 0.4604847737392968, + "learning_rate": 1.1918935348616731e-05, + "loss": 0.8088, + "num_tokens": 34010358203.0, + "step": 8137 + }, + { + "epoch": 0.9670825906120024, + "grad_norm": 0.3355074003814025, + "learning_rate": 1.1917212417995844e-05, + "loss": 0.7925, + "num_tokens": 34014548144.0, + "step": 8138 + }, + { + "epoch": 0.9672014260249554, + "grad_norm": 0.47080575907641087, + "learning_rate": 1.1915489453407478e-05, + "loss": 0.803, + "num_tokens": 34018737297.0, + "step": 8139 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 0.34098539711272935, + "learning_rate": 1.1913766454915443e-05, + "loss": 0.8367, + "num_tokens": 34022887845.0, + "step": 8140 + }, + { + "epoch": 0.9674390968508616, + "grad_norm": 0.4880115068047554, + "learning_rate": 1.1912043422583546e-05, + "loss": 0.8502, + "num_tokens": 34027077224.0, + "step": 8141 + }, + { + "epoch": 0.9675579322638146, + "grad_norm": 0.37368409211432935, + "learning_rate": 1.19103203564756e-05, + "loss": 0.8181, + "num_tokens": 34031265006.0, + "step": 8142 + }, + { + "epoch": 0.9676767676767677, + "grad_norm": 0.549309003227723, + "learning_rate": 1.190859725665541e-05, + "loss": 0.8018, + "num_tokens": 34035454809.0, + "step": 8143 + }, + { + "epoch": 0.9677956030897208, + "grad_norm": 0.43162715363742693, + "learning_rate": 1.1906874123186792e-05, + "loss": 0.8673, + "num_tokens": 34039644259.0, + "step": 8144 + }, + { + "epoch": 0.9679144385026738, + "grad_norm": 0.5555070485755454, + "learning_rate": 1.1905150956133558e-05, + "loss": 0.8344, + "num_tokens": 34043790213.0, + "step": 8145 + }, + { + "epoch": 0.9680332739156269, + "grad_norm": 0.4805602057526261, + "learning_rate": 1.1903427755559524e-05, + "loss": 0.8391, + "num_tokens": 34047979011.0, + "step": 8146 + }, + { + "epoch": 0.9681521093285799, + "grad_norm": 0.5309724282399282, + "learning_rate": 1.190170452152851e-05, + "loss": 0.8369, + "num_tokens": 34052167767.0, + "step": 8147 + }, + { + "epoch": 0.968270944741533, + "grad_norm": 0.4735512431556424, + "learning_rate": 1.1899981254104327e-05, + "loss": 0.8394, + "num_tokens": 34056349403.0, + "step": 8148 + }, + { + "epoch": 0.9683897801544861, + "grad_norm": 0.4859050135814133, + "learning_rate": 1.1898257953350791e-05, + "loss": 0.8316, + "num_tokens": 34060512004.0, + "step": 8149 + }, + { + "epoch": 0.9685086155674391, + "grad_norm": 0.46734520694105586, + "learning_rate": 1.1896534619331728e-05, + "loss": 0.7952, + "num_tokens": 34064701468.0, + "step": 8150 + }, + { + "epoch": 0.9686274509803922, + "grad_norm": 0.3934990873195984, + "learning_rate": 1.1894811252110961e-05, + "loss": 0.8419, + "num_tokens": 34068866830.0, + "step": 8151 + }, + { + "epoch": 0.9687462863933453, + "grad_norm": 0.43861450864616347, + "learning_rate": 1.18930878517523e-05, + "loss": 0.8174, + "num_tokens": 34073056937.0, + "step": 8152 + }, + { + "epoch": 0.9688651218062982, + "grad_norm": 0.37163149610277874, + "learning_rate": 1.1891364418319583e-05, + "loss": 0.8246, + "num_tokens": 34077234454.0, + "step": 8153 + }, + { + "epoch": 0.9689839572192513, + "grad_norm": 0.4236564976640863, + "learning_rate": 1.1889640951876629e-05, + "loss": 0.8463, + "num_tokens": 34081423858.0, + "step": 8154 + }, + { + "epoch": 0.9691027926322044, + "grad_norm": 0.347378306107463, + "learning_rate": 1.1887917452487257e-05, + "loss": 0.8175, + "num_tokens": 34085611477.0, + "step": 8155 + }, + { + "epoch": 0.9692216280451574, + "grad_norm": 0.3864736751712335, + "learning_rate": 1.18861939202153e-05, + "loss": 0.847, + "num_tokens": 34089754676.0, + "step": 8156 + }, + { + "epoch": 0.9693404634581105, + "grad_norm": 0.3559740255877717, + "learning_rate": 1.1884470355124591e-05, + "loss": 0.8215, + "num_tokens": 34093945166.0, + "step": 8157 + }, + { + "epoch": 0.9694592988710635, + "grad_norm": 0.35796584572962886, + "learning_rate": 1.1882746757278949e-05, + "loss": 0.8395, + "num_tokens": 34098134920.0, + "step": 8158 + }, + { + "epoch": 0.9695781342840166, + "grad_norm": 0.33067062035536693, + "learning_rate": 1.1881023126742212e-05, + "loss": 0.8433, + "num_tokens": 34102292947.0, + "step": 8159 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.35038195204946154, + "learning_rate": 1.1879299463578209e-05, + "loss": 0.8189, + "num_tokens": 34106483148.0, + "step": 8160 + }, + { + "epoch": 0.9698158051099227, + "grad_norm": 0.3212059582697599, + "learning_rate": 1.1877575767850775e-05, + "loss": 0.8024, + "num_tokens": 34110666587.0, + "step": 8161 + }, + { + "epoch": 0.9699346405228758, + "grad_norm": 0.37289001030279956, + "learning_rate": 1.1875852039623743e-05, + "loss": 0.8422, + "num_tokens": 34114855236.0, + "step": 8162 + }, + { + "epoch": 0.9700534759358289, + "grad_norm": 0.3499020151370552, + "learning_rate": 1.187412827896095e-05, + "loss": 0.8611, + "num_tokens": 34119043330.0, + "step": 8163 + }, + { + "epoch": 0.9701723113487819, + "grad_norm": 0.3876808983050524, + "learning_rate": 1.187240448592623e-05, + "loss": 0.872, + "num_tokens": 34123221149.0, + "step": 8164 + }, + { + "epoch": 0.970291146761735, + "grad_norm": 0.30220708298138016, + "learning_rate": 1.187068066058342e-05, + "loss": 0.8046, + "num_tokens": 34127411430.0, + "step": 8165 + }, + { + "epoch": 0.9704099821746881, + "grad_norm": 0.36414497673794155, + "learning_rate": 1.1868956802996366e-05, + "loss": 0.8463, + "num_tokens": 34131600837.0, + "step": 8166 + }, + { + "epoch": 0.9705288175876411, + "grad_norm": 0.3164219117548662, + "learning_rate": 1.1867232913228899e-05, + "loss": 0.8609, + "num_tokens": 34135788128.0, + "step": 8167 + }, + { + "epoch": 0.9706476530005942, + "grad_norm": 0.3306748945003085, + "learning_rate": 1.1865508991344868e-05, + "loss": 0.8097, + "num_tokens": 34139978417.0, + "step": 8168 + }, + { + "epoch": 0.9707664884135473, + "grad_norm": 0.33967307649292444, + "learning_rate": 1.1863785037408111e-05, + "loss": 0.8259, + "num_tokens": 34144165228.0, + "step": 8169 + }, + { + "epoch": 0.9708853238265003, + "grad_norm": 0.35588410501098566, + "learning_rate": 1.1862061051482474e-05, + "loss": 0.8388, + "num_tokens": 34148329210.0, + "step": 8170 + }, + { + "epoch": 0.9710041592394534, + "grad_norm": 0.34085499231841154, + "learning_rate": 1.18603370336318e-05, + "loss": 0.8653, + "num_tokens": 34152517755.0, + "step": 8171 + }, + { + "epoch": 0.9711229946524064, + "grad_norm": 0.36132895168267326, + "learning_rate": 1.185861298391994e-05, + "loss": 0.8629, + "num_tokens": 34156678992.0, + "step": 8172 + }, + { + "epoch": 0.9712418300653595, + "grad_norm": 0.3347899268644041, + "learning_rate": 1.1856888902410737e-05, + "loss": 0.8277, + "num_tokens": 34160868171.0, + "step": 8173 + }, + { + "epoch": 0.9713606654783126, + "grad_norm": 0.33309834376805253, + "learning_rate": 1.1855164789168042e-05, + "loss": 0.8483, + "num_tokens": 34165051613.0, + "step": 8174 + }, + { + "epoch": 0.9714795008912656, + "grad_norm": 0.39523493851861097, + "learning_rate": 1.1853440644255701e-05, + "loss": 0.8541, + "num_tokens": 34169240408.0, + "step": 8175 + }, + { + "epoch": 0.9715983363042187, + "grad_norm": 0.3294956294925692, + "learning_rate": 1.1851716467737566e-05, + "loss": 0.8246, + "num_tokens": 34173429222.0, + "step": 8176 + }, + { + "epoch": 0.9717171717171718, + "grad_norm": 0.3598754015180611, + "learning_rate": 1.1849992259677496e-05, + "loss": 0.832, + "num_tokens": 34177570160.0, + "step": 8177 + }, + { + "epoch": 0.9718360071301247, + "grad_norm": 0.3342463852243895, + "learning_rate": 1.1848268020139334e-05, + "loss": 0.8045, + "num_tokens": 34181758405.0, + "step": 8178 + }, + { + "epoch": 0.9719548425430778, + "grad_norm": 0.30626320475781105, + "learning_rate": 1.1846543749186939e-05, + "loss": 0.7979, + "num_tokens": 34185943909.0, + "step": 8179 + }, + { + "epoch": 0.972073677956031, + "grad_norm": 0.35720749264359963, + "learning_rate": 1.184481944688417e-05, + "loss": 0.789, + "num_tokens": 34190096456.0, + "step": 8180 + }, + { + "epoch": 0.9721925133689839, + "grad_norm": 0.32488740311426745, + "learning_rate": 1.1843095113294876e-05, + "loss": 0.8383, + "num_tokens": 34194286727.0, + "step": 8181 + }, + { + "epoch": 0.972311348781937, + "grad_norm": 0.31725939528180624, + "learning_rate": 1.1841370748482925e-05, + "loss": 0.8495, + "num_tokens": 34198450521.0, + "step": 8182 + }, + { + "epoch": 0.97243018419489, + "grad_norm": 0.3584989052577028, + "learning_rate": 1.1839646352512168e-05, + "loss": 0.8592, + "num_tokens": 34202635377.0, + "step": 8183 + }, + { + "epoch": 0.9725490196078431, + "grad_norm": 0.3562168272661363, + "learning_rate": 1.1837921925446465e-05, + "loss": 0.8277, + "num_tokens": 34206820555.0, + "step": 8184 + }, + { + "epoch": 0.9726678550207962, + "grad_norm": 0.3557079390550921, + "learning_rate": 1.1836197467349684e-05, + "loss": 0.8105, + "num_tokens": 34211009504.0, + "step": 8185 + }, + { + "epoch": 0.9727866904337492, + "grad_norm": 0.30758605862664984, + "learning_rate": 1.1834472978285683e-05, + "loss": 0.8161, + "num_tokens": 34215180620.0, + "step": 8186 + }, + { + "epoch": 0.9729055258467023, + "grad_norm": 0.34030220288983576, + "learning_rate": 1.1832748458318324e-05, + "loss": 0.8731, + "num_tokens": 34219349328.0, + "step": 8187 + }, + { + "epoch": 0.9730243612596554, + "grad_norm": 0.29441080457936675, + "learning_rate": 1.1831023907511474e-05, + "loss": 0.808, + "num_tokens": 34223525596.0, + "step": 8188 + }, + { + "epoch": 0.9731431966726084, + "grad_norm": 0.3277306530529116, + "learning_rate": 1.1829299325929002e-05, + "loss": 0.8361, + "num_tokens": 34227713333.0, + "step": 8189 + }, + { + "epoch": 0.9732620320855615, + "grad_norm": 0.33293522185580643, + "learning_rate": 1.182757471363477e-05, + "loss": 0.8315, + "num_tokens": 34231900387.0, + "step": 8190 + }, + { + "epoch": 0.9733808674985146, + "grad_norm": 0.3242304586821143, + "learning_rate": 1.1825850070692648e-05, + "loss": 0.8691, + "num_tokens": 34236069481.0, + "step": 8191 + }, + { + "epoch": 0.9734997029114676, + "grad_norm": 0.3890832709585745, + "learning_rate": 1.1824125397166504e-05, + "loss": 0.8384, + "num_tokens": 34240258295.0, + "step": 8192 + }, + { + "epoch": 0.9736185383244207, + "grad_norm": 0.3455132528121144, + "learning_rate": 1.1822400693120216e-05, + "loss": 0.8301, + "num_tokens": 34244448630.0, + "step": 8193 + }, + { + "epoch": 0.9737373737373738, + "grad_norm": 0.35814199987936113, + "learning_rate": 1.1820675958617642e-05, + "loss": 0.7822, + "num_tokens": 34248634976.0, + "step": 8194 + }, + { + "epoch": 0.9738562091503268, + "grad_norm": 0.35820460631300105, + "learning_rate": 1.1818951193722668e-05, + "loss": 0.8227, + "num_tokens": 34252788068.0, + "step": 8195 + }, + { + "epoch": 0.9739750445632799, + "grad_norm": 0.3557370048097814, + "learning_rate": 1.181722639849916e-05, + "loss": 0.8794, + "num_tokens": 34256977184.0, + "step": 8196 + }, + { + "epoch": 0.9740938799762329, + "grad_norm": 0.3736509039535076, + "learning_rate": 1.1815501573010994e-05, + "loss": 0.8021, + "num_tokens": 34261166877.0, + "step": 8197 + }, + { + "epoch": 0.974212715389186, + "grad_norm": 0.3414134349345755, + "learning_rate": 1.1813776717322052e-05, + "loss": 0.798, + "num_tokens": 34265344936.0, + "step": 8198 + }, + { + "epoch": 0.9743315508021391, + "grad_norm": 0.4302725236977016, + "learning_rate": 1.1812051831496206e-05, + "loss": 0.8543, + "num_tokens": 34269534663.0, + "step": 8199 + }, + { + "epoch": 0.9744503862150921, + "grad_norm": 0.3458686135165955, + "learning_rate": 1.181032691559733e-05, + "loss": 0.813, + "num_tokens": 34273699556.0, + "step": 8200 + }, + { + "epoch": 0.9745692216280452, + "grad_norm": 0.436178032158504, + "learning_rate": 1.1808601969689313e-05, + "loss": 0.8586, + "num_tokens": 34277889322.0, + "step": 8201 + }, + { + "epoch": 0.9746880570409983, + "grad_norm": 0.38389934257676933, + "learning_rate": 1.180687699383603e-05, + "loss": 0.866, + "num_tokens": 34282064406.0, + "step": 8202 + }, + { + "epoch": 0.9748068924539512, + "grad_norm": 0.3872029901922641, + "learning_rate": 1.1805151988101363e-05, + "loss": 0.8719, + "num_tokens": 34286254215.0, + "step": 8203 + }, + { + "epoch": 0.9749257278669043, + "grad_norm": 0.3212019963266263, + "learning_rate": 1.1803426952549196e-05, + "loss": 0.7927, + "num_tokens": 34290441241.0, + "step": 8204 + }, + { + "epoch": 0.9750445632798574, + "grad_norm": 0.3376172240939519, + "learning_rate": 1.1801701887243412e-05, + "loss": 0.85, + "num_tokens": 34294630328.0, + "step": 8205 + }, + { + "epoch": 0.9751633986928104, + "grad_norm": 0.42196223422534584, + "learning_rate": 1.1799976792247898e-05, + "loss": 0.8334, + "num_tokens": 34298820022.0, + "step": 8206 + }, + { + "epoch": 0.9752822341057635, + "grad_norm": 0.3568561128798355, + "learning_rate": 1.1798251667626537e-05, + "loss": 0.8487, + "num_tokens": 34303007734.0, + "step": 8207 + }, + { + "epoch": 0.9754010695187165, + "grad_norm": 0.32336019387958653, + "learning_rate": 1.179652651344322e-05, + "loss": 0.8386, + "num_tokens": 34307195315.0, + "step": 8208 + }, + { + "epoch": 0.9755199049316696, + "grad_norm": 0.40398849236650136, + "learning_rate": 1.1794801329761834e-05, + "loss": 0.8334, + "num_tokens": 34311379241.0, + "step": 8209 + }, + { + "epoch": 0.9756387403446227, + "grad_norm": 0.3632106872236428, + "learning_rate": 1.179307611664626e-05, + "loss": 0.8007, + "num_tokens": 34315545098.0, + "step": 8210 + }, + { + "epoch": 0.9757575757575757, + "grad_norm": 0.37745558045045496, + "learning_rate": 1.1791350874160404e-05, + "loss": 0.8483, + "num_tokens": 34319734050.0, + "step": 8211 + }, + { + "epoch": 0.9758764111705288, + "grad_norm": 0.33599911043686137, + "learning_rate": 1.178962560236815e-05, + "loss": 0.8663, + "num_tokens": 34323864597.0, + "step": 8212 + }, + { + "epoch": 0.9759952465834819, + "grad_norm": 0.330966532570965, + "learning_rate": 1.1787900301333383e-05, + "loss": 0.8294, + "num_tokens": 34328054068.0, + "step": 8213 + }, + { + "epoch": 0.9761140819964349, + "grad_norm": 0.40419944368408, + "learning_rate": 1.1786174971120007e-05, + "loss": 0.8213, + "num_tokens": 34332242650.0, + "step": 8214 + }, + { + "epoch": 0.976232917409388, + "grad_norm": 0.4118923768714439, + "learning_rate": 1.1784449611791916e-05, + "loss": 0.867, + "num_tokens": 34336431165.0, + "step": 8215 + }, + { + "epoch": 0.9763517528223411, + "grad_norm": 0.3240888536756057, + "learning_rate": 1.1782724223413e-05, + "loss": 0.8525, + "num_tokens": 34340608902.0, + "step": 8216 + }, + { + "epoch": 0.9764705882352941, + "grad_norm": 0.3079843789260613, + "learning_rate": 1.1780998806047163e-05, + "loss": 0.8283, + "num_tokens": 34344798122.0, + "step": 8217 + }, + { + "epoch": 0.9765894236482472, + "grad_norm": 0.3559796974390423, + "learning_rate": 1.1779273359758298e-05, + "loss": 0.8004, + "num_tokens": 34348968565.0, + "step": 8218 + }, + { + "epoch": 0.9767082590612003, + "grad_norm": 0.3871398998122234, + "learning_rate": 1.1777547884610306e-05, + "loss": 0.8302, + "num_tokens": 34353135074.0, + "step": 8219 + }, + { + "epoch": 0.9768270944741533, + "grad_norm": 0.3256577524412644, + "learning_rate": 1.1775822380667086e-05, + "loss": 0.8544, + "num_tokens": 34357304244.0, + "step": 8220 + }, + { + "epoch": 0.9769459298871064, + "grad_norm": 0.3362139696666883, + "learning_rate": 1.1774096847992543e-05, + "loss": 0.7927, + "num_tokens": 34361483376.0, + "step": 8221 + }, + { + "epoch": 0.9770647653000594, + "grad_norm": 0.31793283331963706, + "learning_rate": 1.1772371286650573e-05, + "loss": 0.836, + "num_tokens": 34365671499.0, + "step": 8222 + }, + { + "epoch": 0.9771836007130125, + "grad_norm": 0.3165094692489024, + "learning_rate": 1.177064569670508e-05, + "loss": 0.8335, + "num_tokens": 34369859290.0, + "step": 8223 + }, + { + "epoch": 0.9773024361259656, + "grad_norm": 0.3606889485821392, + "learning_rate": 1.1768920078219979e-05, + "loss": 0.8215, + "num_tokens": 34374003711.0, + "step": 8224 + }, + { + "epoch": 0.9774212715389186, + "grad_norm": 0.34801420718754283, + "learning_rate": 1.1767194431259163e-05, + "loss": 0.8548, + "num_tokens": 34378193312.0, + "step": 8225 + }, + { + "epoch": 0.9775401069518717, + "grad_norm": 0.37771511234070027, + "learning_rate": 1.1765468755886546e-05, + "loss": 0.7994, + "num_tokens": 34382382232.0, + "step": 8226 + }, + { + "epoch": 0.9776589423648248, + "grad_norm": 0.3792715919540947, + "learning_rate": 1.176374305216603e-05, + "loss": 0.7819, + "num_tokens": 34386571560.0, + "step": 8227 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.32503188020115303, + "learning_rate": 1.1762017320161527e-05, + "loss": 0.818, + "num_tokens": 34390760001.0, + "step": 8228 + }, + { + "epoch": 0.9778966131907308, + "grad_norm": 0.2998633441443416, + "learning_rate": 1.1760291559936947e-05, + "loss": 0.8149, + "num_tokens": 34394949343.0, + "step": 8229 + }, + { + "epoch": 0.978015448603684, + "grad_norm": 0.38029189791458334, + "learning_rate": 1.17585657715562e-05, + "loss": 0.8096, + "num_tokens": 34399139334.0, + "step": 8230 + }, + { + "epoch": 0.9781342840166369, + "grad_norm": 0.39340882392008925, + "learning_rate": 1.1756839955083196e-05, + "loss": 0.8013, + "num_tokens": 34403327962.0, + "step": 8231 + }, + { + "epoch": 0.97825311942959, + "grad_norm": 0.39080751583670004, + "learning_rate": 1.1755114110581854e-05, + "loss": 0.8085, + "num_tokens": 34407518160.0, + "step": 8232 + }, + { + "epoch": 0.978371954842543, + "grad_norm": 0.3344934298322289, + "learning_rate": 1.175338823811608e-05, + "loss": 0.8161, + "num_tokens": 34411706227.0, + "step": 8233 + }, + { + "epoch": 0.9784907902554961, + "grad_norm": 0.2925108523884843, + "learning_rate": 1.1751662337749792e-05, + "loss": 0.8341, + "num_tokens": 34415894345.0, + "step": 8234 + }, + { + "epoch": 0.9786096256684492, + "grad_norm": 0.34436670665052194, + "learning_rate": 1.1749936409546907e-05, + "loss": 0.8508, + "num_tokens": 34420064220.0, + "step": 8235 + }, + { + "epoch": 0.9787284610814022, + "grad_norm": 0.2987436636286244, + "learning_rate": 1.174821045357134e-05, + "loss": 0.8504, + "num_tokens": 34424236312.0, + "step": 8236 + }, + { + "epoch": 0.9788472964943553, + "grad_norm": 0.36727596015677133, + "learning_rate": 1.1746484469887014e-05, + "loss": 0.8243, + "num_tokens": 34428406860.0, + "step": 8237 + }, + { + "epoch": 0.9789661319073084, + "grad_norm": 0.3536616926600274, + "learning_rate": 1.1744758458557839e-05, + "loss": 0.8, + "num_tokens": 34432597074.0, + "step": 8238 + }, + { + "epoch": 0.9790849673202614, + "grad_norm": 0.31233422575245623, + "learning_rate": 1.1743032419647744e-05, + "loss": 0.8366, + "num_tokens": 34436786335.0, + "step": 8239 + }, + { + "epoch": 0.9792038027332145, + "grad_norm": 0.37465849327704276, + "learning_rate": 1.1741306353220646e-05, + "loss": 0.8393, + "num_tokens": 34440957990.0, + "step": 8240 + }, + { + "epoch": 0.9793226381461676, + "grad_norm": 0.3235193017331915, + "learning_rate": 1.1739580259340464e-05, + "loss": 0.8383, + "num_tokens": 34445146458.0, + "step": 8241 + }, + { + "epoch": 0.9794414735591206, + "grad_norm": 0.3361971056569502, + "learning_rate": 1.1737854138071128e-05, + "loss": 0.765, + "num_tokens": 34449305713.0, + "step": 8242 + }, + { + "epoch": 0.9795603089720737, + "grad_norm": 0.3646854720950434, + "learning_rate": 1.1736127989476555e-05, + "loss": 0.8568, + "num_tokens": 34453476279.0, + "step": 8243 + }, + { + "epoch": 0.9796791443850268, + "grad_norm": 0.3159603548276014, + "learning_rate": 1.1734401813620676e-05, + "loss": 0.8054, + "num_tokens": 34457644205.0, + "step": 8244 + }, + { + "epoch": 0.9797979797979798, + "grad_norm": 0.3435096287549167, + "learning_rate": 1.1732675610567417e-05, + "loss": 0.8545, + "num_tokens": 34461832956.0, + "step": 8245 + }, + { + "epoch": 0.9799168152109329, + "grad_norm": 0.3373824605139079, + "learning_rate": 1.1730949380380698e-05, + "loss": 0.8542, + "num_tokens": 34466023456.0, + "step": 8246 + }, + { + "epoch": 0.9800356506238859, + "grad_norm": 0.29325382703288067, + "learning_rate": 1.1729223123124456e-05, + "loss": 0.8471, + "num_tokens": 34470194623.0, + "step": 8247 + }, + { + "epoch": 0.980154486036839, + "grad_norm": 0.35775158633679877, + "learning_rate": 1.1727496838862613e-05, + "loss": 0.8261, + "num_tokens": 34474378287.0, + "step": 8248 + }, + { + "epoch": 0.9802733214497921, + "grad_norm": 0.3327076694533363, + "learning_rate": 1.1725770527659103e-05, + "loss": 0.8417, + "num_tokens": 34478568016.0, + "step": 8249 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.30218871165359157, + "learning_rate": 1.1724044189577861e-05, + "loss": 0.8289, + "num_tokens": 34482756887.0, + "step": 8250 + }, + { + "epoch": 0.9805109922756982, + "grad_norm": 0.30198176729764803, + "learning_rate": 1.172231782468281e-05, + "loss": 0.7933, + "num_tokens": 34486947714.0, + "step": 8251 + }, + { + "epoch": 0.9806298276886513, + "grad_norm": 0.3716380275818165, + "learning_rate": 1.1720591433037886e-05, + "loss": 0.8396, + "num_tokens": 34491110047.0, + "step": 8252 + }, + { + "epoch": 0.9807486631016042, + "grad_norm": 0.3778450666894976, + "learning_rate": 1.1718865014707031e-05, + "loss": 0.8606, + "num_tokens": 34495288496.0, + "step": 8253 + }, + { + "epoch": 0.9808674985145573, + "grad_norm": 0.33431904753459585, + "learning_rate": 1.1717138569754168e-05, + "loss": 0.8322, + "num_tokens": 34499478783.0, + "step": 8254 + }, + { + "epoch": 0.9809863339275104, + "grad_norm": 0.36618651754960424, + "learning_rate": 1.1715412098243244e-05, + "loss": 0.834, + "num_tokens": 34503668420.0, + "step": 8255 + }, + { + "epoch": 0.9811051693404634, + "grad_norm": 0.36705367372266917, + "learning_rate": 1.1713685600238187e-05, + "loss": 0.8414, + "num_tokens": 34507859011.0, + "step": 8256 + }, + { + "epoch": 0.9812240047534165, + "grad_norm": 0.36055719217577187, + "learning_rate": 1.171195907580294e-05, + "loss": 0.879, + "num_tokens": 34511993994.0, + "step": 8257 + }, + { + "epoch": 0.9813428401663695, + "grad_norm": 0.3132132536882948, + "learning_rate": 1.1710232525001444e-05, + "loss": 0.8168, + "num_tokens": 34516182230.0, + "step": 8258 + }, + { + "epoch": 0.9814616755793226, + "grad_norm": 0.31242089547807733, + "learning_rate": 1.1708505947897632e-05, + "loss": 0.824, + "num_tokens": 34520370757.0, + "step": 8259 + }, + { + "epoch": 0.9815805109922757, + "grad_norm": 0.29268391382581255, + "learning_rate": 1.1706779344555453e-05, + "loss": 0.8206, + "num_tokens": 34524539766.0, + "step": 8260 + }, + { + "epoch": 0.9816993464052287, + "grad_norm": 0.3423736852257816, + "learning_rate": 1.1705052715038842e-05, + "loss": 0.8299, + "num_tokens": 34528691148.0, + "step": 8261 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 0.346970146347674, + "learning_rate": 1.1703326059411748e-05, + "loss": 0.8561, + "num_tokens": 34532878503.0, + "step": 8262 + }, + { + "epoch": 0.9819370172311349, + "grad_norm": 0.37424658524936716, + "learning_rate": 1.1701599377738113e-05, + "loss": 0.8219, + "num_tokens": 34537066327.0, + "step": 8263 + }, + { + "epoch": 0.9820558526440879, + "grad_norm": 0.38594448741681714, + "learning_rate": 1.1699872670081877e-05, + "loss": 0.8523, + "num_tokens": 34541255581.0, + "step": 8264 + }, + { + "epoch": 0.982174688057041, + "grad_norm": 0.31987256306384754, + "learning_rate": 1.1698145936506993e-05, + "loss": 0.8461, + "num_tokens": 34545443966.0, + "step": 8265 + }, + { + "epoch": 0.9822935234699941, + "grad_norm": 0.36474831122752016, + "learning_rate": 1.1696419177077404e-05, + "loss": 0.8172, + "num_tokens": 34549604926.0, + "step": 8266 + }, + { + "epoch": 0.9824123588829471, + "grad_norm": 0.37837192387269103, + "learning_rate": 1.1694692391857055e-05, + "loss": 0.7958, + "num_tokens": 34553775827.0, + "step": 8267 + }, + { + "epoch": 0.9825311942959002, + "grad_norm": 0.37452825231616826, + "learning_rate": 1.1692965580909904e-05, + "loss": 0.8324, + "num_tokens": 34557965872.0, + "step": 8268 + }, + { + "epoch": 0.9826500297088533, + "grad_norm": 0.3919181711760015, + "learning_rate": 1.1691238744299889e-05, + "loss": 0.8248, + "num_tokens": 34562155437.0, + "step": 8269 + }, + { + "epoch": 0.9827688651218063, + "grad_norm": 0.32996284157855943, + "learning_rate": 1.1689511882090966e-05, + "loss": 0.8476, + "num_tokens": 34566344864.0, + "step": 8270 + }, + { + "epoch": 0.9828877005347594, + "grad_norm": 0.37830596030511254, + "learning_rate": 1.1687784994347091e-05, + "loss": 0.8533, + "num_tokens": 34570535662.0, + "step": 8271 + }, + { + "epoch": 0.9830065359477124, + "grad_norm": 0.32870927032686403, + "learning_rate": 1.1686058081132207e-05, + "loss": 0.8766, + "num_tokens": 34574724317.0, + "step": 8272 + }, + { + "epoch": 0.9831253713606655, + "grad_norm": 0.40342752613538835, + "learning_rate": 1.1684331142510277e-05, + "loss": 0.8411, + "num_tokens": 34578913526.0, + "step": 8273 + }, + { + "epoch": 0.9832442067736186, + "grad_norm": 0.3126857785515842, + "learning_rate": 1.1682604178545251e-05, + "loss": 0.8441, + "num_tokens": 34583102985.0, + "step": 8274 + }, + { + "epoch": 0.9833630421865716, + "grad_norm": 0.3884943157287268, + "learning_rate": 1.1680877189301082e-05, + "loss": 0.8148, + "num_tokens": 34587292112.0, + "step": 8275 + }, + { + "epoch": 0.9834818775995247, + "grad_norm": 0.3220609156432714, + "learning_rate": 1.1679150174841729e-05, + "loss": 0.8188, + "num_tokens": 34591483280.0, + "step": 8276 + }, + { + "epoch": 0.9836007130124778, + "grad_norm": 0.38237390202284294, + "learning_rate": 1.167742313523115e-05, + "loss": 0.7946, + "num_tokens": 34595671085.0, + "step": 8277 + }, + { + "epoch": 0.9837195484254307, + "grad_norm": 0.3109201359042199, + "learning_rate": 1.16756960705333e-05, + "loss": 0.866, + "num_tokens": 34599858138.0, + "step": 8278 + }, + { + "epoch": 0.9838383838383838, + "grad_norm": 0.43319731393971994, + "learning_rate": 1.1673968980812141e-05, + "loss": 0.8298, + "num_tokens": 34604028143.0, + "step": 8279 + }, + { + "epoch": 0.983957219251337, + "grad_norm": 0.3649446544794741, + "learning_rate": 1.1672241866131633e-05, + "loss": 0.8299, + "num_tokens": 34608217373.0, + "step": 8280 + }, + { + "epoch": 0.9840760546642899, + "grad_norm": 0.3747148055079618, + "learning_rate": 1.1670514726555736e-05, + "loss": 0.7997, + "num_tokens": 34612405956.0, + "step": 8281 + }, + { + "epoch": 0.984194890077243, + "grad_norm": 0.373542287781635, + "learning_rate": 1.166878756214841e-05, + "loss": 0.8451, + "num_tokens": 34616595562.0, + "step": 8282 + }, + { + "epoch": 0.984313725490196, + "grad_norm": 0.38250729173598286, + "learning_rate": 1.1667060372973617e-05, + "loss": 0.8006, + "num_tokens": 34620757026.0, + "step": 8283 + }, + { + "epoch": 0.9844325609031491, + "grad_norm": 0.33875513815886693, + "learning_rate": 1.1665333159095327e-05, + "loss": 0.8616, + "num_tokens": 34624945820.0, + "step": 8284 + }, + { + "epoch": 0.9845513963161022, + "grad_norm": 0.36230288537788885, + "learning_rate": 1.16636059205775e-05, + "loss": 0.8399, + "num_tokens": 34629135518.0, + "step": 8285 + }, + { + "epoch": 0.9846702317290552, + "grad_norm": 0.43940541029525304, + "learning_rate": 1.16618786574841e-05, + "loss": 0.838, + "num_tokens": 34633323705.0, + "step": 8286 + }, + { + "epoch": 0.9847890671420083, + "grad_norm": 0.37366414968490885, + "learning_rate": 1.1660151369879098e-05, + "loss": 0.8262, + "num_tokens": 34637512515.0, + "step": 8287 + }, + { + "epoch": 0.9849079025549614, + "grad_norm": 0.34832886922586115, + "learning_rate": 1.1658424057826456e-05, + "loss": 0.8329, + "num_tokens": 34641702211.0, + "step": 8288 + }, + { + "epoch": 0.9850267379679144, + "grad_norm": 0.31185029489638205, + "learning_rate": 1.1656696721390148e-05, + "loss": 0.8472, + "num_tokens": 34645856876.0, + "step": 8289 + }, + { + "epoch": 0.9851455733808675, + "grad_norm": 0.35817359189664183, + "learning_rate": 1.1654969360634136e-05, + "loss": 0.8437, + "num_tokens": 34650045523.0, + "step": 8290 + }, + { + "epoch": 0.9852644087938206, + "grad_norm": 0.30677651348165863, + "learning_rate": 1.1653241975622397e-05, + "loss": 0.8281, + "num_tokens": 34654223455.0, + "step": 8291 + }, + { + "epoch": 0.9853832442067736, + "grad_norm": 0.3902666061216421, + "learning_rate": 1.1651514566418897e-05, + "loss": 0.7915, + "num_tokens": 34658414381.0, + "step": 8292 + }, + { + "epoch": 0.9855020796197267, + "grad_norm": 0.358782740636341, + "learning_rate": 1.1649787133087612e-05, + "loss": 0.8356, + "num_tokens": 34662583817.0, + "step": 8293 + }, + { + "epoch": 0.9856209150326798, + "grad_norm": 0.36868034434419833, + "learning_rate": 1.1648059675692513e-05, + "loss": 0.8491, + "num_tokens": 34666771838.0, + "step": 8294 + }, + { + "epoch": 0.9857397504456328, + "grad_norm": 0.42397571118173255, + "learning_rate": 1.1646332194297572e-05, + "loss": 0.8084, + "num_tokens": 34670959372.0, + "step": 8295 + }, + { + "epoch": 0.9858585858585859, + "grad_norm": 0.3390209691263102, + "learning_rate": 1.1644604688966765e-05, + "loss": 0.8152, + "num_tokens": 34675147743.0, + "step": 8296 + }, + { + "epoch": 0.9859774212715389, + "grad_norm": 0.31983135480458247, + "learning_rate": 1.1642877159764065e-05, + "loss": 0.7854, + "num_tokens": 34679330034.0, + "step": 8297 + }, + { + "epoch": 0.986096256684492, + "grad_norm": 0.32845636185152455, + "learning_rate": 1.1641149606753454e-05, + "loss": 0.8324, + "num_tokens": 34683492337.0, + "step": 8298 + }, + { + "epoch": 0.9862150920974451, + "grad_norm": 0.33563065455628077, + "learning_rate": 1.1639422029998901e-05, + "loss": 0.8273, + "num_tokens": 34687676799.0, + "step": 8299 + }, + { + "epoch": 0.9863339275103981, + "grad_norm": 0.30725397152538314, + "learning_rate": 1.1637694429564393e-05, + "loss": 0.8285, + "num_tokens": 34691851514.0, + "step": 8300 + }, + { + "epoch": 0.9864527629233512, + "grad_norm": 0.35249622130809516, + "learning_rate": 1.1635966805513898e-05, + "loss": 0.8238, + "num_tokens": 34696041179.0, + "step": 8301 + }, + { + "epoch": 0.9865715983363043, + "grad_norm": 0.32700580892883785, + "learning_rate": 1.1634239157911405e-05, + "loss": 0.8426, + "num_tokens": 34700230786.0, + "step": 8302 + }, + { + "epoch": 0.9866904337492572, + "grad_norm": 0.34474090915863037, + "learning_rate": 1.1632511486820891e-05, + "loss": 0.7925, + "num_tokens": 34704418936.0, + "step": 8303 + }, + { + "epoch": 0.9868092691622103, + "grad_norm": 0.33812872748451167, + "learning_rate": 1.1630783792306339e-05, + "loss": 0.8347, + "num_tokens": 34708608789.0, + "step": 8304 + }, + { + "epoch": 0.9869281045751634, + "grad_norm": 0.34652788580650096, + "learning_rate": 1.1629056074431731e-05, + "loss": 0.8526, + "num_tokens": 34712797833.0, + "step": 8305 + }, + { + "epoch": 0.9870469399881164, + "grad_norm": 0.3373677451617977, + "learning_rate": 1.162732833326105e-05, + "loss": 0.8132, + "num_tokens": 34716953513.0, + "step": 8306 + }, + { + "epoch": 0.9871657754010695, + "grad_norm": 0.34556850352813034, + "learning_rate": 1.1625600568858277e-05, + "loss": 0.8596, + "num_tokens": 34721124929.0, + "step": 8307 + }, + { + "epoch": 0.9872846108140226, + "grad_norm": 0.3640395766864965, + "learning_rate": 1.1623872781287403e-05, + "loss": 0.8264, + "num_tokens": 34725315025.0, + "step": 8308 + }, + { + "epoch": 0.9874034462269756, + "grad_norm": 0.35195058296237663, + "learning_rate": 1.1622144970612409e-05, + "loss": 0.8478, + "num_tokens": 34729504291.0, + "step": 8309 + }, + { + "epoch": 0.9875222816399287, + "grad_norm": 0.3052120169273801, + "learning_rate": 1.1620417136897283e-05, + "loss": 0.8229, + "num_tokens": 34733673944.0, + "step": 8310 + }, + { + "epoch": 0.9876411170528817, + "grad_norm": 0.3192056164818972, + "learning_rate": 1.161868928020601e-05, + "loss": 0.8278, + "num_tokens": 34737855788.0, + "step": 8311 + }, + { + "epoch": 0.9877599524658348, + "grad_norm": 0.33533896035707655, + "learning_rate": 1.1616961400602587e-05, + "loss": 0.8495, + "num_tokens": 34742046501.0, + "step": 8312 + }, + { + "epoch": 0.9878787878787879, + "grad_norm": 0.36730886289422593, + "learning_rate": 1.1615233498150993e-05, + "loss": 0.8124, + "num_tokens": 34746219827.0, + "step": 8313 + }, + { + "epoch": 0.9879976232917409, + "grad_norm": 0.32920228812385105, + "learning_rate": 1.1613505572915222e-05, + "loss": 0.8031, + "num_tokens": 34750405740.0, + "step": 8314 + }, + { + "epoch": 0.988116458704694, + "grad_norm": 0.3122302295147444, + "learning_rate": 1.161177762495927e-05, + "loss": 0.8537, + "num_tokens": 34754594145.0, + "step": 8315 + }, + { + "epoch": 0.9882352941176471, + "grad_norm": 0.4300325396026827, + "learning_rate": 1.1610049654347119e-05, + "loss": 0.8323, + "num_tokens": 34758780523.0, + "step": 8316 + }, + { + "epoch": 0.9883541295306001, + "grad_norm": 0.3619906487836336, + "learning_rate": 1.1608321661142772e-05, + "loss": 0.8416, + "num_tokens": 34762969564.0, + "step": 8317 + }, + { + "epoch": 0.9884729649435532, + "grad_norm": 0.34040286540019965, + "learning_rate": 1.1606593645410211e-05, + "loss": 0.8581, + "num_tokens": 34767154338.0, + "step": 8318 + }, + { + "epoch": 0.9885918003565063, + "grad_norm": 0.3845801160426104, + "learning_rate": 1.160486560721344e-05, + "loss": 0.8566, + "num_tokens": 34771343519.0, + "step": 8319 + }, + { + "epoch": 0.9887106357694593, + "grad_norm": 0.33354194789096636, + "learning_rate": 1.1603137546616451e-05, + "loss": 0.8576, + "num_tokens": 34775532317.0, + "step": 8320 + }, + { + "epoch": 0.9888294711824124, + "grad_norm": 0.326700526893684, + "learning_rate": 1.1601409463683237e-05, + "loss": 0.8309, + "num_tokens": 34779721552.0, + "step": 8321 + }, + { + "epoch": 0.9889483065953654, + "grad_norm": 0.4121630995682986, + "learning_rate": 1.1599681358477799e-05, + "loss": 0.848, + "num_tokens": 34783910452.0, + "step": 8322 + }, + { + "epoch": 0.9890671420083185, + "grad_norm": 0.3954838522041493, + "learning_rate": 1.1597953231064135e-05, + "loss": 0.8346, + "num_tokens": 34788098523.0, + "step": 8323 + }, + { + "epoch": 0.9891859774212716, + "grad_norm": 0.30768167285614095, + "learning_rate": 1.1596225081506234e-05, + "loss": 0.8117, + "num_tokens": 34792287356.0, + "step": 8324 + }, + { + "epoch": 0.9893048128342246, + "grad_norm": 0.3875963002300547, + "learning_rate": 1.1594496909868107e-05, + "loss": 0.8442, + "num_tokens": 34796448282.0, + "step": 8325 + }, + { + "epoch": 0.9894236482471777, + "grad_norm": 0.3610558807513014, + "learning_rate": 1.1592768716213751e-05, + "loss": 0.8322, + "num_tokens": 34800635989.0, + "step": 8326 + }, + { + "epoch": 0.9895424836601308, + "grad_norm": 0.31410966711367644, + "learning_rate": 1.1591040500607162e-05, + "loss": 0.8466, + "num_tokens": 34804793047.0, + "step": 8327 + }, + { + "epoch": 0.9896613190730837, + "grad_norm": 0.29696311823705424, + "learning_rate": 1.1589312263112347e-05, + "loss": 0.8328, + "num_tokens": 34808953187.0, + "step": 8328 + }, + { + "epoch": 0.9897801544860368, + "grad_norm": 0.3449248149392982, + "learning_rate": 1.1587584003793307e-05, + "loss": 0.8037, + "num_tokens": 34813142480.0, + "step": 8329 + }, + { + "epoch": 0.98989898989899, + "grad_norm": 0.3232305567989265, + "learning_rate": 1.1585855722714039e-05, + "loss": 0.8671, + "num_tokens": 34817326875.0, + "step": 8330 + }, + { + "epoch": 0.9900178253119429, + "grad_norm": 0.3743398696771858, + "learning_rate": 1.1584127419938558e-05, + "loss": 0.799, + "num_tokens": 34821517764.0, + "step": 8331 + }, + { + "epoch": 0.990136660724896, + "grad_norm": 0.3496010559186332, + "learning_rate": 1.1582399095530864e-05, + "loss": 0.8505, + "num_tokens": 34825706678.0, + "step": 8332 + }, + { + "epoch": 0.9902554961378491, + "grad_norm": 0.325882362797077, + "learning_rate": 1.1580670749554961e-05, + "loss": 0.8133, + "num_tokens": 34829896541.0, + "step": 8333 + }, + { + "epoch": 0.9903743315508021, + "grad_norm": 0.36072198332436406, + "learning_rate": 1.1578942382074857e-05, + "loss": 0.8187, + "num_tokens": 34834058450.0, + "step": 8334 + }, + { + "epoch": 0.9904931669637552, + "grad_norm": 0.3539200707337579, + "learning_rate": 1.1577213993154559e-05, + "loss": 0.8489, + "num_tokens": 34838226532.0, + "step": 8335 + }, + { + "epoch": 0.9906120023767082, + "grad_norm": 0.2856782598699268, + "learning_rate": 1.1575485582858077e-05, + "loss": 0.824, + "num_tokens": 34842413770.0, + "step": 8336 + }, + { + "epoch": 0.9907308377896613, + "grad_norm": 0.3282467846500332, + "learning_rate": 1.1573757151249413e-05, + "loss": 0.8374, + "num_tokens": 34846602882.0, + "step": 8337 + }, + { + "epoch": 0.9908496732026144, + "grad_norm": 0.35792908249523625, + "learning_rate": 1.1572028698392585e-05, + "loss": 0.8339, + "num_tokens": 34850792019.0, + "step": 8338 + }, + { + "epoch": 0.9909685086155674, + "grad_norm": 0.3670886487360022, + "learning_rate": 1.1570300224351602e-05, + "loss": 0.8709, + "num_tokens": 34854975007.0, + "step": 8339 + }, + { + "epoch": 0.9910873440285205, + "grad_norm": 0.3251969353930513, + "learning_rate": 1.156857172919047e-05, + "loss": 0.8463, + "num_tokens": 34859164553.0, + "step": 8340 + }, + { + "epoch": 0.9912061794414736, + "grad_norm": 0.37239460236948874, + "learning_rate": 1.1566843212973207e-05, + "loss": 0.8836, + "num_tokens": 34863351260.0, + "step": 8341 + }, + { + "epoch": 0.9913250148544266, + "grad_norm": 0.35276536276299203, + "learning_rate": 1.1565114675763823e-05, + "loss": 0.8369, + "num_tokens": 34867517215.0, + "step": 8342 + }, + { + "epoch": 0.9914438502673797, + "grad_norm": 0.33666087581282655, + "learning_rate": 1.1563386117626329e-05, + "loss": 0.8472, + "num_tokens": 34871706011.0, + "step": 8343 + }, + { + "epoch": 0.9915626856803328, + "grad_norm": 0.3697081225885035, + "learning_rate": 1.1561657538624747e-05, + "loss": 0.8349, + "num_tokens": 34875893462.0, + "step": 8344 + }, + { + "epoch": 0.9916815210932858, + "grad_norm": 0.416658567405946, + "learning_rate": 1.1559928938823085e-05, + "loss": 0.8195, + "num_tokens": 34880073026.0, + "step": 8345 + }, + { + "epoch": 0.9918003565062389, + "grad_norm": 0.3226163829451126, + "learning_rate": 1.1558200318285361e-05, + "loss": 0.8259, + "num_tokens": 34884254281.0, + "step": 8346 + }, + { + "epoch": 0.9919191919191919, + "grad_norm": 0.43759465571880724, + "learning_rate": 1.1556471677075589e-05, + "loss": 0.8205, + "num_tokens": 34888442957.0, + "step": 8347 + }, + { + "epoch": 0.992038027332145, + "grad_norm": 0.3634531765589202, + "learning_rate": 1.1554743015257795e-05, + "loss": 0.8262, + "num_tokens": 34892633981.0, + "step": 8348 + }, + { + "epoch": 0.9921568627450981, + "grad_norm": 0.32140523301103735, + "learning_rate": 1.1553014332895987e-05, + "loss": 0.8237, + "num_tokens": 34896821978.0, + "step": 8349 + }, + { + "epoch": 0.9922756981580511, + "grad_norm": 0.4426980950323953, + "learning_rate": 1.155128563005419e-05, + "loss": 0.822, + "num_tokens": 34901010911.0, + "step": 8350 + }, + { + "epoch": 0.9923945335710042, + "grad_norm": 0.31670742040786376, + "learning_rate": 1.1549556906796424e-05, + "loss": 0.8115, + "num_tokens": 34905199874.0, + "step": 8351 + }, + { + "epoch": 0.9925133689839573, + "grad_norm": 0.44586687408805287, + "learning_rate": 1.1547828163186708e-05, + "loss": 0.8518, + "num_tokens": 34909390079.0, + "step": 8352 + }, + { + "epoch": 0.9926322043969102, + "grad_norm": 0.35505147451132085, + "learning_rate": 1.154609939928906e-05, + "loss": 0.8148, + "num_tokens": 34913578450.0, + "step": 8353 + }, + { + "epoch": 0.9927510398098633, + "grad_norm": 0.35989481972229004, + "learning_rate": 1.1544370615167504e-05, + "loss": 0.8333, + "num_tokens": 34917748299.0, + "step": 8354 + }, + { + "epoch": 0.9928698752228164, + "grad_norm": 0.41537391469540863, + "learning_rate": 1.1542641810886066e-05, + "loss": 0.8285, + "num_tokens": 34921911900.0, + "step": 8355 + }, + { + "epoch": 0.9929887106357694, + "grad_norm": 0.3369873136500374, + "learning_rate": 1.1540912986508765e-05, + "loss": 0.8605, + "num_tokens": 34926101467.0, + "step": 8356 + }, + { + "epoch": 0.9931075460487225, + "grad_norm": 0.36445165220212733, + "learning_rate": 1.153918414209963e-05, + "loss": 0.817, + "num_tokens": 34930289709.0, + "step": 8357 + }, + { + "epoch": 0.9932263814616756, + "grad_norm": 0.3867672599165347, + "learning_rate": 1.1537455277722682e-05, + "loss": 0.8571, + "num_tokens": 34934476350.0, + "step": 8358 + }, + { + "epoch": 0.9933452168746286, + "grad_norm": 0.35408405431771334, + "learning_rate": 1.1535726393441949e-05, + "loss": 0.8529, + "num_tokens": 34938664585.0, + "step": 8359 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 0.41889641763223506, + "learning_rate": 1.1533997489321454e-05, + "loss": 0.8177, + "num_tokens": 34942853561.0, + "step": 8360 + }, + { + "epoch": 0.9935828877005347, + "grad_norm": 0.2959446572213374, + "learning_rate": 1.1532268565425228e-05, + "loss": 0.8427, + "num_tokens": 34947042950.0, + "step": 8361 + }, + { + "epoch": 0.9937017231134878, + "grad_norm": 0.3926383337598507, + "learning_rate": 1.1530539621817296e-05, + "loss": 0.8196, + "num_tokens": 34951173695.0, + "step": 8362 + }, + { + "epoch": 0.9938205585264409, + "grad_norm": 0.34021860271849524, + "learning_rate": 1.1528810658561689e-05, + "loss": 0.8231, + "num_tokens": 34955361793.0, + "step": 8363 + }, + { + "epoch": 0.9939393939393939, + "grad_norm": 0.40174253856667397, + "learning_rate": 1.1527081675722436e-05, + "loss": 0.8479, + "num_tokens": 34959538796.0, + "step": 8364 + }, + { + "epoch": 0.994058229352347, + "grad_norm": 0.3522691156876679, + "learning_rate": 1.1525352673363563e-05, + "loss": 0.8241, + "num_tokens": 34963726655.0, + "step": 8365 + }, + { + "epoch": 0.9941770647653001, + "grad_norm": 0.341336347624272, + "learning_rate": 1.1523623651549109e-05, + "loss": 0.8326, + "num_tokens": 34967915239.0, + "step": 8366 + }, + { + "epoch": 0.9942959001782531, + "grad_norm": 0.32027486599795046, + "learning_rate": 1.1521894610343098e-05, + "loss": 0.8464, + "num_tokens": 34972104208.0, + "step": 8367 + }, + { + "epoch": 0.9944147355912062, + "grad_norm": 0.3835104012699084, + "learning_rate": 1.1520165549809568e-05, + "loss": 0.8236, + "num_tokens": 34976292651.0, + "step": 8368 + }, + { + "epoch": 0.9945335710041593, + "grad_norm": 0.33947259620335546, + "learning_rate": 1.1518436470012545e-05, + "loss": 0.8011, + "num_tokens": 34980445308.0, + "step": 8369 + }, + { + "epoch": 0.9946524064171123, + "grad_norm": 0.42226360134484237, + "learning_rate": 1.1516707371016069e-05, + "loss": 0.8069, + "num_tokens": 34984632794.0, + "step": 8370 + }, + { + "epoch": 0.9947712418300654, + "grad_norm": 0.31786071359729906, + "learning_rate": 1.1514978252884172e-05, + "loss": 0.8219, + "num_tokens": 34988822714.0, + "step": 8371 + }, + { + "epoch": 0.9948900772430184, + "grad_norm": 0.3473176004347278, + "learning_rate": 1.1513249115680886e-05, + "loss": 0.8575, + "num_tokens": 34993012278.0, + "step": 8372 + }, + { + "epoch": 0.9950089126559715, + "grad_norm": 0.3182314149074181, + "learning_rate": 1.1511519959470254e-05, + "loss": 0.8233, + "num_tokens": 34997179480.0, + "step": 8373 + }, + { + "epoch": 0.9951277480689246, + "grad_norm": 0.35493163976797787, + "learning_rate": 1.1509790784316307e-05, + "loss": 0.8193, + "num_tokens": 35001359794.0, + "step": 8374 + }, + { + "epoch": 0.9952465834818776, + "grad_norm": 0.3306213188584462, + "learning_rate": 1.1508061590283084e-05, + "loss": 0.8661, + "num_tokens": 35005548912.0, + "step": 8375 + }, + { + "epoch": 0.9953654188948307, + "grad_norm": 0.32855778910246153, + "learning_rate": 1.150633237743462e-05, + "loss": 0.8147, + "num_tokens": 35009728177.0, + "step": 8376 + }, + { + "epoch": 0.9954842543077838, + "grad_norm": 0.38047813198192826, + "learning_rate": 1.1504603145834958e-05, + "loss": 0.8574, + "num_tokens": 35013916226.0, + "step": 8377 + }, + { + "epoch": 0.9956030897207367, + "grad_norm": 0.32300452173658695, + "learning_rate": 1.1502873895548137e-05, + "loss": 0.8537, + "num_tokens": 35018082607.0, + "step": 8378 + }, + { + "epoch": 0.9957219251336898, + "grad_norm": 0.36636966934401893, + "learning_rate": 1.1501144626638194e-05, + "loss": 0.8295, + "num_tokens": 35022271280.0, + "step": 8379 + }, + { + "epoch": 0.995840760546643, + "grad_norm": 0.3232830108865295, + "learning_rate": 1.1499415339169171e-05, + "loss": 0.8312, + "num_tokens": 35026461267.0, + "step": 8380 + }, + { + "epoch": 0.9959595959595959, + "grad_norm": 0.34007475869705, + "learning_rate": 1.149768603320511e-05, + "loss": 0.8441, + "num_tokens": 35030608890.0, + "step": 8381 + }, + { + "epoch": 0.996078431372549, + "grad_norm": 0.38401766597226394, + "learning_rate": 1.149595670881005e-05, + "loss": 0.8116, + "num_tokens": 35034775769.0, + "step": 8382 + }, + { + "epoch": 0.9961972667855021, + "grad_norm": 0.31674330344957574, + "learning_rate": 1.1494227366048039e-05, + "loss": 0.8304, + "num_tokens": 35038934730.0, + "step": 8383 + }, + { + "epoch": 0.9963161021984551, + "grad_norm": 0.36641448854951747, + "learning_rate": 1.1492498004983115e-05, + "loss": 0.8665, + "num_tokens": 35043124639.0, + "step": 8384 + }, + { + "epoch": 0.9964349376114082, + "grad_norm": 0.3532994388601355, + "learning_rate": 1.1490768625679326e-05, + "loss": 0.8084, + "num_tokens": 35047306035.0, + "step": 8385 + }, + { + "epoch": 0.9965537730243612, + "grad_norm": 0.32648478341137144, + "learning_rate": 1.1489039228200718e-05, + "loss": 0.8187, + "num_tokens": 35051494335.0, + "step": 8386 + }, + { + "epoch": 0.9966726084373143, + "grad_norm": 0.37070200185873187, + "learning_rate": 1.1487309812611331e-05, + "loss": 0.8427, + "num_tokens": 35055650238.0, + "step": 8387 + }, + { + "epoch": 0.9967914438502674, + "grad_norm": 0.33594580767462134, + "learning_rate": 1.1485580378975215e-05, + "loss": 0.812, + "num_tokens": 35059839915.0, + "step": 8388 + }, + { + "epoch": 0.9969102792632204, + "grad_norm": 0.44162107345667173, + "learning_rate": 1.1483850927356414e-05, + "loss": 0.8194, + "num_tokens": 35064019331.0, + "step": 8389 + }, + { + "epoch": 0.9970291146761735, + "grad_norm": 0.3590812508701262, + "learning_rate": 1.148212145781898e-05, + "loss": 0.8367, + "num_tokens": 35068209270.0, + "step": 8390 + }, + { + "epoch": 0.9971479500891266, + "grad_norm": 0.39093433315785225, + "learning_rate": 1.1480391970426957e-05, + "loss": 0.8258, + "num_tokens": 35072398297.0, + "step": 8391 + }, + { + "epoch": 0.9972667855020796, + "grad_norm": 0.36461213844517215, + "learning_rate": 1.1478662465244395e-05, + "loss": 0.8009, + "num_tokens": 35076587917.0, + "step": 8392 + }, + { + "epoch": 0.9973856209150327, + "grad_norm": 0.40973069115506844, + "learning_rate": 1.1476932942335344e-05, + "loss": 0.8201, + "num_tokens": 35080777211.0, + "step": 8393 + }, + { + "epoch": 0.9975044563279858, + "grad_norm": 0.32678268036593006, + "learning_rate": 1.1475203401763852e-05, + "loss": 0.8433, + "num_tokens": 35084966442.0, + "step": 8394 + }, + { + "epoch": 0.9976232917409388, + "grad_norm": 0.404002241337783, + "learning_rate": 1.1473473843593972e-05, + "loss": 0.8522, + "num_tokens": 35089155364.0, + "step": 8395 + }, + { + "epoch": 0.9977421271538919, + "grad_norm": 0.3787763079777646, + "learning_rate": 1.1471744267889758e-05, + "loss": 0.8207, + "num_tokens": 35093320458.0, + "step": 8396 + }, + { + "epoch": 0.9978609625668449, + "grad_norm": 0.33871020261012397, + "learning_rate": 1.1470014674715253e-05, + "loss": 0.802, + "num_tokens": 35097507552.0, + "step": 8397 + }, + { + "epoch": 0.997979797979798, + "grad_norm": 0.34295072773475255, + "learning_rate": 1.1468285064134518e-05, + "loss": 0.8398, + "num_tokens": 35101696519.0, + "step": 8398 + }, + { + "epoch": 0.9980986333927511, + "grad_norm": 0.35572764810436475, + "learning_rate": 1.1466555436211607e-05, + "loss": 0.8212, + "num_tokens": 35105882606.0, + "step": 8399 + }, + { + "epoch": 0.9982174688057041, + "grad_norm": 0.33281757507749515, + "learning_rate": 1.1464825791010565e-05, + "loss": 0.8132, + "num_tokens": 35110050182.0, + "step": 8400 + }, + { + "epoch": 0.9983363042186572, + "grad_norm": 0.3734540731730909, + "learning_rate": 1.1463096128595459e-05, + "loss": 0.8344, + "num_tokens": 35114238857.0, + "step": 8401 + }, + { + "epoch": 0.9984551396316103, + "grad_norm": 0.3154654459941828, + "learning_rate": 1.1461366449030331e-05, + "loss": 0.8098, + "num_tokens": 35118428359.0, + "step": 8402 + }, + { + "epoch": 0.9985739750445632, + "grad_norm": 0.40922876603462316, + "learning_rate": 1.1459636752379246e-05, + "loss": 0.868, + "num_tokens": 35122617597.0, + "step": 8403 + }, + { + "epoch": 0.9986928104575163, + "grad_norm": 0.2969139343384184, + "learning_rate": 1.1457907038706262e-05, + "loss": 0.819, + "num_tokens": 35126804113.0, + "step": 8404 + }, + { + "epoch": 0.9988116458704694, + "grad_norm": 0.4631613773464848, + "learning_rate": 1.1456177308075427e-05, + "loss": 0.8519, + "num_tokens": 35130992269.0, + "step": 8405 + }, + { + "epoch": 0.9989304812834224, + "grad_norm": 0.37329266721371374, + "learning_rate": 1.1454447560550802e-05, + "loss": 0.8165, + "num_tokens": 35135152315.0, + "step": 8406 + }, + { + "epoch": 0.9990493166963755, + "grad_norm": 0.35994098873178904, + "learning_rate": 1.1452717796196453e-05, + "loss": 0.8614, + "num_tokens": 35139342369.0, + "step": 8407 + }, + { + "epoch": 0.9991681521093286, + "grad_norm": 0.3642391363639566, + "learning_rate": 1.1450988015076429e-05, + "loss": 0.8536, + "num_tokens": 35143507446.0, + "step": 8408 + }, + { + "epoch": 0.9992869875222816, + "grad_norm": 0.3530583851456611, + "learning_rate": 1.1449258217254799e-05, + "loss": 0.82, + "num_tokens": 35147695520.0, + "step": 8409 + }, + { + "epoch": 0.9994058229352347, + "grad_norm": 0.3956622341329999, + "learning_rate": 1.1447528402795615e-05, + "loss": 0.8129, + "num_tokens": 35151803612.0, + "step": 8410 + }, + { + "epoch": 0.9995246583481877, + "grad_norm": 0.3018720880896415, + "learning_rate": 1.1445798571762941e-05, + "loss": 0.8331, + "num_tokens": 35155975548.0, + "step": 8411 + }, + { + "epoch": 0.9996434937611408, + "grad_norm": 0.3979060458855823, + "learning_rate": 1.1444068724220841e-05, + "loss": 0.8626, + "num_tokens": 35160164439.0, + "step": 8412 + }, + { + "epoch": 0.9997623291740939, + "grad_norm": 0.31595738104176807, + "learning_rate": 1.144233886023337e-05, + "loss": 0.856, + "num_tokens": 35164340497.0, + "step": 8413 + }, + { + "epoch": 0.9998811645870469, + "grad_norm": 0.37223297677888495, + "learning_rate": 1.14406089798646e-05, + "loss": 0.8385, + "num_tokens": 35168530359.0, + "step": 8414 + }, + { + "epoch": 1.0, + "grad_norm": 0.3151132197871044, + "learning_rate": 1.1438879083178589e-05, + "loss": 0.8407, + "num_tokens": 35172718332.0, + "step": 8415 + }, + { + "epoch": 1.000118835412953, + "grad_norm": 0.36827678473404485, + "learning_rate": 1.1437149170239397e-05, + "loss": 0.844, + "num_tokens": 35176905484.0, + "step": 8416 + }, + { + "epoch": 1.0002376708259062, + "grad_norm": 0.39523691611329403, + "learning_rate": 1.1435419241111099e-05, + "loss": 0.844, + "num_tokens": 35181093981.0, + "step": 8417 + }, + { + "epoch": 1.000356506238859, + "grad_norm": 0.3959347788117729, + "learning_rate": 1.1433689295857753e-05, + "loss": 0.8, + "num_tokens": 35185283337.0, + "step": 8418 + }, + { + "epoch": 1.0004753416518122, + "grad_norm": 0.34160889515755805, + "learning_rate": 1.1431959334543423e-05, + "loss": 0.8508, + "num_tokens": 35189472546.0, + "step": 8419 + }, + { + "epoch": 1.0005941770647653, + "grad_norm": 0.3995297101670817, + "learning_rate": 1.143022935723218e-05, + "loss": 0.8053, + "num_tokens": 35193640340.0, + "step": 8420 + }, + { + "epoch": 1.0007130124777184, + "grad_norm": 0.3379591057992053, + "learning_rate": 1.1428499363988092e-05, + "loss": 0.8408, + "num_tokens": 35197826314.0, + "step": 8421 + }, + { + "epoch": 1.0008318478906715, + "grad_norm": 0.4133904748667997, + "learning_rate": 1.142676935487522e-05, + "loss": 0.8249, + "num_tokens": 35201961376.0, + "step": 8422 + }, + { + "epoch": 1.0009506833036246, + "grad_norm": 0.37750998253937484, + "learning_rate": 1.1425039329957637e-05, + "loss": 0.821, + "num_tokens": 35206142003.0, + "step": 8423 + }, + { + "epoch": 1.0010695187165775, + "grad_norm": 0.3842412943661655, + "learning_rate": 1.1423309289299411e-05, + "loss": 0.8495, + "num_tokens": 35210316977.0, + "step": 8424 + }, + { + "epoch": 1.0011883541295306, + "grad_norm": 0.37646380939926866, + "learning_rate": 1.1421579232964613e-05, + "loss": 0.8085, + "num_tokens": 35214506130.0, + "step": 8425 + }, + { + "epoch": 1.0013071895424837, + "grad_norm": 0.3817076298595507, + "learning_rate": 1.1419849161017305e-05, + "loss": 0.8195, + "num_tokens": 35218695819.0, + "step": 8426 + }, + { + "epoch": 1.0014260249554368, + "grad_norm": 0.32239862147365467, + "learning_rate": 1.1418119073521567e-05, + "loss": 0.7919, + "num_tokens": 35222862306.0, + "step": 8427 + }, + { + "epoch": 1.0015448603683899, + "grad_norm": 0.371569294045942, + "learning_rate": 1.1416388970541468e-05, + "loss": 0.815, + "num_tokens": 35227033044.0, + "step": 8428 + }, + { + "epoch": 1.0016636957813427, + "grad_norm": 0.35442185764125167, + "learning_rate": 1.1414658852141072e-05, + "loss": 0.8078, + "num_tokens": 35231195760.0, + "step": 8429 + }, + { + "epoch": 1.0017825311942958, + "grad_norm": 0.3723187690735229, + "learning_rate": 1.1412928718384462e-05, + "loss": 0.8221, + "num_tokens": 35235374117.0, + "step": 8430 + }, + { + "epoch": 1.001901366607249, + "grad_norm": 0.3403607752600095, + "learning_rate": 1.1411198569335702e-05, + "loss": 0.8165, + "num_tokens": 35239563366.0, + "step": 8431 + }, + { + "epoch": 1.002020202020202, + "grad_norm": 0.3657791808585208, + "learning_rate": 1.1409468405058867e-05, + "loss": 0.8368, + "num_tokens": 35243752591.0, + "step": 8432 + }, + { + "epoch": 1.0021390374331551, + "grad_norm": 0.4006180839416463, + "learning_rate": 1.1407738225618037e-05, + "loss": 0.8185, + "num_tokens": 35247911733.0, + "step": 8433 + }, + { + "epoch": 1.0022578728461082, + "grad_norm": 0.4362571538690303, + "learning_rate": 1.140600803107728e-05, + "loss": 0.8026, + "num_tokens": 35252083553.0, + "step": 8434 + }, + { + "epoch": 1.0023767082590611, + "grad_norm": 0.33850929929838186, + "learning_rate": 1.1404277821500673e-05, + "loss": 0.832, + "num_tokens": 35256273662.0, + "step": 8435 + }, + { + "epoch": 1.0024955436720142, + "grad_norm": 0.5802472457793222, + "learning_rate": 1.1402547596952289e-05, + "loss": 0.8235, + "num_tokens": 35260456191.0, + "step": 8436 + }, + { + "epoch": 1.0026143790849673, + "grad_norm": 0.3713630090090835, + "learning_rate": 1.140081735749621e-05, + "loss": 0.8101, + "num_tokens": 35264645771.0, + "step": 8437 + }, + { + "epoch": 1.0027332144979204, + "grad_norm": 0.5777344960855035, + "learning_rate": 1.139908710319651e-05, + "loss": 0.8426, + "num_tokens": 35268815589.0, + "step": 8438 + }, + { + "epoch": 1.0028520499108735, + "grad_norm": 0.42709003370354176, + "learning_rate": 1.1397356834117262e-05, + "loss": 0.8534, + "num_tokens": 35273003630.0, + "step": 8439 + }, + { + "epoch": 1.0029708853238266, + "grad_norm": 0.4392396199525758, + "learning_rate": 1.1395626550322545e-05, + "loss": 0.8019, + "num_tokens": 35277192591.0, + "step": 8440 + }, + { + "epoch": 1.0030897207367795, + "grad_norm": 0.44829843722977014, + "learning_rate": 1.1393896251876447e-05, + "loss": 0.815, + "num_tokens": 35281375976.0, + "step": 8441 + }, + { + "epoch": 1.0032085561497326, + "grad_norm": 0.3942479610837953, + "learning_rate": 1.139216593884303e-05, + "loss": 0.8032, + "num_tokens": 35285529204.0, + "step": 8442 + }, + { + "epoch": 1.0033273915626857, + "grad_norm": 0.4044386711483192, + "learning_rate": 1.1390435611286388e-05, + "loss": 0.8419, + "num_tokens": 35289716630.0, + "step": 8443 + }, + { + "epoch": 1.0034462269756388, + "grad_norm": 0.39593960617615054, + "learning_rate": 1.1388705269270595e-05, + "loss": 0.8202, + "num_tokens": 35293905450.0, + "step": 8444 + }, + { + "epoch": 1.0035650623885919, + "grad_norm": 0.4281398268058112, + "learning_rate": 1.1386974912859727e-05, + "loss": 0.8118, + "num_tokens": 35298095042.0, + "step": 8445 + }, + { + "epoch": 1.0036838978015448, + "grad_norm": 0.3681419179432974, + "learning_rate": 1.1385244542117876e-05, + "loss": 0.8521, + "num_tokens": 35302279312.0, + "step": 8446 + }, + { + "epoch": 1.0038027332144979, + "grad_norm": 0.5345186371724723, + "learning_rate": 1.1383514157109113e-05, + "loss": 0.8358, + "num_tokens": 35306448203.0, + "step": 8447 + }, + { + "epoch": 1.003921568627451, + "grad_norm": 0.41006165117007104, + "learning_rate": 1.1381783757897526e-05, + "loss": 0.793, + "num_tokens": 35310627899.0, + "step": 8448 + }, + { + "epoch": 1.004040404040404, + "grad_norm": 0.4813286301262758, + "learning_rate": 1.1380053344547195e-05, + "loss": 0.8615, + "num_tokens": 35314816680.0, + "step": 8449 + }, + { + "epoch": 1.0041592394533572, + "grad_norm": 0.4432771688747181, + "learning_rate": 1.1378322917122202e-05, + "loss": 0.8347, + "num_tokens": 35318998985.0, + "step": 8450 + }, + { + "epoch": 1.0042780748663103, + "grad_norm": 0.43500247342309417, + "learning_rate": 1.1376592475686634e-05, + "loss": 0.7876, + "num_tokens": 35323188484.0, + "step": 8451 + }, + { + "epoch": 1.0043969102792631, + "grad_norm": 0.40751829232209597, + "learning_rate": 1.1374862020304574e-05, + "loss": 0.8101, + "num_tokens": 35327376165.0, + "step": 8452 + }, + { + "epoch": 1.0045157456922162, + "grad_norm": 0.47533906689752403, + "learning_rate": 1.1373131551040102e-05, + "loss": 0.8237, + "num_tokens": 35331566088.0, + "step": 8453 + }, + { + "epoch": 1.0046345811051693, + "grad_norm": 0.3661207388316373, + "learning_rate": 1.1371401067957311e-05, + "loss": 0.7987, + "num_tokens": 35335756895.0, + "step": 8454 + }, + { + "epoch": 1.0047534165181224, + "grad_norm": 0.580365337352647, + "learning_rate": 1.136967057112028e-05, + "loss": 0.8421, + "num_tokens": 35339944902.0, + "step": 8455 + }, + { + "epoch": 1.0048722519310755, + "grad_norm": 0.37087379121657144, + "learning_rate": 1.1367940060593101e-05, + "loss": 0.8347, + "num_tokens": 35344133952.0, + "step": 8456 + }, + { + "epoch": 1.0049910873440284, + "grad_norm": 0.5747818682233427, + "learning_rate": 1.1366209536439856e-05, + "loss": 0.8103, + "num_tokens": 35348292168.0, + "step": 8457 + }, + { + "epoch": 1.0051099227569815, + "grad_norm": 0.43956480677924414, + "learning_rate": 1.1364478998724633e-05, + "loss": 0.8271, + "num_tokens": 35352469520.0, + "step": 8458 + }, + { + "epoch": 1.0052287581699346, + "grad_norm": 0.48648085247134093, + "learning_rate": 1.1362748447511523e-05, + "loss": 0.8334, + "num_tokens": 35356639114.0, + "step": 8459 + }, + { + "epoch": 1.0053475935828877, + "grad_norm": 0.47390320986168055, + "learning_rate": 1.136101788286461e-05, + "loss": 0.7981, + "num_tokens": 35360828460.0, + "step": 8460 + }, + { + "epoch": 1.0054664289958408, + "grad_norm": 0.398611788002675, + "learning_rate": 1.1359287304847984e-05, + "loss": 0.827, + "num_tokens": 35365015496.0, + "step": 8461 + }, + { + "epoch": 1.005585264408794, + "grad_norm": 0.47653329182781035, + "learning_rate": 1.1357556713525732e-05, + "loss": 0.8209, + "num_tokens": 35369199610.0, + "step": 8462 + }, + { + "epoch": 1.0057040998217468, + "grad_norm": 0.3673296320687643, + "learning_rate": 1.1355826108961949e-05, + "loss": 0.8061, + "num_tokens": 35373388717.0, + "step": 8463 + }, + { + "epoch": 1.0058229352347, + "grad_norm": 0.3665292468156755, + "learning_rate": 1.135409549122072e-05, + "loss": 0.8177, + "num_tokens": 35377578253.0, + "step": 8464 + }, + { + "epoch": 1.005941770647653, + "grad_norm": 0.4001566960192481, + "learning_rate": 1.1352364860366138e-05, + "loss": 0.8329, + "num_tokens": 35381766099.0, + "step": 8465 + }, + { + "epoch": 1.006060606060606, + "grad_norm": 0.32489596469705007, + "learning_rate": 1.1350634216462294e-05, + "loss": 0.8454, + "num_tokens": 35385932727.0, + "step": 8466 + }, + { + "epoch": 1.0061794414735592, + "grad_norm": 0.3693794370855228, + "learning_rate": 1.1348903559573281e-05, + "loss": 0.7784, + "num_tokens": 35390122288.0, + "step": 8467 + }, + { + "epoch": 1.006298276886512, + "grad_norm": 0.3325460355613977, + "learning_rate": 1.1347172889763187e-05, + "loss": 0.8017, + "num_tokens": 35394288692.0, + "step": 8468 + }, + { + "epoch": 1.0064171122994652, + "grad_norm": 0.3611230437168519, + "learning_rate": 1.1345442207096111e-05, + "loss": 0.8127, + "num_tokens": 35398478822.0, + "step": 8469 + }, + { + "epoch": 1.0065359477124183, + "grad_norm": 0.39652663491181545, + "learning_rate": 1.134371151163614e-05, + "loss": 0.8255, + "num_tokens": 35402642362.0, + "step": 8470 + }, + { + "epoch": 1.0066547831253714, + "grad_norm": 0.3317031599734896, + "learning_rate": 1.1341980803447367e-05, + "loss": 0.8304, + "num_tokens": 35406830458.0, + "step": 8471 + }, + { + "epoch": 1.0067736185383245, + "grad_norm": 0.4859121348146413, + "learning_rate": 1.1340250082593893e-05, + "loss": 0.8751, + "num_tokens": 35411013169.0, + "step": 8472 + }, + { + "epoch": 1.0068924539512776, + "grad_norm": 0.34616851050075675, + "learning_rate": 1.1338519349139805e-05, + "loss": 0.8124, + "num_tokens": 35415201749.0, + "step": 8473 + }, + { + "epoch": 1.0070112893642305, + "grad_norm": 0.5032460235501878, + "learning_rate": 1.1336788603149204e-05, + "loss": 0.8211, + "num_tokens": 35419383392.0, + "step": 8474 + }, + { + "epoch": 1.0071301247771836, + "grad_norm": 0.36398441918187713, + "learning_rate": 1.1335057844686181e-05, + "loss": 0.8234, + "num_tokens": 35423570357.0, + "step": 8475 + }, + { + "epoch": 1.0072489601901367, + "grad_norm": 0.5223686377505412, + "learning_rate": 1.1333327073814833e-05, + "loss": 0.864, + "num_tokens": 35427759494.0, + "step": 8476 + }, + { + "epoch": 1.0073677956030898, + "grad_norm": 0.44984877343202556, + "learning_rate": 1.1331596290599255e-05, + "loss": 0.8224, + "num_tokens": 35431946854.0, + "step": 8477 + }, + { + "epoch": 1.0074866310160429, + "grad_norm": 0.5147963270374478, + "learning_rate": 1.1329865495103547e-05, + "loss": 0.8856, + "num_tokens": 35436136171.0, + "step": 8478 + }, + { + "epoch": 1.0076054664289957, + "grad_norm": 0.477958439072053, + "learning_rate": 1.1328134687391801e-05, + "loss": 0.8248, + "num_tokens": 35440323498.0, + "step": 8479 + }, + { + "epoch": 1.0077243018419488, + "grad_norm": 0.43836855829913657, + "learning_rate": 1.1326403867528121e-05, + "loss": 0.8399, + "num_tokens": 35444509598.0, + "step": 8480 + }, + { + "epoch": 1.007843137254902, + "grad_norm": 0.4722320364779647, + "learning_rate": 1.1324673035576601e-05, + "loss": 0.7998, + "num_tokens": 35448690589.0, + "step": 8481 + }, + { + "epoch": 1.007961972667855, + "grad_norm": 0.39989788957019373, + "learning_rate": 1.1322942191601342e-05, + "loss": 0.8029, + "num_tokens": 35452850993.0, + "step": 8482 + }, + { + "epoch": 1.0080808080808081, + "grad_norm": 0.4737029856797733, + "learning_rate": 1.132121133566644e-05, + "loss": 0.8232, + "num_tokens": 35457040272.0, + "step": 8483 + }, + { + "epoch": 1.0081996434937612, + "grad_norm": 0.42010278543559293, + "learning_rate": 1.1319480467836e-05, + "loss": 0.8088, + "num_tokens": 35461229879.0, + "step": 8484 + }, + { + "epoch": 1.008318478906714, + "grad_norm": 0.40738954379090303, + "learning_rate": 1.1317749588174116e-05, + "loss": 0.8345, + "num_tokens": 35465419999.0, + "step": 8485 + }, + { + "epoch": 1.0084373143196672, + "grad_norm": 0.4204236997995057, + "learning_rate": 1.1316018696744888e-05, + "loss": 0.8563, + "num_tokens": 35469594323.0, + "step": 8486 + }, + { + "epoch": 1.0085561497326203, + "grad_norm": 0.398326667557238, + "learning_rate": 1.1314287793612421e-05, + "loss": 0.8278, + "num_tokens": 35473785098.0, + "step": 8487 + }, + { + "epoch": 1.0086749851455734, + "grad_norm": 0.4263089595889442, + "learning_rate": 1.1312556878840817e-05, + "loss": 0.8039, + "num_tokens": 35477941126.0, + "step": 8488 + }, + { + "epoch": 1.0087938205585265, + "grad_norm": 0.3962822820317918, + "learning_rate": 1.131082595249417e-05, + "loss": 0.8347, + "num_tokens": 35482110022.0, + "step": 8489 + }, + { + "epoch": 1.0089126559714796, + "grad_norm": 0.34282394512539205, + "learning_rate": 1.130909501463659e-05, + "loss": 0.8007, + "num_tokens": 35486300195.0, + "step": 8490 + }, + { + "epoch": 1.0090314913844325, + "grad_norm": 0.35196759288900414, + "learning_rate": 1.1307364065332177e-05, + "loss": 0.846, + "num_tokens": 35490488419.0, + "step": 8491 + }, + { + "epoch": 1.0091503267973856, + "grad_norm": 0.3860686866132896, + "learning_rate": 1.1305633104645033e-05, + "loss": 0.7929, + "num_tokens": 35494633850.0, + "step": 8492 + }, + { + "epoch": 1.0092691622103387, + "grad_norm": 0.31030494273681103, + "learning_rate": 1.1303902132639262e-05, + "loss": 0.8505, + "num_tokens": 35498813105.0, + "step": 8493 + }, + { + "epoch": 1.0093879976232918, + "grad_norm": 0.461824246622731, + "learning_rate": 1.1302171149378967e-05, + "loss": 0.8463, + "num_tokens": 35502969027.0, + "step": 8494 + }, + { + "epoch": 1.0095068330362449, + "grad_norm": 0.32581191616074107, + "learning_rate": 1.1300440154928252e-05, + "loss": 0.8496, + "num_tokens": 35507159065.0, + "step": 8495 + }, + { + "epoch": 1.0096256684491978, + "grad_norm": 0.4244896540442428, + "learning_rate": 1.1298709149351221e-05, + "loss": 0.8497, + "num_tokens": 35511349108.0, + "step": 8496 + }, + { + "epoch": 1.0097445038621509, + "grad_norm": 0.3287050876442674, + "learning_rate": 1.1296978132711982e-05, + "loss": 0.8522, + "num_tokens": 35515539121.0, + "step": 8497 + }, + { + "epoch": 1.009863339275104, + "grad_norm": 0.3976766587679651, + "learning_rate": 1.1295247105074642e-05, + "loss": 0.843, + "num_tokens": 35519715436.0, + "step": 8498 + }, + { + "epoch": 1.009982174688057, + "grad_norm": 0.3416571106404122, + "learning_rate": 1.1293516066503298e-05, + "loss": 0.8524, + "num_tokens": 35523904844.0, + "step": 8499 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 0.4289948739696649, + "learning_rate": 1.1291785017062062e-05, + "loss": 0.8573, + "num_tokens": 35528093782.0, + "step": 8500 + }, + { + "epoch": 1.0102198455139633, + "grad_norm": 0.3596908620695912, + "learning_rate": 1.1290053956815043e-05, + "loss": 0.8104, + "num_tokens": 35532283501.0, + "step": 8501 + }, + { + "epoch": 1.0103386809269161, + "grad_norm": 0.41560307284963494, + "learning_rate": 1.1288322885826343e-05, + "loss": 0.8227, + "num_tokens": 35536420645.0, + "step": 8502 + }, + { + "epoch": 1.0104575163398692, + "grad_norm": 0.34178352566225706, + "learning_rate": 1.1286591804160077e-05, + "loss": 0.8121, + "num_tokens": 35540572351.0, + "step": 8503 + }, + { + "epoch": 1.0105763517528223, + "grad_norm": 0.4212458036241222, + "learning_rate": 1.1284860711880341e-05, + "loss": 0.8181, + "num_tokens": 35544762065.0, + "step": 8504 + }, + { + "epoch": 1.0106951871657754, + "grad_norm": 0.3515334916095643, + "learning_rate": 1.1283129609051251e-05, + "loss": 0.8232, + "num_tokens": 35548952791.0, + "step": 8505 + }, + { + "epoch": 1.0108140225787285, + "grad_norm": 0.4398946668110974, + "learning_rate": 1.1281398495736916e-05, + "loss": 0.8008, + "num_tokens": 35553131060.0, + "step": 8506 + }, + { + "epoch": 1.0109328579916814, + "grad_norm": 0.35026405332774724, + "learning_rate": 1.1279667372001444e-05, + "loss": 0.8164, + "num_tokens": 35557320962.0, + "step": 8507 + }, + { + "epoch": 1.0110516934046345, + "grad_norm": 0.47743762091734565, + "learning_rate": 1.1277936237908942e-05, + "loss": 0.8547, + "num_tokens": 35561511686.0, + "step": 8508 + }, + { + "epoch": 1.0111705288175876, + "grad_norm": 0.39377747049764983, + "learning_rate": 1.1276205093523521e-05, + "loss": 0.8542, + "num_tokens": 35565699313.0, + "step": 8509 + }, + { + "epoch": 1.0112893642305407, + "grad_norm": 0.4416959305858566, + "learning_rate": 1.1274473938909294e-05, + "loss": 0.8247, + "num_tokens": 35569888356.0, + "step": 8510 + }, + { + "epoch": 1.0114081996434938, + "grad_norm": 0.3973682050254047, + "learning_rate": 1.127274277413037e-05, + "loss": 0.8253, + "num_tokens": 35574076385.0, + "step": 8511 + }, + { + "epoch": 1.011527035056447, + "grad_norm": 0.3914519761767902, + "learning_rate": 1.1271011599250851e-05, + "loss": 0.8335, + "num_tokens": 35578242481.0, + "step": 8512 + }, + { + "epoch": 1.0116458704693998, + "grad_norm": 0.3630304590440199, + "learning_rate": 1.1269280414334863e-05, + "loss": 0.8178, + "num_tokens": 35582379863.0, + "step": 8513 + }, + { + "epoch": 1.011764705882353, + "grad_norm": 0.3794724684337091, + "learning_rate": 1.1267549219446512e-05, + "loss": 0.8317, + "num_tokens": 35586550527.0, + "step": 8514 + }, + { + "epoch": 1.011883541295306, + "grad_norm": 0.36752654907989046, + "learning_rate": 1.1265818014649901e-05, + "loss": 0.8365, + "num_tokens": 35590741153.0, + "step": 8515 + }, + { + "epoch": 1.012002376708259, + "grad_norm": 0.33960037287124134, + "learning_rate": 1.1264086800009157e-05, + "loss": 0.8162, + "num_tokens": 35594929660.0, + "step": 8516 + }, + { + "epoch": 1.0121212121212122, + "grad_norm": 0.3917277217999268, + "learning_rate": 1.1262355575588386e-05, + "loss": 0.8191, + "num_tokens": 35599119077.0, + "step": 8517 + }, + { + "epoch": 1.012240047534165, + "grad_norm": 0.3502472531991415, + "learning_rate": 1.1260624341451696e-05, + "loss": 0.8681, + "num_tokens": 35603275255.0, + "step": 8518 + }, + { + "epoch": 1.0123588829471182, + "grad_norm": 0.3225883128754744, + "learning_rate": 1.1258893097663211e-05, + "loss": 0.8482, + "num_tokens": 35607463655.0, + "step": 8519 + }, + { + "epoch": 1.0124777183600713, + "grad_norm": 0.34235963417430765, + "learning_rate": 1.1257161844287037e-05, + "loss": 0.831, + "num_tokens": 35611638217.0, + "step": 8520 + }, + { + "epoch": 1.0125965537730244, + "grad_norm": 0.3411629505490047, + "learning_rate": 1.125543058138729e-05, + "loss": 0.8784, + "num_tokens": 35615820298.0, + "step": 8521 + }, + { + "epoch": 1.0127153891859775, + "grad_norm": 0.32454753601983805, + "learning_rate": 1.1253699309028086e-05, + "loss": 0.8755, + "num_tokens": 35620010212.0, + "step": 8522 + }, + { + "epoch": 1.0128342245989306, + "grad_norm": 0.33035691736070516, + "learning_rate": 1.1251968027273537e-05, + "loss": 0.8387, + "num_tokens": 35624198213.0, + "step": 8523 + }, + { + "epoch": 1.0129530600118835, + "grad_norm": 0.3319437182267773, + "learning_rate": 1.1250236736187759e-05, + "loss": 0.8126, + "num_tokens": 35628388327.0, + "step": 8524 + }, + { + "epoch": 1.0130718954248366, + "grad_norm": 0.33962780340413606, + "learning_rate": 1.1248505435834872e-05, + "loss": 0.8456, + "num_tokens": 35632560138.0, + "step": 8525 + }, + { + "epoch": 1.0131907308377897, + "grad_norm": 0.34311736717476177, + "learning_rate": 1.1246774126278985e-05, + "loss": 0.833, + "num_tokens": 35636748248.0, + "step": 8526 + }, + { + "epoch": 1.0133095662507428, + "grad_norm": 0.3241240469557521, + "learning_rate": 1.124504280758422e-05, + "loss": 0.8333, + "num_tokens": 35640936514.0, + "step": 8527 + }, + { + "epoch": 1.0134284016636959, + "grad_norm": 0.3424798095434278, + "learning_rate": 1.1243311479814691e-05, + "loss": 0.8357, + "num_tokens": 35645125520.0, + "step": 8528 + }, + { + "epoch": 1.0135472370766487, + "grad_norm": 0.41807016954553555, + "learning_rate": 1.1241580143034518e-05, + "loss": 0.8383, + "num_tokens": 35649285980.0, + "step": 8529 + }, + { + "epoch": 1.0136660724896018, + "grad_norm": 0.39467147147251497, + "learning_rate": 1.1239848797307814e-05, + "loss": 0.8067, + "num_tokens": 35653469657.0, + "step": 8530 + }, + { + "epoch": 1.013784907902555, + "grad_norm": 0.3201837694272726, + "learning_rate": 1.1238117442698693e-05, + "loss": 0.8209, + "num_tokens": 35657652945.0, + "step": 8531 + }, + { + "epoch": 1.013903743315508, + "grad_norm": 0.34117586872742606, + "learning_rate": 1.1236386079271284e-05, + "loss": 0.8288, + "num_tokens": 35661832522.0, + "step": 8532 + }, + { + "epoch": 1.0140225787284611, + "grad_norm": 0.400831687138398, + "learning_rate": 1.1234654707089698e-05, + "loss": 0.8066, + "num_tokens": 35666022385.0, + "step": 8533 + }, + { + "epoch": 1.0141414141414142, + "grad_norm": 0.39471632810317553, + "learning_rate": 1.1232923326218053e-05, + "loss": 0.815, + "num_tokens": 35670205200.0, + "step": 8534 + }, + { + "epoch": 1.014260249554367, + "grad_norm": 0.3071162220111459, + "learning_rate": 1.123119193672047e-05, + "loss": 0.8113, + "num_tokens": 35674395643.0, + "step": 8535 + }, + { + "epoch": 1.0143790849673202, + "grad_norm": 0.38186945982250114, + "learning_rate": 1.122946053866107e-05, + "loss": 0.8483, + "num_tokens": 35678551336.0, + "step": 8536 + }, + { + "epoch": 1.0144979203802733, + "grad_norm": 0.3307080419676186, + "learning_rate": 1.1227729132103969e-05, + "loss": 0.8359, + "num_tokens": 35682741187.0, + "step": 8537 + }, + { + "epoch": 1.0146167557932264, + "grad_norm": 0.39582920135341587, + "learning_rate": 1.1225997717113288e-05, + "loss": 0.7975, + "num_tokens": 35686911991.0, + "step": 8538 + }, + { + "epoch": 1.0147355912061795, + "grad_norm": 0.3772805565910721, + "learning_rate": 1.1224266293753146e-05, + "loss": 0.8578, + "num_tokens": 35691101665.0, + "step": 8539 + }, + { + "epoch": 1.0148544266191326, + "grad_norm": 0.2965759906105497, + "learning_rate": 1.1222534862087666e-05, + "loss": 0.8573, + "num_tokens": 35695290291.0, + "step": 8540 + }, + { + "epoch": 1.0149732620320855, + "grad_norm": 0.4485892238095403, + "learning_rate": 1.122080342218097e-05, + "loss": 0.8168, + "num_tokens": 35699451122.0, + "step": 8541 + }, + { + "epoch": 1.0150920974450386, + "grad_norm": 0.3793456164358678, + "learning_rate": 1.1219071974097173e-05, + "loss": 0.8398, + "num_tokens": 35703640644.0, + "step": 8542 + }, + { + "epoch": 1.0152109328579917, + "grad_norm": 0.34940799468791295, + "learning_rate": 1.1217340517900404e-05, + "loss": 0.8294, + "num_tokens": 35707788881.0, + "step": 8543 + }, + { + "epoch": 1.0153297682709448, + "grad_norm": 0.4640168393386607, + "learning_rate": 1.121560905365478e-05, + "loss": 0.8106, + "num_tokens": 35711977187.0, + "step": 8544 + }, + { + "epoch": 1.0154486036838979, + "grad_norm": 0.33979552634558435, + "learning_rate": 1.1213877581424421e-05, + "loss": 0.8234, + "num_tokens": 35716121958.0, + "step": 8545 + }, + { + "epoch": 1.0155674390968508, + "grad_norm": 0.41694302927677007, + "learning_rate": 1.1212146101273456e-05, + "loss": 0.8184, + "num_tokens": 35720302887.0, + "step": 8546 + }, + { + "epoch": 1.0156862745098039, + "grad_norm": 0.4434651121975402, + "learning_rate": 1.1210414613265998e-05, + "loss": 0.8322, + "num_tokens": 35724493868.0, + "step": 8547 + }, + { + "epoch": 1.015805109922757, + "grad_norm": 0.37077655558978334, + "learning_rate": 1.1208683117466182e-05, + "loss": 0.858, + "num_tokens": 35728681466.0, + "step": 8548 + }, + { + "epoch": 1.01592394533571, + "grad_norm": 0.3636759129395452, + "learning_rate": 1.1206951613938119e-05, + "loss": 0.8405, + "num_tokens": 35732852857.0, + "step": 8549 + }, + { + "epoch": 1.0160427807486632, + "grad_norm": 0.4440236001578628, + "learning_rate": 1.1205220102745939e-05, + "loss": 0.8498, + "num_tokens": 35737041562.0, + "step": 8550 + }, + { + "epoch": 1.0161616161616163, + "grad_norm": 0.35674371487549, + "learning_rate": 1.1203488583953768e-05, + "loss": 0.8134, + "num_tokens": 35741230820.0, + "step": 8551 + }, + { + "epoch": 1.0162804515745691, + "grad_norm": 0.4097641866202346, + "learning_rate": 1.1201757057625724e-05, + "loss": 0.8458, + "num_tokens": 35745405536.0, + "step": 8552 + }, + { + "epoch": 1.0163992869875222, + "grad_norm": 0.36784285089088237, + "learning_rate": 1.1200025523825935e-05, + "loss": 0.8462, + "num_tokens": 35749595374.0, + "step": 8553 + }, + { + "epoch": 1.0165181224004753, + "grad_norm": 0.3629330624524379, + "learning_rate": 1.1198293982618524e-05, + "loss": 0.7774, + "num_tokens": 35753784714.0, + "step": 8554 + }, + { + "epoch": 1.0166369578134284, + "grad_norm": 0.3098075928238679, + "learning_rate": 1.1196562434067616e-05, + "loss": 0.826, + "num_tokens": 35757974862.0, + "step": 8555 + }, + { + "epoch": 1.0167557932263815, + "grad_norm": 0.4114617604624029, + "learning_rate": 1.1194830878237336e-05, + "loss": 0.8027, + "num_tokens": 35762164259.0, + "step": 8556 + }, + { + "epoch": 1.0168746286393344, + "grad_norm": 0.3730980533063351, + "learning_rate": 1.119309931519181e-05, + "loss": 0.8052, + "num_tokens": 35766354701.0, + "step": 8557 + }, + { + "epoch": 1.0169934640522875, + "grad_norm": 0.3103151106679506, + "learning_rate": 1.1191367744995168e-05, + "loss": 0.8156, + "num_tokens": 35770543965.0, + "step": 8558 + }, + { + "epoch": 1.0171122994652406, + "grad_norm": 0.34624708856165864, + "learning_rate": 1.1189636167711526e-05, + "loss": 0.8389, + "num_tokens": 35774732263.0, + "step": 8559 + }, + { + "epoch": 1.0172311348781937, + "grad_norm": 0.33046267404027985, + "learning_rate": 1.1187904583405016e-05, + "loss": 0.8598, + "num_tokens": 35778922469.0, + "step": 8560 + }, + { + "epoch": 1.0173499702911468, + "grad_norm": 0.4061089731196289, + "learning_rate": 1.1186172992139767e-05, + "loss": 0.8275, + "num_tokens": 35783111400.0, + "step": 8561 + }, + { + "epoch": 1.0174688057041, + "grad_norm": 0.3151509552231205, + "learning_rate": 1.1184441393979899e-05, + "loss": 0.8041, + "num_tokens": 35787248977.0, + "step": 8562 + }, + { + "epoch": 1.0175876411170528, + "grad_norm": 0.4775886673735257, + "learning_rate": 1.1182709788989545e-05, + "loss": 0.7908, + "num_tokens": 35791439010.0, + "step": 8563 + }, + { + "epoch": 1.017706476530006, + "grad_norm": 0.36610409001821265, + "learning_rate": 1.118097817723283e-05, + "loss": 0.8361, + "num_tokens": 35795627233.0, + "step": 8564 + }, + { + "epoch": 1.017825311942959, + "grad_norm": 0.39954615123613557, + "learning_rate": 1.117924655877388e-05, + "loss": 0.808, + "num_tokens": 35799817771.0, + "step": 8565 + }, + { + "epoch": 1.017944147355912, + "grad_norm": 0.41476707387868345, + "learning_rate": 1.1177514933676826e-05, + "loss": 0.8356, + "num_tokens": 35804007452.0, + "step": 8566 + }, + { + "epoch": 1.0180629827688652, + "grad_norm": 0.4067831200707959, + "learning_rate": 1.1175783302005792e-05, + "loss": 0.8308, + "num_tokens": 35808156714.0, + "step": 8567 + }, + { + "epoch": 1.018181818181818, + "grad_norm": 0.3702597159815601, + "learning_rate": 1.1174051663824908e-05, + "loss": 0.8208, + "num_tokens": 35812316575.0, + "step": 8568 + }, + { + "epoch": 1.0183006535947712, + "grad_norm": 0.35836488965188473, + "learning_rate": 1.1172320019198301e-05, + "loss": 0.7987, + "num_tokens": 35816506066.0, + "step": 8569 + }, + { + "epoch": 1.0184194890077243, + "grad_norm": 0.3690149595011988, + "learning_rate": 1.1170588368190103e-05, + "loss": 0.8512, + "num_tokens": 35820663839.0, + "step": 8570 + }, + { + "epoch": 1.0185383244206774, + "grad_norm": 0.3205414657065811, + "learning_rate": 1.1168856710864443e-05, + "loss": 0.826, + "num_tokens": 35824838097.0, + "step": 8571 + }, + { + "epoch": 1.0186571598336305, + "grad_norm": 0.43982997048666483, + "learning_rate": 1.1167125047285443e-05, + "loss": 0.8149, + "num_tokens": 35829003235.0, + "step": 8572 + }, + { + "epoch": 1.0187759952465836, + "grad_norm": 0.32009736832938973, + "learning_rate": 1.1165393377517239e-05, + "loss": 0.8525, + "num_tokens": 35833159214.0, + "step": 8573 + }, + { + "epoch": 1.0188948306595365, + "grad_norm": 0.4232355434557555, + "learning_rate": 1.1163661701623965e-05, + "loss": 0.8709, + "num_tokens": 35837347370.0, + "step": 8574 + }, + { + "epoch": 1.0190136660724896, + "grad_norm": 0.3996227899668582, + "learning_rate": 1.1161930019669737e-05, + "loss": 0.8242, + "num_tokens": 35841534515.0, + "step": 8575 + }, + { + "epoch": 1.0191325014854427, + "grad_norm": 0.39768702762763364, + "learning_rate": 1.1160198331718697e-05, + "loss": 0.8051, + "num_tokens": 35845704892.0, + "step": 8576 + }, + { + "epoch": 1.0192513368983958, + "grad_norm": 0.34528164395318633, + "learning_rate": 1.1158466637834971e-05, + "loss": 0.8528, + "num_tokens": 35849889469.0, + "step": 8577 + }, + { + "epoch": 1.0193701723113489, + "grad_norm": 0.4074786975371716, + "learning_rate": 1.1156734938082683e-05, + "loss": 0.8128, + "num_tokens": 35854012568.0, + "step": 8578 + }, + { + "epoch": 1.0194890077243017, + "grad_norm": 0.3223259971663585, + "learning_rate": 1.1155003232525978e-05, + "loss": 0.8134, + "num_tokens": 35858184860.0, + "step": 8579 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 0.30113907887407315, + "learning_rate": 1.115327152122898e-05, + "loss": 0.8007, + "num_tokens": 35862374144.0, + "step": 8580 + }, + { + "epoch": 1.019726678550208, + "grad_norm": 0.3089063452395909, + "learning_rate": 1.1151539804255813e-05, + "loss": 0.7863, + "num_tokens": 35866563212.0, + "step": 8581 + }, + { + "epoch": 1.019845513963161, + "grad_norm": 0.31148482084699, + "learning_rate": 1.1149808081670622e-05, + "loss": 0.834, + "num_tokens": 35870746552.0, + "step": 8582 + }, + { + "epoch": 1.0199643493761141, + "grad_norm": 0.34544666888259085, + "learning_rate": 1.1148076353537526e-05, + "loss": 0.7955, + "num_tokens": 35874936377.0, + "step": 8583 + }, + { + "epoch": 1.0200831847890672, + "grad_norm": 0.4023854249672849, + "learning_rate": 1.1146344619920663e-05, + "loss": 0.8305, + "num_tokens": 35879116080.0, + "step": 8584 + }, + { + "epoch": 1.02020202020202, + "grad_norm": 0.3594167603136863, + "learning_rate": 1.1144612880884163e-05, + "loss": 0.844, + "num_tokens": 35883293519.0, + "step": 8585 + }, + { + "epoch": 1.0203208556149732, + "grad_norm": 0.3166790272356487, + "learning_rate": 1.114288113649216e-05, + "loss": 0.8284, + "num_tokens": 35887484016.0, + "step": 8586 + }, + { + "epoch": 1.0204396910279263, + "grad_norm": 0.35297599850826106, + "learning_rate": 1.1141149386808788e-05, + "loss": 0.7934, + "num_tokens": 35891642207.0, + "step": 8587 + }, + { + "epoch": 1.0205585264408794, + "grad_norm": 0.4271411784479763, + "learning_rate": 1.1139417631898173e-05, + "loss": 0.8214, + "num_tokens": 35895820698.0, + "step": 8588 + }, + { + "epoch": 1.0206773618538325, + "grad_norm": 0.32885248531281536, + "learning_rate": 1.1137685871824455e-05, + "loss": 0.8175, + "num_tokens": 35900008750.0, + "step": 8589 + }, + { + "epoch": 1.0207961972667856, + "grad_norm": 0.37414141093581404, + "learning_rate": 1.113595410665176e-05, + "loss": 0.8314, + "num_tokens": 35904197525.0, + "step": 8590 + }, + { + "epoch": 1.0209150326797385, + "grad_norm": 0.36609574349794954, + "learning_rate": 1.1134222336444225e-05, + "loss": 0.8271, + "num_tokens": 35908367476.0, + "step": 8591 + }, + { + "epoch": 1.0210338680926916, + "grad_norm": 0.4111976376236607, + "learning_rate": 1.1132490561265988e-05, + "loss": 0.819, + "num_tokens": 35912556986.0, + "step": 8592 + }, + { + "epoch": 1.0211527035056447, + "grad_norm": 0.3204628957153114, + "learning_rate": 1.1130758781181172e-05, + "loss": 0.7606, + "num_tokens": 35916747355.0, + "step": 8593 + }, + { + "epoch": 1.0212715389185978, + "grad_norm": 0.42873166943993196, + "learning_rate": 1.112902699625392e-05, + "loss": 0.8528, + "num_tokens": 35920937384.0, + "step": 8594 + }, + { + "epoch": 1.0213903743315509, + "grad_norm": 0.3540205386089486, + "learning_rate": 1.1127295206548361e-05, + "loss": 0.8069, + "num_tokens": 35925126655.0, + "step": 8595 + }, + { + "epoch": 1.0215092097445038, + "grad_norm": 0.3142709944987759, + "learning_rate": 1.112556341212863e-05, + "loss": 0.8485, + "num_tokens": 35929276236.0, + "step": 8596 + }, + { + "epoch": 1.0216280451574569, + "grad_norm": 0.40476742213534417, + "learning_rate": 1.1123831613058859e-05, + "loss": 0.8294, + "num_tokens": 35933466145.0, + "step": 8597 + }, + { + "epoch": 1.02174688057041, + "grad_norm": 0.35364091367656697, + "learning_rate": 1.1122099809403189e-05, + "loss": 0.8261, + "num_tokens": 35937595543.0, + "step": 8598 + }, + { + "epoch": 1.021865715983363, + "grad_norm": 0.3886181109006216, + "learning_rate": 1.1120368001225745e-05, + "loss": 0.8541, + "num_tokens": 35941782891.0, + "step": 8599 + }, + { + "epoch": 1.0219845513963162, + "grad_norm": 0.3136509615856153, + "learning_rate": 1.1118636188590672e-05, + "loss": 0.8109, + "num_tokens": 35945973424.0, + "step": 8600 + }, + { + "epoch": 1.0221033868092693, + "grad_norm": 0.37556022112342663, + "learning_rate": 1.1116904371562098e-05, + "loss": 0.8372, + "num_tokens": 35950162120.0, + "step": 8601 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 0.3520646163563956, + "learning_rate": 1.111517255020416e-05, + "loss": 0.8219, + "num_tokens": 35954338982.0, + "step": 8602 + }, + { + "epoch": 1.0223410576351752, + "grad_norm": 0.3005843746860627, + "learning_rate": 1.1113440724580994e-05, + "loss": 0.8311, + "num_tokens": 35958527576.0, + "step": 8603 + }, + { + "epoch": 1.0224598930481283, + "grad_norm": 0.3369386805782695, + "learning_rate": 1.111170889475673e-05, + "loss": 0.8443, + "num_tokens": 35962717595.0, + "step": 8604 + }, + { + "epoch": 1.0225787284610814, + "grad_norm": 0.35136115567059745, + "learning_rate": 1.1109977060795511e-05, + "loss": 0.7999, + "num_tokens": 35966907929.0, + "step": 8605 + }, + { + "epoch": 1.0226975638740345, + "grad_norm": 0.34519058691838833, + "learning_rate": 1.110824522276147e-05, + "loss": 0.7879, + "num_tokens": 35971088633.0, + "step": 8606 + }, + { + "epoch": 1.0228163992869874, + "grad_norm": 0.3222367993296464, + "learning_rate": 1.1106513380718745e-05, + "loss": 0.8258, + "num_tokens": 35975276907.0, + "step": 8607 + }, + { + "epoch": 1.0229352346999405, + "grad_norm": 0.3155020274529047, + "learning_rate": 1.1104781534731463e-05, + "loss": 0.8308, + "num_tokens": 35979465200.0, + "step": 8608 + }, + { + "epoch": 1.0230540701128936, + "grad_norm": 0.3123941542085991, + "learning_rate": 1.110304968486377e-05, + "loss": 0.8308, + "num_tokens": 35983625016.0, + "step": 8609 + }, + { + "epoch": 1.0231729055258467, + "grad_norm": 0.37070602435305755, + "learning_rate": 1.11013178311798e-05, + "loss": 0.8057, + "num_tokens": 35987789293.0, + "step": 8610 + }, + { + "epoch": 1.0232917409387998, + "grad_norm": 0.4152318762208367, + "learning_rate": 1.1099585973743688e-05, + "loss": 0.8375, + "num_tokens": 35991978320.0, + "step": 8611 + }, + { + "epoch": 1.023410576351753, + "grad_norm": 0.3163010180816573, + "learning_rate": 1.1097854112619568e-05, + "loss": 0.818, + "num_tokens": 35996135137.0, + "step": 8612 + }, + { + "epoch": 1.0235294117647058, + "grad_norm": 0.4131110859793427, + "learning_rate": 1.1096122247871582e-05, + "loss": 0.7934, + "num_tokens": 36000324903.0, + "step": 8613 + }, + { + "epoch": 1.023648247177659, + "grad_norm": 0.35644330336118685, + "learning_rate": 1.1094390379563862e-05, + "loss": 0.7804, + "num_tokens": 36004513490.0, + "step": 8614 + }, + { + "epoch": 1.023767082590612, + "grad_norm": 0.44904277141832083, + "learning_rate": 1.109265850776055e-05, + "loss": 0.8374, + "num_tokens": 36008702949.0, + "step": 8615 + }, + { + "epoch": 1.023885918003565, + "grad_norm": 0.3300937423348004, + "learning_rate": 1.1090926632525779e-05, + "loss": 0.8531, + "num_tokens": 36012891675.0, + "step": 8616 + }, + { + "epoch": 1.0240047534165182, + "grad_norm": 0.4692134808814767, + "learning_rate": 1.1089194753923681e-05, + "loss": 0.8222, + "num_tokens": 36017079598.0, + "step": 8617 + }, + { + "epoch": 1.024123588829471, + "grad_norm": 0.3738728680364345, + "learning_rate": 1.1087462872018411e-05, + "loss": 0.8351, + "num_tokens": 36021268746.0, + "step": 8618 + }, + { + "epoch": 1.0242424242424242, + "grad_norm": 0.3567573024788329, + "learning_rate": 1.1085730986874089e-05, + "loss": 0.826, + "num_tokens": 36025442110.0, + "step": 8619 + }, + { + "epoch": 1.0243612596553773, + "grad_norm": 0.3979136291668386, + "learning_rate": 1.1083999098554857e-05, + "loss": 0.8748, + "num_tokens": 36029610751.0, + "step": 8620 + }, + { + "epoch": 1.0244800950683304, + "grad_norm": 0.40166654623744363, + "learning_rate": 1.1082267207124857e-05, + "loss": 0.8347, + "num_tokens": 36033781900.0, + "step": 8621 + }, + { + "epoch": 1.0245989304812835, + "grad_norm": 0.3102825003051397, + "learning_rate": 1.1080535312648224e-05, + "loss": 0.8062, + "num_tokens": 36037943832.0, + "step": 8622 + }, + { + "epoch": 1.0247177658942366, + "grad_norm": 0.35627605596477296, + "learning_rate": 1.1078803415189094e-05, + "loss": 0.8423, + "num_tokens": 36042106242.0, + "step": 8623 + }, + { + "epoch": 1.0248366013071895, + "grad_norm": 0.3745807299556312, + "learning_rate": 1.1077071514811611e-05, + "loss": 0.7996, + "num_tokens": 36046295004.0, + "step": 8624 + }, + { + "epoch": 1.0249554367201426, + "grad_norm": 0.3846364622297977, + "learning_rate": 1.1075339611579908e-05, + "loss": 0.8063, + "num_tokens": 36050483683.0, + "step": 8625 + }, + { + "epoch": 1.0250742721330957, + "grad_norm": 0.3278606598960515, + "learning_rate": 1.107360770555812e-05, + "loss": 0.8395, + "num_tokens": 36054671555.0, + "step": 8626 + }, + { + "epoch": 1.0251931075460488, + "grad_norm": 0.35557317381375947, + "learning_rate": 1.1071875796810397e-05, + "loss": 0.8114, + "num_tokens": 36058860299.0, + "step": 8627 + }, + { + "epoch": 1.0253119429590019, + "grad_norm": 0.35922951005327736, + "learning_rate": 1.1070143885400863e-05, + "loss": 0.8325, + "num_tokens": 36063048806.0, + "step": 8628 + }, + { + "epoch": 1.025430778371955, + "grad_norm": 0.3146538991225106, + "learning_rate": 1.106841197139367e-05, + "loss": 0.8119, + "num_tokens": 36067238674.0, + "step": 8629 + }, + { + "epoch": 1.0255496137849078, + "grad_norm": 0.3436914313389919, + "learning_rate": 1.1066680054852948e-05, + "loss": 0.8055, + "num_tokens": 36071428224.0, + "step": 8630 + }, + { + "epoch": 1.025668449197861, + "grad_norm": 0.33440817048997745, + "learning_rate": 1.1064948135842839e-05, + "loss": 0.8031, + "num_tokens": 36075588208.0, + "step": 8631 + }, + { + "epoch": 1.025787284610814, + "grad_norm": 0.31622422301570674, + "learning_rate": 1.1063216214427476e-05, + "loss": 0.8386, + "num_tokens": 36079766637.0, + "step": 8632 + }, + { + "epoch": 1.0259061200237671, + "grad_norm": 0.3188920298793099, + "learning_rate": 1.1061484290671008e-05, + "loss": 0.8033, + "num_tokens": 36083955982.0, + "step": 8633 + }, + { + "epoch": 1.0260249554367202, + "grad_norm": 0.3291962700367269, + "learning_rate": 1.1059752364637568e-05, + "loss": 0.8384, + "num_tokens": 36088144508.0, + "step": 8634 + }, + { + "epoch": 1.026143790849673, + "grad_norm": 0.3169441182312931, + "learning_rate": 1.1058020436391296e-05, + "loss": 0.7862, + "num_tokens": 36092314084.0, + "step": 8635 + }, + { + "epoch": 1.0262626262626262, + "grad_norm": 0.3713889439199637, + "learning_rate": 1.1056288505996329e-05, + "loss": 0.8386, + "num_tokens": 36096446272.0, + "step": 8636 + }, + { + "epoch": 1.0263814616755793, + "grad_norm": 0.3252480732344291, + "learning_rate": 1.1054556573516812e-05, + "loss": 0.8219, + "num_tokens": 36100634085.0, + "step": 8637 + }, + { + "epoch": 1.0265002970885324, + "grad_norm": 0.364121453649393, + "learning_rate": 1.1052824639016878e-05, + "loss": 0.8388, + "num_tokens": 36104779270.0, + "step": 8638 + }, + { + "epoch": 1.0266191325014855, + "grad_norm": 0.29978488620995125, + "learning_rate": 1.1051092702560668e-05, + "loss": 0.8569, + "num_tokens": 36108967904.0, + "step": 8639 + }, + { + "epoch": 1.0267379679144386, + "grad_norm": 0.30434285242863196, + "learning_rate": 1.1049360764212324e-05, + "loss": 0.8445, + "num_tokens": 36113154931.0, + "step": 8640 + }, + { + "epoch": 1.0268568033273915, + "grad_norm": 0.31029658020221973, + "learning_rate": 1.1047628824035983e-05, + "loss": 0.8174, + "num_tokens": 36117344645.0, + "step": 8641 + }, + { + "epoch": 1.0269756387403446, + "grad_norm": 0.31748632894880646, + "learning_rate": 1.1045896882095787e-05, + "loss": 0.848, + "num_tokens": 36121504507.0, + "step": 8642 + }, + { + "epoch": 1.0270944741532977, + "grad_norm": 0.338718113012566, + "learning_rate": 1.1044164938455873e-05, + "loss": 0.7914, + "num_tokens": 36125652195.0, + "step": 8643 + }, + { + "epoch": 1.0272133095662508, + "grad_norm": 0.319105234194249, + "learning_rate": 1.1042432993180385e-05, + "loss": 0.8156, + "num_tokens": 36129815815.0, + "step": 8644 + }, + { + "epoch": 1.0273321449792039, + "grad_norm": 0.3356996196675388, + "learning_rate": 1.1040701046333455e-05, + "loss": 0.8425, + "num_tokens": 36133997028.0, + "step": 8645 + }, + { + "epoch": 1.0274509803921568, + "grad_norm": 0.32783333065000403, + "learning_rate": 1.103896909797923e-05, + "loss": 0.8153, + "num_tokens": 36138186254.0, + "step": 8646 + }, + { + "epoch": 1.0275698158051099, + "grad_norm": 0.31445646905476254, + "learning_rate": 1.1037237148181849e-05, + "loss": 0.8064, + "num_tokens": 36142375054.0, + "step": 8647 + }, + { + "epoch": 1.027688651218063, + "grad_norm": 0.4323403816369641, + "learning_rate": 1.1035505197005447e-05, + "loss": 0.8381, + "num_tokens": 36146558065.0, + "step": 8648 + }, + { + "epoch": 1.027807486631016, + "grad_norm": 0.3162275446301852, + "learning_rate": 1.1033773244514174e-05, + "loss": 0.8321, + "num_tokens": 36150725826.0, + "step": 8649 + }, + { + "epoch": 1.0279263220439692, + "grad_norm": 0.42381289186235316, + "learning_rate": 1.1032041290772158e-05, + "loss": 0.8369, + "num_tokens": 36154915042.0, + "step": 8650 + }, + { + "epoch": 1.0280451574569223, + "grad_norm": 0.386886008964814, + "learning_rate": 1.1030309335843545e-05, + "loss": 0.8243, + "num_tokens": 36159103949.0, + "step": 8651 + }, + { + "epoch": 1.0281639928698751, + "grad_norm": 0.36784523260137025, + "learning_rate": 1.1028577379792477e-05, + "loss": 0.8342, + "num_tokens": 36163292361.0, + "step": 8652 + }, + { + "epoch": 1.0282828282828282, + "grad_norm": 0.39308032459264614, + "learning_rate": 1.102684542268309e-05, + "loss": 0.7965, + "num_tokens": 36167482280.0, + "step": 8653 + }, + { + "epoch": 1.0284016636957813, + "grad_norm": 0.3773566958055853, + "learning_rate": 1.1025113464579526e-05, + "loss": 0.8446, + "num_tokens": 36171671241.0, + "step": 8654 + }, + { + "epoch": 1.0285204991087344, + "grad_norm": 0.3707009808224756, + "learning_rate": 1.102338150554593e-05, + "loss": 0.8196, + "num_tokens": 36175854481.0, + "step": 8655 + }, + { + "epoch": 1.0286393345216875, + "grad_norm": 0.30777771255977127, + "learning_rate": 1.1021649545646433e-05, + "loss": 0.8171, + "num_tokens": 36180027516.0, + "step": 8656 + }, + { + "epoch": 1.0287581699346404, + "grad_norm": 0.35465860159040763, + "learning_rate": 1.101991758494518e-05, + "loss": 0.8497, + "num_tokens": 36184158987.0, + "step": 8657 + }, + { + "epoch": 1.0288770053475935, + "grad_norm": 0.4158917221302272, + "learning_rate": 1.1018185623506312e-05, + "loss": 0.8068, + "num_tokens": 36188317131.0, + "step": 8658 + }, + { + "epoch": 1.0289958407605466, + "grad_norm": 0.36037887740827146, + "learning_rate": 1.1016453661393971e-05, + "loss": 0.8143, + "num_tokens": 36192484540.0, + "step": 8659 + }, + { + "epoch": 1.0291146761734997, + "grad_norm": 0.40524644505708024, + "learning_rate": 1.1014721698672296e-05, + "loss": 0.8573, + "num_tokens": 36196674024.0, + "step": 8660 + }, + { + "epoch": 1.0292335115864528, + "grad_norm": 0.3775416797682012, + "learning_rate": 1.1012989735405422e-05, + "loss": 0.8683, + "num_tokens": 36200847912.0, + "step": 8661 + }, + { + "epoch": 1.029352346999406, + "grad_norm": 0.4144633150826339, + "learning_rate": 1.1011257771657498e-05, + "loss": 0.8597, + "num_tokens": 36205036364.0, + "step": 8662 + }, + { + "epoch": 1.0294711824123588, + "grad_norm": 0.3196298865306966, + "learning_rate": 1.100952580749266e-05, + "loss": 0.819, + "num_tokens": 36209226597.0, + "step": 8663 + }, + { + "epoch": 1.029590017825312, + "grad_norm": 0.4371000256176818, + "learning_rate": 1.1007793842975047e-05, + "loss": 0.8321, + "num_tokens": 36213400996.0, + "step": 8664 + }, + { + "epoch": 1.029708853238265, + "grad_norm": 0.32191515320931596, + "learning_rate": 1.1006061878168806e-05, + "loss": 0.8214, + "num_tokens": 36217586588.0, + "step": 8665 + }, + { + "epoch": 1.029827688651218, + "grad_norm": 0.47384807436612514, + "learning_rate": 1.100432991313807e-05, + "loss": 0.8324, + "num_tokens": 36221776565.0, + "step": 8666 + }, + { + "epoch": 1.0299465240641712, + "grad_norm": 0.3236869935429605, + "learning_rate": 1.1002597947946983e-05, + "loss": 0.8288, + "num_tokens": 36225965420.0, + "step": 8667 + }, + { + "epoch": 1.030065359477124, + "grad_norm": 0.46357022067885123, + "learning_rate": 1.1000865982659688e-05, + "loss": 0.7669, + "num_tokens": 36230153665.0, + "step": 8668 + }, + { + "epoch": 1.0301841948900772, + "grad_norm": 0.3638853431900635, + "learning_rate": 1.0999134017340315e-05, + "loss": 0.8562, + "num_tokens": 36234322352.0, + "step": 8669 + }, + { + "epoch": 1.0303030303030303, + "grad_norm": 0.41843202617726794, + "learning_rate": 1.0997402052053019e-05, + "loss": 0.8302, + "num_tokens": 36238511402.0, + "step": 8670 + }, + { + "epoch": 1.0304218657159834, + "grad_norm": 0.36739775773206346, + "learning_rate": 1.0995670086861935e-05, + "loss": 0.8335, + "num_tokens": 36242701666.0, + "step": 8671 + }, + { + "epoch": 1.0305407011289365, + "grad_norm": 0.42567325637269293, + "learning_rate": 1.0993938121831197e-05, + "loss": 0.8359, + "num_tokens": 36246876483.0, + "step": 8672 + }, + { + "epoch": 1.0306595365418896, + "grad_norm": 0.32575787298818343, + "learning_rate": 1.0992206157024956e-05, + "loss": 0.8013, + "num_tokens": 36251034136.0, + "step": 8673 + }, + { + "epoch": 1.0307783719548425, + "grad_norm": 0.4164020805037647, + "learning_rate": 1.0990474192507346e-05, + "loss": 0.7828, + "num_tokens": 36255217534.0, + "step": 8674 + }, + { + "epoch": 1.0308972073677956, + "grad_norm": 0.35008135241821337, + "learning_rate": 1.0988742228342503e-05, + "loss": 0.8085, + "num_tokens": 36259405853.0, + "step": 8675 + }, + { + "epoch": 1.0310160427807487, + "grad_norm": 0.41140927024241397, + "learning_rate": 1.0987010264594579e-05, + "loss": 0.7998, + "num_tokens": 36263594357.0, + "step": 8676 + }, + { + "epoch": 1.0311348781937018, + "grad_norm": 0.3412920208481129, + "learning_rate": 1.0985278301327708e-05, + "loss": 0.8204, + "num_tokens": 36267782455.0, + "step": 8677 + }, + { + "epoch": 1.0312537136066549, + "grad_norm": 0.43190958098172466, + "learning_rate": 1.0983546338606032e-05, + "loss": 0.8306, + "num_tokens": 36271955867.0, + "step": 8678 + }, + { + "epoch": 1.0313725490196077, + "grad_norm": 0.37714758562197714, + "learning_rate": 1.098181437649369e-05, + "loss": 0.8318, + "num_tokens": 36276145931.0, + "step": 8679 + }, + { + "epoch": 1.0314913844325608, + "grad_norm": 0.39375205996060925, + "learning_rate": 1.0980082415054822e-05, + "loss": 0.823, + "num_tokens": 36280335439.0, + "step": 8680 + }, + { + "epoch": 1.031610219845514, + "grad_norm": 0.39145026535977745, + "learning_rate": 1.0978350454353572e-05, + "loss": 0.8207, + "num_tokens": 36284510122.0, + "step": 8681 + }, + { + "epoch": 1.031729055258467, + "grad_norm": 0.3787392226079064, + "learning_rate": 1.0976618494454074e-05, + "loss": 0.8534, + "num_tokens": 36288701078.0, + "step": 8682 + }, + { + "epoch": 1.0318478906714201, + "grad_norm": 0.37400541102403206, + "learning_rate": 1.0974886535420477e-05, + "loss": 0.8435, + "num_tokens": 36292890981.0, + "step": 8683 + }, + { + "epoch": 1.0319667260843732, + "grad_norm": 0.41784454780614544, + "learning_rate": 1.0973154577316914e-05, + "loss": 0.8313, + "num_tokens": 36297081130.0, + "step": 8684 + }, + { + "epoch": 1.032085561497326, + "grad_norm": 0.344311280392877, + "learning_rate": 1.0971422620207526e-05, + "loss": 0.8353, + "num_tokens": 36301270488.0, + "step": 8685 + }, + { + "epoch": 1.0322043969102792, + "grad_norm": 0.41646696712744696, + "learning_rate": 1.0969690664156459e-05, + "loss": 0.8223, + "num_tokens": 36305460234.0, + "step": 8686 + }, + { + "epoch": 1.0323232323232323, + "grad_norm": 0.35688833397967223, + "learning_rate": 1.0967958709227847e-05, + "loss": 0.8486, + "num_tokens": 36309640130.0, + "step": 8687 + }, + { + "epoch": 1.0324420677361854, + "grad_norm": 0.4015359506698111, + "learning_rate": 1.0966226755485831e-05, + "loss": 0.8124, + "num_tokens": 36313829718.0, + "step": 8688 + }, + { + "epoch": 1.0325609031491385, + "grad_norm": 0.34936616324186953, + "learning_rate": 1.0964494802994552e-05, + "loss": 0.8479, + "num_tokens": 36318017376.0, + "step": 8689 + }, + { + "epoch": 1.0326797385620916, + "grad_norm": 0.446821293967529, + "learning_rate": 1.0962762851818152e-05, + "loss": 0.8224, + "num_tokens": 36322186090.0, + "step": 8690 + }, + { + "epoch": 1.0327985739750445, + "grad_norm": 0.3661267731668459, + "learning_rate": 1.0961030902020772e-05, + "loss": 0.8247, + "num_tokens": 36326374619.0, + "step": 8691 + }, + { + "epoch": 1.0329174093879976, + "grad_norm": 0.4473345609776657, + "learning_rate": 1.0959298953666548e-05, + "loss": 0.8075, + "num_tokens": 36330537480.0, + "step": 8692 + }, + { + "epoch": 1.0330362448009507, + "grad_norm": 0.41511331771045556, + "learning_rate": 1.0957567006819621e-05, + "loss": 0.8171, + "num_tokens": 36334726862.0, + "step": 8693 + }, + { + "epoch": 1.0331550802139038, + "grad_norm": 0.3568303134620665, + "learning_rate": 1.0955835061544127e-05, + "loss": 0.8076, + "num_tokens": 36338914857.0, + "step": 8694 + }, + { + "epoch": 1.0332739156268569, + "grad_norm": 0.4057710135311363, + "learning_rate": 1.0954103117904214e-05, + "loss": 0.8283, + "num_tokens": 36343103638.0, + "step": 8695 + }, + { + "epoch": 1.0333927510398098, + "grad_norm": 0.34607042194377, + "learning_rate": 1.095237117596402e-05, + "loss": 0.8231, + "num_tokens": 36347293682.0, + "step": 8696 + }, + { + "epoch": 1.0335115864527629, + "grad_norm": 0.36038994996532303, + "learning_rate": 1.0950639235787679e-05, + "loss": 0.8113, + "num_tokens": 36351483043.0, + "step": 8697 + }, + { + "epoch": 1.033630421865716, + "grad_norm": 0.3017944575272293, + "learning_rate": 1.0948907297439335e-05, + "loss": 0.8278, + "num_tokens": 36355640651.0, + "step": 8698 + }, + { + "epoch": 1.033749257278669, + "grad_norm": 0.31874870404592953, + "learning_rate": 1.0947175360983129e-05, + "loss": 0.8253, + "num_tokens": 36359811859.0, + "step": 8699 + }, + { + "epoch": 1.0338680926916222, + "grad_norm": 0.34629902881232416, + "learning_rate": 1.0945443426483196e-05, + "loss": 0.8411, + "num_tokens": 36364001705.0, + "step": 8700 + }, + { + "epoch": 1.0339869281045753, + "grad_norm": 0.41741182174340186, + "learning_rate": 1.0943711494003672e-05, + "loss": 0.8536, + "num_tokens": 36368191919.0, + "step": 8701 + }, + { + "epoch": 1.0341057635175281, + "grad_norm": 0.30894410067180816, + "learning_rate": 1.0941979563608707e-05, + "loss": 0.8137, + "num_tokens": 36372379992.0, + "step": 8702 + }, + { + "epoch": 1.0342245989304812, + "grad_norm": 0.4307157726477333, + "learning_rate": 1.0940247635362435e-05, + "loss": 0.7852, + "num_tokens": 36376563645.0, + "step": 8703 + }, + { + "epoch": 1.0343434343434343, + "grad_norm": 0.39016919610827966, + "learning_rate": 1.0938515709328997e-05, + "loss": 0.8262, + "num_tokens": 36380724773.0, + "step": 8704 + }, + { + "epoch": 1.0344622697563874, + "grad_norm": 0.39294820718820295, + "learning_rate": 1.0936783785572527e-05, + "loss": 0.8033, + "num_tokens": 36384914575.0, + "step": 8705 + }, + { + "epoch": 1.0345811051693405, + "grad_norm": 0.33855551621688207, + "learning_rate": 1.0935051864157167e-05, + "loss": 0.8275, + "num_tokens": 36389087979.0, + "step": 8706 + }, + { + "epoch": 1.0346999405822934, + "grad_norm": 0.45846071188026044, + "learning_rate": 1.0933319945147053e-05, + "loss": 0.8354, + "num_tokens": 36393276765.0, + "step": 8707 + }, + { + "epoch": 1.0348187759952465, + "grad_norm": 0.3244114120404431, + "learning_rate": 1.0931588028606333e-05, + "loss": 0.8417, + "num_tokens": 36397464453.0, + "step": 8708 + }, + { + "epoch": 1.0349376114081996, + "grad_norm": 0.4482123373955067, + "learning_rate": 1.092985611459914e-05, + "loss": 0.8085, + "num_tokens": 36401653542.0, + "step": 8709 + }, + { + "epoch": 1.0350564468211527, + "grad_norm": 0.38080683049985137, + "learning_rate": 1.0928124203189608e-05, + "loss": 0.8315, + "num_tokens": 36405843137.0, + "step": 8710 + }, + { + "epoch": 1.0351752822341058, + "grad_norm": 0.34914651900973115, + "learning_rate": 1.092639229444188e-05, + "loss": 0.8122, + "num_tokens": 36410032936.0, + "step": 8711 + }, + { + "epoch": 1.035294117647059, + "grad_norm": 0.41820479049942166, + "learning_rate": 1.0924660388420098e-05, + "loss": 0.7867, + "num_tokens": 36414203633.0, + "step": 8712 + }, + { + "epoch": 1.0354129530600118, + "grad_norm": 0.3478026748603335, + "learning_rate": 1.0922928485188395e-05, + "loss": 0.7978, + "num_tokens": 36418393563.0, + "step": 8713 + }, + { + "epoch": 1.035531788472965, + "grad_norm": 0.3396742137919582, + "learning_rate": 1.0921196584810907e-05, + "loss": 0.8206, + "num_tokens": 36422583097.0, + "step": 8714 + }, + { + "epoch": 1.035650623885918, + "grad_norm": 0.3524832519860611, + "learning_rate": 1.091946468735178e-05, + "loss": 0.8339, + "num_tokens": 36426772947.0, + "step": 8715 + }, + { + "epoch": 1.035769459298871, + "grad_norm": 0.34464553498823614, + "learning_rate": 1.0917732792875146e-05, + "loss": 0.8592, + "num_tokens": 36430934902.0, + "step": 8716 + }, + { + "epoch": 1.0358882947118242, + "grad_norm": 0.3321832399975095, + "learning_rate": 1.0916000901445148e-05, + "loss": 0.8254, + "num_tokens": 36435099729.0, + "step": 8717 + }, + { + "epoch": 1.0360071301247773, + "grad_norm": 0.3336268213575249, + "learning_rate": 1.0914269013125917e-05, + "loss": 0.7947, + "num_tokens": 36439261982.0, + "step": 8718 + }, + { + "epoch": 1.0361259655377302, + "grad_norm": 0.35602574413594257, + "learning_rate": 1.0912537127981595e-05, + "loss": 0.8353, + "num_tokens": 36443451949.0, + "step": 8719 + }, + { + "epoch": 1.0362448009506833, + "grad_norm": 0.35283092151922446, + "learning_rate": 1.0910805246076316e-05, + "loss": 0.8231, + "num_tokens": 36447642049.0, + "step": 8720 + }, + { + "epoch": 1.0363636363636364, + "grad_norm": 0.3388418644034125, + "learning_rate": 1.0909073367474226e-05, + "loss": 0.8359, + "num_tokens": 36451833177.0, + "step": 8721 + }, + { + "epoch": 1.0364824717765895, + "grad_norm": 0.318139839915421, + "learning_rate": 1.0907341492239455e-05, + "loss": 0.8409, + "num_tokens": 36456023450.0, + "step": 8722 + }, + { + "epoch": 1.0366013071895426, + "grad_norm": 0.3256308130064924, + "learning_rate": 1.0905609620436142e-05, + "loss": 0.7895, + "num_tokens": 36460177736.0, + "step": 8723 + }, + { + "epoch": 1.0367201426024955, + "grad_norm": 0.35777078443799076, + "learning_rate": 1.0903877752128423e-05, + "loss": 0.8857, + "num_tokens": 36464367172.0, + "step": 8724 + }, + { + "epoch": 1.0368389780154486, + "grad_norm": 0.35006081750607293, + "learning_rate": 1.0902145887380435e-05, + "loss": 0.8139, + "num_tokens": 36468542180.0, + "step": 8725 + }, + { + "epoch": 1.0369578134284017, + "grad_norm": 0.37245476656702353, + "learning_rate": 1.0900414026256314e-05, + "loss": 0.8083, + "num_tokens": 36472699721.0, + "step": 8726 + }, + { + "epoch": 1.0370766488413548, + "grad_norm": 0.29899915200286364, + "learning_rate": 1.0898682168820202e-05, + "loss": 0.8697, + "num_tokens": 36476888793.0, + "step": 8727 + }, + { + "epoch": 1.0371954842543079, + "grad_norm": 0.44435462819471383, + "learning_rate": 1.0896950315136232e-05, + "loss": 0.8448, + "num_tokens": 36481077441.0, + "step": 8728 + }, + { + "epoch": 1.037314319667261, + "grad_norm": 0.3427649734082655, + "learning_rate": 1.0895218465268538e-05, + "loss": 0.8517, + "num_tokens": 36485265686.0, + "step": 8729 + }, + { + "epoch": 1.0374331550802138, + "grad_norm": 0.45492622243283115, + "learning_rate": 1.089348661928126e-05, + "loss": 0.8302, + "num_tokens": 36489453312.0, + "step": 8730 + }, + { + "epoch": 1.037551990493167, + "grad_norm": 0.3516934801637025, + "learning_rate": 1.0891754777238534e-05, + "loss": 0.8398, + "num_tokens": 36493624055.0, + "step": 8731 + }, + { + "epoch": 1.03767082590612, + "grad_norm": 0.47002886974278263, + "learning_rate": 1.0890022939204492e-05, + "loss": 0.8331, + "num_tokens": 36497804614.0, + "step": 8732 + }, + { + "epoch": 1.0377896613190731, + "grad_norm": 0.37295481533620706, + "learning_rate": 1.0888291105243272e-05, + "loss": 0.7994, + "num_tokens": 36501994162.0, + "step": 8733 + }, + { + "epoch": 1.0379084967320262, + "grad_norm": 0.43339497021413703, + "learning_rate": 1.088655927541901e-05, + "loss": 0.8229, + "num_tokens": 36506183123.0, + "step": 8734 + }, + { + "epoch": 1.038027332144979, + "grad_norm": 0.38575915275442957, + "learning_rate": 1.0884827449795844e-05, + "loss": 0.7756, + "num_tokens": 36510360376.0, + "step": 8735 + }, + { + "epoch": 1.0381461675579322, + "grad_norm": 0.39956787013386186, + "learning_rate": 1.0883095628437908e-05, + "loss": 0.8447, + "num_tokens": 36514550358.0, + "step": 8736 + }, + { + "epoch": 1.0382650029708853, + "grad_norm": 0.3597840891970381, + "learning_rate": 1.0881363811409331e-05, + "loss": 0.8112, + "num_tokens": 36518739313.0, + "step": 8737 + }, + { + "epoch": 1.0383838383838384, + "grad_norm": 0.3596899143636593, + "learning_rate": 1.0879631998774256e-05, + "loss": 0.8127, + "num_tokens": 36522928914.0, + "step": 8738 + }, + { + "epoch": 1.0385026737967915, + "grad_norm": 0.4236144240355067, + "learning_rate": 1.0877900190596812e-05, + "loss": 0.8341, + "num_tokens": 36527118474.0, + "step": 8739 + }, + { + "epoch": 1.0386215092097446, + "grad_norm": 0.3742002937506821, + "learning_rate": 1.0876168386941144e-05, + "loss": 0.8395, + "num_tokens": 36531230820.0, + "step": 8740 + }, + { + "epoch": 1.0387403446226975, + "grad_norm": 0.3727172694894371, + "learning_rate": 1.0874436587871376e-05, + "loss": 0.8083, + "num_tokens": 36535419816.0, + "step": 8741 + }, + { + "epoch": 1.0388591800356506, + "grad_norm": 0.4176523666849806, + "learning_rate": 1.0872704793451643e-05, + "loss": 0.8249, + "num_tokens": 36539608580.0, + "step": 8742 + }, + { + "epoch": 1.0389780154486037, + "grad_norm": 0.3731312499797092, + "learning_rate": 1.0870973003746084e-05, + "loss": 0.8491, + "num_tokens": 36543798515.0, + "step": 8743 + }, + { + "epoch": 1.0390968508615568, + "grad_norm": 0.31902555359309864, + "learning_rate": 1.086924121881883e-05, + "loss": 0.8437, + "num_tokens": 36547986260.0, + "step": 8744 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 0.3860368256085568, + "learning_rate": 1.0867509438734017e-05, + "loss": 0.8368, + "num_tokens": 36552154780.0, + "step": 8745 + }, + { + "epoch": 1.0393345216874628, + "grad_norm": 0.2851711609872105, + "learning_rate": 1.0865777663555776e-05, + "loss": 0.8754, + "num_tokens": 36556342524.0, + "step": 8746 + }, + { + "epoch": 1.0394533571004159, + "grad_norm": 0.3934624614173415, + "learning_rate": 1.0864045893348242e-05, + "loss": 0.8024, + "num_tokens": 36560531224.0, + "step": 8747 + }, + { + "epoch": 1.039572192513369, + "grad_norm": 0.33499696068673024, + "learning_rate": 1.086231412817555e-05, + "loss": 0.8303, + "num_tokens": 36564721529.0, + "step": 8748 + }, + { + "epoch": 1.039691027926322, + "grad_norm": 0.3548050688205848, + "learning_rate": 1.0860582368101831e-05, + "loss": 0.7992, + "num_tokens": 36568901103.0, + "step": 8749 + }, + { + "epoch": 1.0398098633392752, + "grad_norm": 0.3478114358530327, + "learning_rate": 1.0858850613191218e-05, + "loss": 0.8095, + "num_tokens": 36573074333.0, + "step": 8750 + }, + { + "epoch": 1.0399286987522283, + "grad_norm": 0.3537828086032247, + "learning_rate": 1.0857118863507846e-05, + "loss": 0.7743, + "num_tokens": 36577265370.0, + "step": 8751 + }, + { + "epoch": 1.0400475341651811, + "grad_norm": 0.30217823156012164, + "learning_rate": 1.0855387119115838e-05, + "loss": 0.8387, + "num_tokens": 36581456948.0, + "step": 8752 + }, + { + "epoch": 1.0401663695781342, + "grad_norm": 0.3386464531831844, + "learning_rate": 1.085365538007934e-05, + "loss": 0.8175, + "num_tokens": 36585646294.0, + "step": 8753 + }, + { + "epoch": 1.0402852049910873, + "grad_norm": 0.3547297398813574, + "learning_rate": 1.0851923646462479e-05, + "loss": 0.8228, + "num_tokens": 36589835881.0, + "step": 8754 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.377161508905233, + "learning_rate": 1.0850191918329384e-05, + "loss": 0.8037, + "num_tokens": 36594023595.0, + "step": 8755 + }, + { + "epoch": 1.0405228758169935, + "grad_norm": 0.34470249466028074, + "learning_rate": 1.084846019574419e-05, + "loss": 0.8158, + "num_tokens": 36598211241.0, + "step": 8756 + }, + { + "epoch": 1.0406417112299464, + "grad_norm": 0.38061305058350314, + "learning_rate": 1.0846728478771027e-05, + "loss": 0.8673, + "num_tokens": 36602376450.0, + "step": 8757 + }, + { + "epoch": 1.0407605466428995, + "grad_norm": 0.3784609042721301, + "learning_rate": 1.0844996767474024e-05, + "loss": 0.8343, + "num_tokens": 36606551986.0, + "step": 8758 + }, + { + "epoch": 1.0408793820558526, + "grad_norm": 0.3675528698004945, + "learning_rate": 1.0843265061917317e-05, + "loss": 0.7999, + "num_tokens": 36610706273.0, + "step": 8759 + }, + { + "epoch": 1.0409982174688057, + "grad_norm": 0.3133226832394376, + "learning_rate": 1.0841533362165035e-05, + "loss": 0.8054, + "num_tokens": 36614896172.0, + "step": 8760 + }, + { + "epoch": 1.0411170528817588, + "grad_norm": 0.3447503974495319, + "learning_rate": 1.0839801668281306e-05, + "loss": 0.8336, + "num_tokens": 36619069495.0, + "step": 8761 + }, + { + "epoch": 1.041235888294712, + "grad_norm": 0.44450574830436596, + "learning_rate": 1.0838069980330269e-05, + "loss": 0.8209, + "num_tokens": 36623244472.0, + "step": 8762 + }, + { + "epoch": 1.0413547237076648, + "grad_norm": 0.32828370487785635, + "learning_rate": 1.0836338298376043e-05, + "loss": 0.8047, + "num_tokens": 36627416951.0, + "step": 8763 + }, + { + "epoch": 1.041473559120618, + "grad_norm": 0.40816932025767383, + "learning_rate": 1.0834606622482762e-05, + "loss": 0.8437, + "num_tokens": 36631606671.0, + "step": 8764 + }, + { + "epoch": 1.041592394533571, + "grad_norm": 0.4007641984927203, + "learning_rate": 1.0832874952714558e-05, + "loss": 0.8056, + "num_tokens": 36635765757.0, + "step": 8765 + }, + { + "epoch": 1.041711229946524, + "grad_norm": 0.35915303515827346, + "learning_rate": 1.0831143289135562e-05, + "loss": 0.8135, + "num_tokens": 36639955854.0, + "step": 8766 + }, + { + "epoch": 1.0418300653594772, + "grad_norm": 0.341878143680976, + "learning_rate": 1.08294116318099e-05, + "loss": 0.8101, + "num_tokens": 36644092618.0, + "step": 8767 + }, + { + "epoch": 1.04194890077243, + "grad_norm": 0.42853774939669126, + "learning_rate": 1.0827679980801702e-05, + "loss": 0.8139, + "num_tokens": 36648280187.0, + "step": 8768 + }, + { + "epoch": 1.0420677361853832, + "grad_norm": 0.36308265263962936, + "learning_rate": 1.0825948336175097e-05, + "loss": 0.8286, + "num_tokens": 36652438056.0, + "step": 8769 + }, + { + "epoch": 1.0421865715983363, + "grad_norm": 0.32075771783346113, + "learning_rate": 1.0824216697994215e-05, + "loss": 0.8357, + "num_tokens": 36656628682.0, + "step": 8770 + }, + { + "epoch": 1.0423054070112894, + "grad_norm": 0.4930282561357124, + "learning_rate": 1.0822485066323177e-05, + "loss": 0.833, + "num_tokens": 36660741640.0, + "step": 8771 + }, + { + "epoch": 1.0424242424242425, + "grad_norm": 0.32518918313347334, + "learning_rate": 1.0820753441226121e-05, + "loss": 0.8305, + "num_tokens": 36664931116.0, + "step": 8772 + }, + { + "epoch": 1.0425430778371956, + "grad_norm": 0.4414519624547888, + "learning_rate": 1.0819021822767173e-05, + "loss": 0.8398, + "num_tokens": 36669076549.0, + "step": 8773 + }, + { + "epoch": 1.0426619132501485, + "grad_norm": 0.40344390586233714, + "learning_rate": 1.0817290211010456e-05, + "loss": 0.8277, + "num_tokens": 36673267111.0, + "step": 8774 + }, + { + "epoch": 1.0427807486631016, + "grad_norm": 0.3688242934451131, + "learning_rate": 1.0815558606020102e-05, + "loss": 0.7993, + "num_tokens": 36677456127.0, + "step": 8775 + }, + { + "epoch": 1.0428995840760547, + "grad_norm": 0.32310117323253423, + "learning_rate": 1.0813827007860237e-05, + "loss": 0.8366, + "num_tokens": 36681644814.0, + "step": 8776 + }, + { + "epoch": 1.0430184194890078, + "grad_norm": 0.3991433351839339, + "learning_rate": 1.0812095416594985e-05, + "loss": 0.8302, + "num_tokens": 36685808089.0, + "step": 8777 + }, + { + "epoch": 1.0431372549019609, + "grad_norm": 0.33327431674737495, + "learning_rate": 1.0810363832288478e-05, + "loss": 0.8484, + "num_tokens": 36689998772.0, + "step": 8778 + }, + { + "epoch": 1.0432560903149137, + "grad_norm": 0.306413280569872, + "learning_rate": 1.0808632255004837e-05, + "loss": 0.8315, + "num_tokens": 36694158584.0, + "step": 8779 + }, + { + "epoch": 1.0433749257278668, + "grad_norm": 0.33717662636939627, + "learning_rate": 1.0806900684808192e-05, + "loss": 0.8454, + "num_tokens": 36698329124.0, + "step": 8780 + }, + { + "epoch": 1.04349376114082, + "grad_norm": 0.343655824294017, + "learning_rate": 1.0805169121762666e-05, + "loss": 0.8292, + "num_tokens": 36702488560.0, + "step": 8781 + }, + { + "epoch": 1.043612596553773, + "grad_norm": 0.35527952434815946, + "learning_rate": 1.0803437565932389e-05, + "loss": 0.7961, + "num_tokens": 36706676620.0, + "step": 8782 + }, + { + "epoch": 1.0437314319667261, + "grad_norm": 0.358672065519992, + "learning_rate": 1.0801706017381479e-05, + "loss": 0.806, + "num_tokens": 36710839173.0, + "step": 8783 + }, + { + "epoch": 1.0438502673796792, + "grad_norm": 0.3282042837147095, + "learning_rate": 1.0799974476174066e-05, + "loss": 0.8222, + "num_tokens": 36715028784.0, + "step": 8784 + }, + { + "epoch": 1.043969102792632, + "grad_norm": 0.27017585188657023, + "learning_rate": 1.0798242942374279e-05, + "loss": 0.8664, + "num_tokens": 36719217159.0, + "step": 8785 + }, + { + "epoch": 1.0440879382055852, + "grad_norm": 0.4105020250574245, + "learning_rate": 1.0796511416046235e-05, + "loss": 0.8321, + "num_tokens": 36723406997.0, + "step": 8786 + }, + { + "epoch": 1.0442067736185383, + "grad_norm": 0.33575420282692525, + "learning_rate": 1.0794779897254062e-05, + "loss": 0.8423, + "num_tokens": 36727595801.0, + "step": 8787 + }, + { + "epoch": 1.0443256090314914, + "grad_norm": 0.3542979828343419, + "learning_rate": 1.0793048386061886e-05, + "loss": 0.8461, + "num_tokens": 36731776279.0, + "step": 8788 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 0.3675879017761067, + "learning_rate": 1.0791316882533826e-05, + "loss": 0.8038, + "num_tokens": 36735945608.0, + "step": 8789 + }, + { + "epoch": 1.0445632798573976, + "grad_norm": 0.383051102368575, + "learning_rate": 1.0789585386734003e-05, + "loss": 0.7967, + "num_tokens": 36740117219.0, + "step": 8790 + }, + { + "epoch": 1.0446821152703505, + "grad_norm": 0.34661090126054683, + "learning_rate": 1.0787853898726548e-05, + "loss": 0.8344, + "num_tokens": 36744301556.0, + "step": 8791 + }, + { + "epoch": 1.0448009506833036, + "grad_norm": 0.3612172982372546, + "learning_rate": 1.0786122418575582e-05, + "loss": 0.8466, + "num_tokens": 36748468156.0, + "step": 8792 + }, + { + "epoch": 1.0449197860962567, + "grad_norm": 0.32593757708914745, + "learning_rate": 1.0784390946345227e-05, + "loss": 0.8198, + "num_tokens": 36752635513.0, + "step": 8793 + }, + { + "epoch": 1.0450386215092098, + "grad_norm": 0.361180989437317, + "learning_rate": 1.07826594820996e-05, + "loss": 0.8307, + "num_tokens": 36756824544.0, + "step": 8794 + }, + { + "epoch": 1.0451574569221629, + "grad_norm": 0.35745084133587557, + "learning_rate": 1.0780928025902829e-05, + "loss": 0.8388, + "num_tokens": 36761014173.0, + "step": 8795 + }, + { + "epoch": 1.0452762923351158, + "grad_norm": 0.3363012799621784, + "learning_rate": 1.0779196577819032e-05, + "loss": 0.8024, + "num_tokens": 36765202834.0, + "step": 8796 + }, + { + "epoch": 1.0453951277480689, + "grad_norm": 0.32868903264518645, + "learning_rate": 1.0777465137912335e-05, + "loss": 0.8012, + "num_tokens": 36769356058.0, + "step": 8797 + }, + { + "epoch": 1.045513963161022, + "grad_norm": 0.3792489250763888, + "learning_rate": 1.0775733706246855e-05, + "loss": 0.8096, + "num_tokens": 36773544237.0, + "step": 8798 + }, + { + "epoch": 1.045632798573975, + "grad_norm": 0.34888083302775114, + "learning_rate": 1.0774002282886715e-05, + "loss": 0.8292, + "num_tokens": 36777734802.0, + "step": 8799 + }, + { + "epoch": 1.0457516339869282, + "grad_norm": 0.3183213181734346, + "learning_rate": 1.0772270867896036e-05, + "loss": 0.809, + "num_tokens": 36781924473.0, + "step": 8800 + }, + { + "epoch": 1.0458704693998813, + "grad_norm": 0.36519082067259906, + "learning_rate": 1.0770539461338936e-05, + "loss": 0.8319, + "num_tokens": 36786114122.0, + "step": 8801 + }, + { + "epoch": 1.0459893048128341, + "grad_norm": 0.3367506886647287, + "learning_rate": 1.0768808063279534e-05, + "loss": 0.8329, + "num_tokens": 36790288537.0, + "step": 8802 + }, + { + "epoch": 1.0461081402257872, + "grad_norm": 0.38718184277839957, + "learning_rate": 1.0767076673781951e-05, + "loss": 0.799, + "num_tokens": 36794477181.0, + "step": 8803 + }, + { + "epoch": 1.0462269756387403, + "grad_norm": 0.3627882600000752, + "learning_rate": 1.0765345292910305e-05, + "loss": 0.8449, + "num_tokens": 36798666201.0, + "step": 8804 + }, + { + "epoch": 1.0463458110516934, + "grad_norm": 0.3540124774264288, + "learning_rate": 1.0763613920728719e-05, + "loss": 0.8273, + "num_tokens": 36802854508.0, + "step": 8805 + }, + { + "epoch": 1.0464646464646465, + "grad_norm": 0.32225816585386796, + "learning_rate": 1.0761882557301308e-05, + "loss": 0.8477, + "num_tokens": 36807042849.0, + "step": 8806 + }, + { + "epoch": 1.0465834818775994, + "grad_norm": 0.4350725232071134, + "learning_rate": 1.0760151202692192e-05, + "loss": 0.7967, + "num_tokens": 36811232840.0, + "step": 8807 + }, + { + "epoch": 1.0467023172905525, + "grad_norm": 0.3230375575718352, + "learning_rate": 1.0758419856965488e-05, + "loss": 0.7929, + "num_tokens": 36815395805.0, + "step": 8808 + }, + { + "epoch": 1.0468211527035056, + "grad_norm": 0.41177657005716317, + "learning_rate": 1.075668852018531e-05, + "loss": 0.8221, + "num_tokens": 36819555919.0, + "step": 8809 + }, + { + "epoch": 1.0469399881164587, + "grad_norm": 0.34893753028669505, + "learning_rate": 1.075495719241578e-05, + "loss": 0.8026, + "num_tokens": 36823746890.0, + "step": 8810 + }, + { + "epoch": 1.0470588235294118, + "grad_norm": 0.43857143142665783, + "learning_rate": 1.0753225873721016e-05, + "loss": 0.8024, + "num_tokens": 36827923452.0, + "step": 8811 + }, + { + "epoch": 1.047177658942365, + "grad_norm": 0.3299922602699784, + "learning_rate": 1.0751494564165133e-05, + "loss": 0.8188, + "num_tokens": 36832111867.0, + "step": 8812 + }, + { + "epoch": 1.0472964943553178, + "grad_norm": 0.47504429827307637, + "learning_rate": 1.0749763263812242e-05, + "loss": 0.8489, + "num_tokens": 36836283000.0, + "step": 8813 + }, + { + "epoch": 1.047415329768271, + "grad_norm": 0.3356515882510668, + "learning_rate": 1.074803197272647e-05, + "loss": 0.8199, + "num_tokens": 36840471904.0, + "step": 8814 + }, + { + "epoch": 1.047534165181224, + "grad_norm": 0.4105682765636655, + "learning_rate": 1.0746300690971918e-05, + "loss": 0.8508, + "num_tokens": 36844661910.0, + "step": 8815 + }, + { + "epoch": 1.047653000594177, + "grad_norm": 0.34844180834193467, + "learning_rate": 1.0744569418612714e-05, + "loss": 0.8108, + "num_tokens": 36848849077.0, + "step": 8816 + }, + { + "epoch": 1.0477718360071302, + "grad_norm": 0.4099324084957549, + "learning_rate": 1.0742838155712967e-05, + "loss": 0.8134, + "num_tokens": 36853039668.0, + "step": 8817 + }, + { + "epoch": 1.0478906714200833, + "grad_norm": 0.32864533867129986, + "learning_rate": 1.0741106902336793e-05, + "loss": 0.7933, + "num_tokens": 36857197702.0, + "step": 8818 + }, + { + "epoch": 1.0480095068330362, + "grad_norm": 0.4147623361809451, + "learning_rate": 1.0739375658548309e-05, + "loss": 0.8363, + "num_tokens": 36861382534.0, + "step": 8819 + }, + { + "epoch": 1.0481283422459893, + "grad_norm": 0.35373887161977974, + "learning_rate": 1.0737644424411619e-05, + "loss": 0.7893, + "num_tokens": 36865544031.0, + "step": 8820 + }, + { + "epoch": 1.0482471776589424, + "grad_norm": 0.3745526808297342, + "learning_rate": 1.0735913199990846e-05, + "loss": 0.8216, + "num_tokens": 36869732086.0, + "step": 8821 + }, + { + "epoch": 1.0483660130718955, + "grad_norm": 0.3219084636311438, + "learning_rate": 1.0734181985350098e-05, + "loss": 0.805, + "num_tokens": 36873920618.0, + "step": 8822 + }, + { + "epoch": 1.0484848484848486, + "grad_norm": 0.38783569931593154, + "learning_rate": 1.0732450780553494e-05, + "loss": 0.8419, + "num_tokens": 36878103509.0, + "step": 8823 + }, + { + "epoch": 1.0486036838978015, + "grad_norm": 0.309725355066436, + "learning_rate": 1.0730719585665142e-05, + "loss": 0.776, + "num_tokens": 36882293369.0, + "step": 8824 + }, + { + "epoch": 1.0487225193107546, + "grad_norm": 0.40715747700890065, + "learning_rate": 1.0728988400749152e-05, + "loss": 0.7799, + "num_tokens": 36886460005.0, + "step": 8825 + }, + { + "epoch": 1.0488413547237077, + "grad_norm": 0.32424960475539627, + "learning_rate": 1.0727257225869638e-05, + "loss": 0.8778, + "num_tokens": 36890647676.0, + "step": 8826 + }, + { + "epoch": 1.0489601901366608, + "grad_norm": 0.42701214011912536, + "learning_rate": 1.0725526061090712e-05, + "loss": 0.8405, + "num_tokens": 36894837088.0, + "step": 8827 + }, + { + "epoch": 1.0490790255496139, + "grad_norm": 0.3821613496468235, + "learning_rate": 1.0723794906476479e-05, + "loss": 0.8522, + "num_tokens": 36899025449.0, + "step": 8828 + }, + { + "epoch": 1.049197860962567, + "grad_norm": 0.39656690271351897, + "learning_rate": 1.0722063762091061e-05, + "loss": 0.8164, + "num_tokens": 36903214003.0, + "step": 8829 + }, + { + "epoch": 1.0493166963755198, + "grad_norm": 0.4064657460874161, + "learning_rate": 1.0720332627998559e-05, + "loss": 0.8416, + "num_tokens": 36907403022.0, + "step": 8830 + }, + { + "epoch": 1.049435531788473, + "grad_norm": 0.3965287654932313, + "learning_rate": 1.0718601504263086e-05, + "loss": 0.8068, + "num_tokens": 36911592051.0, + "step": 8831 + }, + { + "epoch": 1.049554367201426, + "grad_norm": 0.3987525845983342, + "learning_rate": 1.0716870390948753e-05, + "loss": 0.8188, + "num_tokens": 36915774532.0, + "step": 8832 + }, + { + "epoch": 1.0496732026143791, + "grad_norm": 0.3590081826811084, + "learning_rate": 1.0715139288119665e-05, + "loss": 0.8517, + "num_tokens": 36919943854.0, + "step": 8833 + }, + { + "epoch": 1.0497920380273322, + "grad_norm": 0.3991870593554061, + "learning_rate": 1.0713408195839928e-05, + "loss": 0.8336, + "num_tokens": 36924112624.0, + "step": 8834 + }, + { + "epoch": 1.049910873440285, + "grad_norm": 0.3239982408956726, + "learning_rate": 1.0711677114173658e-05, + "loss": 0.8307, + "num_tokens": 36928242061.0, + "step": 8835 + }, + { + "epoch": 1.0500297088532382, + "grad_norm": 0.3861365756038335, + "learning_rate": 1.0709946043184958e-05, + "loss": 0.8089, + "num_tokens": 36932430773.0, + "step": 8836 + }, + { + "epoch": 1.0501485442661913, + "grad_norm": 0.3219662913480753, + "learning_rate": 1.070821498293794e-05, + "loss": 0.8165, + "num_tokens": 36936598421.0, + "step": 8837 + }, + { + "epoch": 1.0502673796791444, + "grad_norm": 0.5002563254323401, + "learning_rate": 1.0706483933496705e-05, + "loss": 0.8119, + "num_tokens": 36940786868.0, + "step": 8838 + }, + { + "epoch": 1.0503862150920975, + "grad_norm": 0.3590743683409118, + "learning_rate": 1.0704752894925364e-05, + "loss": 0.797, + "num_tokens": 36944948001.0, + "step": 8839 + }, + { + "epoch": 1.0505050505050506, + "grad_norm": 0.5172639361939554, + "learning_rate": 1.0703021867288019e-05, + "loss": 0.8261, + "num_tokens": 36949136739.0, + "step": 8840 + }, + { + "epoch": 1.0506238859180035, + "grad_norm": 0.42387166756585215, + "learning_rate": 1.070129085064878e-05, + "loss": 0.8373, + "num_tokens": 36953324998.0, + "step": 8841 + }, + { + "epoch": 1.0507427213309566, + "grad_norm": 0.4695346186690957, + "learning_rate": 1.0699559845071752e-05, + "loss": 0.8139, + "num_tokens": 36957512848.0, + "step": 8842 + }, + { + "epoch": 1.0508615567439097, + "grad_norm": 0.4760473739542786, + "learning_rate": 1.0697828850621037e-05, + "loss": 0.8621, + "num_tokens": 36961702418.0, + "step": 8843 + }, + { + "epoch": 1.0509803921568628, + "grad_norm": 0.41757172231803547, + "learning_rate": 1.0696097867360741e-05, + "loss": 0.8225, + "num_tokens": 36965892080.0, + "step": 8844 + }, + { + "epoch": 1.0510992275698159, + "grad_norm": 0.45901685802642284, + "learning_rate": 1.0694366895354973e-05, + "loss": 0.8106, + "num_tokens": 36970081739.0, + "step": 8845 + }, + { + "epoch": 1.0512180629827688, + "grad_norm": 0.3857934128188982, + "learning_rate": 1.0692635934667827e-05, + "loss": 0.8159, + "num_tokens": 36974259745.0, + "step": 8846 + }, + { + "epoch": 1.0513368983957219, + "grad_norm": 0.4726304635758217, + "learning_rate": 1.0690904985363411e-05, + "loss": 0.8501, + "num_tokens": 36978449324.0, + "step": 8847 + }, + { + "epoch": 1.051455733808675, + "grad_norm": 0.3929761401080904, + "learning_rate": 1.0689174047505832e-05, + "loss": 0.8262, + "num_tokens": 36982638950.0, + "step": 8848 + }, + { + "epoch": 1.051574569221628, + "grad_norm": 0.40491941869878423, + "learning_rate": 1.0687443121159186e-05, + "loss": 0.8118, + "num_tokens": 36986827823.0, + "step": 8849 + }, + { + "epoch": 1.0516934046345812, + "grad_norm": 0.4144484282856308, + "learning_rate": 1.0685712206387582e-05, + "loss": 0.8449, + "num_tokens": 36991017414.0, + "step": 8850 + }, + { + "epoch": 1.0518122400475343, + "grad_norm": 0.378753010670778, + "learning_rate": 1.0683981303255115e-05, + "loss": 0.8124, + "num_tokens": 36995192175.0, + "step": 8851 + }, + { + "epoch": 1.0519310754604871, + "grad_norm": 0.3206484875622215, + "learning_rate": 1.068225041182589e-05, + "loss": 0.8227, + "num_tokens": 36999351658.0, + "step": 8852 + }, + { + "epoch": 1.0520499108734402, + "grad_norm": 0.35842931722701504, + "learning_rate": 1.0680519532164001e-05, + "loss": 0.7992, + "num_tokens": 37003541360.0, + "step": 8853 + }, + { + "epoch": 1.0521687462863933, + "grad_norm": 0.32650408609804044, + "learning_rate": 1.0678788664333559e-05, + "loss": 0.8431, + "num_tokens": 37007721539.0, + "step": 8854 + }, + { + "epoch": 1.0522875816993464, + "grad_norm": 0.3392134164301133, + "learning_rate": 1.067705780839866e-05, + "loss": 0.8263, + "num_tokens": 37011909955.0, + "step": 8855 + }, + { + "epoch": 1.0524064171122995, + "grad_norm": 0.3121356867253765, + "learning_rate": 1.06753269644234e-05, + "loss": 0.8606, + "num_tokens": 37016098028.0, + "step": 8856 + }, + { + "epoch": 1.0525252525252524, + "grad_norm": 0.321845400828896, + "learning_rate": 1.0673596132471882e-05, + "loss": 0.8347, + "num_tokens": 37020260404.0, + "step": 8857 + }, + { + "epoch": 1.0526440879382055, + "grad_norm": 0.382980941287031, + "learning_rate": 1.06718653126082e-05, + "loss": 0.8367, + "num_tokens": 37024450176.0, + "step": 8858 + }, + { + "epoch": 1.0527629233511586, + "grad_norm": 0.33899593378482634, + "learning_rate": 1.0670134504896461e-05, + "loss": 0.8182, + "num_tokens": 37028609907.0, + "step": 8859 + }, + { + "epoch": 1.0528817587641117, + "grad_norm": 0.3155440298695975, + "learning_rate": 1.0668403709400748e-05, + "loss": 0.8346, + "num_tokens": 37032798534.0, + "step": 8860 + }, + { + "epoch": 1.0530005941770648, + "grad_norm": 0.37131746422865464, + "learning_rate": 1.0666672926185172e-05, + "loss": 0.806, + "num_tokens": 37036946249.0, + "step": 8861 + }, + { + "epoch": 1.053119429590018, + "grad_norm": 0.3363608538042741, + "learning_rate": 1.0664942155313821e-05, + "loss": 0.8395, + "num_tokens": 37041130781.0, + "step": 8862 + }, + { + "epoch": 1.0532382650029708, + "grad_norm": 0.37433888375585145, + "learning_rate": 1.0663211396850799e-05, + "loss": 0.8259, + "num_tokens": 37045318676.0, + "step": 8863 + }, + { + "epoch": 1.053357100415924, + "grad_norm": 0.3178477222822965, + "learning_rate": 1.06614806508602e-05, + "loss": 0.8514, + "num_tokens": 37049509037.0, + "step": 8864 + }, + { + "epoch": 1.053475935828877, + "grad_norm": 0.31595883838676064, + "learning_rate": 1.0659749917406111e-05, + "loss": 0.786, + "num_tokens": 37053698888.0, + "step": 8865 + }, + { + "epoch": 1.05359477124183, + "grad_norm": 0.40304387671335257, + "learning_rate": 1.0658019196552634e-05, + "loss": 0.8285, + "num_tokens": 37057887650.0, + "step": 8866 + }, + { + "epoch": 1.0537136066547832, + "grad_norm": 0.35120553319694897, + "learning_rate": 1.0656288488363861e-05, + "loss": 0.8552, + "num_tokens": 37062078742.0, + "step": 8867 + }, + { + "epoch": 1.053832442067736, + "grad_norm": 0.34564728229195024, + "learning_rate": 1.065455779290389e-05, + "loss": 0.8226, + "num_tokens": 37066267477.0, + "step": 8868 + }, + { + "epoch": 1.0539512774806892, + "grad_norm": 0.30598389478739957, + "learning_rate": 1.0652827110236815e-05, + "loss": 0.831, + "num_tokens": 37070456935.0, + "step": 8869 + }, + { + "epoch": 1.0540701128936423, + "grad_norm": 0.4009286810445893, + "learning_rate": 1.0651096440426722e-05, + "loss": 0.8282, + "num_tokens": 37074644906.0, + "step": 8870 + }, + { + "epoch": 1.0541889483065954, + "grad_norm": 0.3426732784503441, + "learning_rate": 1.0649365783537709e-05, + "loss": 0.7994, + "num_tokens": 37078833079.0, + "step": 8871 + }, + { + "epoch": 1.0543077837195485, + "grad_norm": 0.3071850334726734, + "learning_rate": 1.0647635139633868e-05, + "loss": 0.8416, + "num_tokens": 37082964313.0, + "step": 8872 + }, + { + "epoch": 1.0544266191325016, + "grad_norm": 0.34884557948778816, + "learning_rate": 1.0645904508779281e-05, + "loss": 0.7929, + "num_tokens": 37087154766.0, + "step": 8873 + }, + { + "epoch": 1.0545454545454545, + "grad_norm": 0.34792628420751015, + "learning_rate": 1.0644173891038054e-05, + "loss": 0.7926, + "num_tokens": 37091325864.0, + "step": 8874 + }, + { + "epoch": 1.0546642899584076, + "grad_norm": 0.3787176457362715, + "learning_rate": 1.064244328647427e-05, + "loss": 0.8257, + "num_tokens": 37095514984.0, + "step": 8875 + }, + { + "epoch": 1.0547831253713607, + "grad_norm": 0.3331283268798834, + "learning_rate": 1.064071269515202e-05, + "loss": 0.8039, + "num_tokens": 37099681904.0, + "step": 8876 + }, + { + "epoch": 1.0549019607843138, + "grad_norm": 0.35810321372587856, + "learning_rate": 1.0638982117135397e-05, + "loss": 0.8372, + "num_tokens": 37103847200.0, + "step": 8877 + }, + { + "epoch": 1.0550207961972669, + "grad_norm": 0.3383321153444227, + "learning_rate": 1.0637251552488483e-05, + "loss": 0.7844, + "num_tokens": 37108036757.0, + "step": 8878 + }, + { + "epoch": 1.05513963161022, + "grad_norm": 0.4005289217409367, + "learning_rate": 1.0635521001275368e-05, + "loss": 0.8195, + "num_tokens": 37112224658.0, + "step": 8879 + }, + { + "epoch": 1.0552584670231728, + "grad_norm": 0.30726185077006657, + "learning_rate": 1.0633790463560147e-05, + "loss": 0.8089, + "num_tokens": 37116403161.0, + "step": 8880 + }, + { + "epoch": 1.055377302436126, + "grad_norm": 0.35834999904720133, + "learning_rate": 1.06320599394069e-05, + "loss": 0.8162, + "num_tokens": 37120574447.0, + "step": 8881 + }, + { + "epoch": 1.055496137849079, + "grad_norm": 0.34688196913051816, + "learning_rate": 1.0630329428879722e-05, + "loss": 0.8487, + "num_tokens": 37124762239.0, + "step": 8882 + }, + { + "epoch": 1.0556149732620321, + "grad_norm": 0.3650377654547445, + "learning_rate": 1.0628598932042693e-05, + "loss": 0.8423, + "num_tokens": 37128950679.0, + "step": 8883 + }, + { + "epoch": 1.0557338086749852, + "grad_norm": 0.3053694244904652, + "learning_rate": 1.0626868448959902e-05, + "loss": 0.7975, + "num_tokens": 37133139114.0, + "step": 8884 + }, + { + "epoch": 1.055852644087938, + "grad_norm": 0.41344694640491647, + "learning_rate": 1.0625137979695431e-05, + "loss": 0.8542, + "num_tokens": 37137328728.0, + "step": 8885 + }, + { + "epoch": 1.0559714795008912, + "grad_norm": 0.3215286460946876, + "learning_rate": 1.0623407524313369e-05, + "loss": 0.8247, + "num_tokens": 37141504024.0, + "step": 8886 + }, + { + "epoch": 1.0560903149138443, + "grad_norm": 0.38238708822493006, + "learning_rate": 1.0621677082877802e-05, + "loss": 0.7965, + "num_tokens": 37145694291.0, + "step": 8887 + }, + { + "epoch": 1.0562091503267974, + "grad_norm": 0.38022269038086143, + "learning_rate": 1.0619946655452811e-05, + "loss": 0.8925, + "num_tokens": 37149883707.0, + "step": 8888 + }, + { + "epoch": 1.0563279857397505, + "grad_norm": 0.32556891337198157, + "learning_rate": 1.0618216242102479e-05, + "loss": 0.8136, + "num_tokens": 37154048858.0, + "step": 8889 + }, + { + "epoch": 1.0564468211527036, + "grad_norm": 0.39281026308987327, + "learning_rate": 1.0616485842890893e-05, + "loss": 0.8314, + "num_tokens": 37158237993.0, + "step": 8890 + }, + { + "epoch": 1.0565656565656565, + "grad_norm": 0.3399125221168555, + "learning_rate": 1.061475545788213e-05, + "loss": 0.8445, + "num_tokens": 37162377690.0, + "step": 8891 + }, + { + "epoch": 1.0566844919786096, + "grad_norm": 0.35253410603688234, + "learning_rate": 1.0613025087140274e-05, + "loss": 0.8457, + "num_tokens": 37166567084.0, + "step": 8892 + }, + { + "epoch": 1.0568033273915627, + "grad_norm": 0.4068503394756342, + "learning_rate": 1.061129473072941e-05, + "loss": 0.8258, + "num_tokens": 37170758731.0, + "step": 8893 + }, + { + "epoch": 1.0569221628045158, + "grad_norm": 0.35531638668440735, + "learning_rate": 1.0609564388713615e-05, + "loss": 0.8451, + "num_tokens": 37174946891.0, + "step": 8894 + }, + { + "epoch": 1.0570409982174689, + "grad_norm": 0.38176031233621116, + "learning_rate": 1.0607834061156971e-05, + "loss": 0.799, + "num_tokens": 37179136291.0, + "step": 8895 + }, + { + "epoch": 1.0571598336304218, + "grad_norm": 0.34225151768796575, + "learning_rate": 1.060610374812356e-05, + "loss": 0.8261, + "num_tokens": 37183324605.0, + "step": 8896 + }, + { + "epoch": 1.0572786690433749, + "grad_norm": 0.3849083535898103, + "learning_rate": 1.0604373449677456e-05, + "loss": 0.7865, + "num_tokens": 37187488007.0, + "step": 8897 + }, + { + "epoch": 1.057397504456328, + "grad_norm": 0.338778161333121, + "learning_rate": 1.060264316588274e-05, + "loss": 0.8516, + "num_tokens": 37191650663.0, + "step": 8898 + }, + { + "epoch": 1.057516339869281, + "grad_norm": 0.39603372732712994, + "learning_rate": 1.0600912896803495e-05, + "loss": 0.8105, + "num_tokens": 37195824066.0, + "step": 8899 + }, + { + "epoch": 1.0576351752822342, + "grad_norm": 0.33576115807044854, + "learning_rate": 1.0599182642503794e-05, + "loss": 0.8096, + "num_tokens": 37200012606.0, + "step": 8900 + }, + { + "epoch": 1.0577540106951873, + "grad_norm": 0.3903140589741418, + "learning_rate": 1.0597452403047713e-05, + "loss": 0.8157, + "num_tokens": 37204186863.0, + "step": 8901 + }, + { + "epoch": 1.0578728461081401, + "grad_norm": 0.34804791216599246, + "learning_rate": 1.0595722178499332e-05, + "loss": 0.8654, + "num_tokens": 37208377380.0, + "step": 8902 + }, + { + "epoch": 1.0579916815210932, + "grad_norm": 0.3096493778381202, + "learning_rate": 1.0593991968922726e-05, + "loss": 0.7883, + "num_tokens": 37212567190.0, + "step": 8903 + }, + { + "epoch": 1.0581105169340463, + "grad_norm": 0.36888053416624045, + "learning_rate": 1.0592261774381964e-05, + "loss": 0.83, + "num_tokens": 37216747260.0, + "step": 8904 + }, + { + "epoch": 1.0582293523469994, + "grad_norm": 0.29794757564383234, + "learning_rate": 1.0590531594941134e-05, + "loss": 0.8079, + "num_tokens": 37220936145.0, + "step": 8905 + }, + { + "epoch": 1.0583481877599525, + "grad_norm": 0.36890299567629153, + "learning_rate": 1.0588801430664302e-05, + "loss": 0.8408, + "num_tokens": 37225107705.0, + "step": 8906 + }, + { + "epoch": 1.0584670231729054, + "grad_norm": 0.34184870679017515, + "learning_rate": 1.0587071281615544e-05, + "loss": 0.8638, + "num_tokens": 37229296878.0, + "step": 8907 + }, + { + "epoch": 1.0585858585858585, + "grad_norm": 0.3425879378714172, + "learning_rate": 1.0585341147858932e-05, + "loss": 0.8413, + "num_tokens": 37233487162.0, + "step": 8908 + }, + { + "epoch": 1.0587046939988116, + "grad_norm": 0.3292195104824168, + "learning_rate": 1.058361102945854e-05, + "loss": 0.7952, + "num_tokens": 37237676691.0, + "step": 8909 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.31786545721354026, + "learning_rate": 1.0581880926478439e-05, + "loss": 0.8082, + "num_tokens": 37241839464.0, + "step": 8910 + }, + { + "epoch": 1.0589423648247178, + "grad_norm": 0.3494576983746323, + "learning_rate": 1.0580150838982696e-05, + "loss": 0.8496, + "num_tokens": 37246012232.0, + "step": 8911 + }, + { + "epoch": 1.059061200237671, + "grad_norm": 0.34568658097611754, + "learning_rate": 1.0578420767035391e-05, + "loss": 0.8323, + "num_tokens": 37250173158.0, + "step": 8912 + }, + { + "epoch": 1.0591800356506238, + "grad_norm": 0.33587785495144873, + "learning_rate": 1.0576690710700593e-05, + "loss": 0.817, + "num_tokens": 37254336821.0, + "step": 8913 + }, + { + "epoch": 1.059298871063577, + "grad_norm": 0.3263732266043654, + "learning_rate": 1.0574960670042367e-05, + "loss": 0.8037, + "num_tokens": 37258525152.0, + "step": 8914 + }, + { + "epoch": 1.05941770647653, + "grad_norm": 0.30978260545182157, + "learning_rate": 1.0573230645124783e-05, + "loss": 0.8276, + "num_tokens": 37262714022.0, + "step": 8915 + }, + { + "epoch": 1.059536541889483, + "grad_norm": 0.2831235060135965, + "learning_rate": 1.0571500636011916e-05, + "loss": 0.8081, + "num_tokens": 37266903016.0, + "step": 8916 + }, + { + "epoch": 1.0596553773024362, + "grad_norm": 0.37328659698808286, + "learning_rate": 1.0569770642767821e-05, + "loss": 0.8458, + "num_tokens": 37271067135.0, + "step": 8917 + }, + { + "epoch": 1.0597742127153893, + "grad_norm": 0.3453740888882183, + "learning_rate": 1.056804066545658e-05, + "loss": 0.8222, + "num_tokens": 37275233731.0, + "step": 8918 + }, + { + "epoch": 1.0598930481283422, + "grad_norm": 0.3241633822417947, + "learning_rate": 1.0566310704142253e-05, + "loss": 0.8334, + "num_tokens": 37279423123.0, + "step": 8919 + }, + { + "epoch": 1.0600118835412953, + "grad_norm": 0.3118686601766712, + "learning_rate": 1.0564580758888906e-05, + "loss": 0.8141, + "num_tokens": 37283565739.0, + "step": 8920 + }, + { + "epoch": 1.0601307189542484, + "grad_norm": 0.3739639224226822, + "learning_rate": 1.0562850829760607e-05, + "loss": 0.801, + "num_tokens": 37287756086.0, + "step": 8921 + }, + { + "epoch": 1.0602495543672015, + "grad_norm": 0.31109806017305586, + "learning_rate": 1.0561120916821417e-05, + "loss": 0.8391, + "num_tokens": 37291938482.0, + "step": 8922 + }, + { + "epoch": 1.0603683897801546, + "grad_norm": 0.360239750301251, + "learning_rate": 1.0559391020135403e-05, + "loss": 0.8083, + "num_tokens": 37296102467.0, + "step": 8923 + }, + { + "epoch": 1.0604872251931075, + "grad_norm": 0.35452777523708656, + "learning_rate": 1.055766113976663e-05, + "loss": 0.8026, + "num_tokens": 37300259928.0, + "step": 8924 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.3747289091766588, + "learning_rate": 1.0555931275779164e-05, + "loss": 0.8242, + "num_tokens": 37304440665.0, + "step": 8925 + }, + { + "epoch": 1.0607248960190137, + "grad_norm": 0.33527447795481996, + "learning_rate": 1.0554201428237064e-05, + "loss": 0.8436, + "num_tokens": 37308611086.0, + "step": 8926 + }, + { + "epoch": 1.0608437314319668, + "grad_norm": 0.3958868145242084, + "learning_rate": 1.0552471597204388e-05, + "loss": 0.8002, + "num_tokens": 37312801511.0, + "step": 8927 + }, + { + "epoch": 1.0609625668449199, + "grad_norm": 0.3017110472417456, + "learning_rate": 1.0550741782745204e-05, + "loss": 0.8416, + "num_tokens": 37316991502.0, + "step": 8928 + }, + { + "epoch": 1.061081402257873, + "grad_norm": 0.35741181708200115, + "learning_rate": 1.0549011984923574e-05, + "loss": 0.8129, + "num_tokens": 37321180703.0, + "step": 8929 + }, + { + "epoch": 1.0612002376708258, + "grad_norm": 0.3621737236869702, + "learning_rate": 1.0547282203803548e-05, + "loss": 0.827, + "num_tokens": 37325365070.0, + "step": 8930 + }, + { + "epoch": 1.061319073083779, + "grad_norm": 0.3331385727059868, + "learning_rate": 1.0545552439449199e-05, + "loss": 0.8027, + "num_tokens": 37329553497.0, + "step": 8931 + }, + { + "epoch": 1.061437908496732, + "grad_norm": 0.3328321717394697, + "learning_rate": 1.0543822691924576e-05, + "loss": 0.8094, + "num_tokens": 37333741338.0, + "step": 8932 + }, + { + "epoch": 1.0615567439096851, + "grad_norm": 0.33002132631160747, + "learning_rate": 1.0542092961293745e-05, + "loss": 0.8403, + "num_tokens": 37337894138.0, + "step": 8933 + }, + { + "epoch": 1.0616755793226382, + "grad_norm": 0.3494964359269626, + "learning_rate": 1.0540363247620757e-05, + "loss": 0.8356, + "num_tokens": 37342046428.0, + "step": 8934 + }, + { + "epoch": 1.061794414735591, + "grad_norm": 0.3257557088165491, + "learning_rate": 1.0538633550969672e-05, + "loss": 0.8287, + "num_tokens": 37346218621.0, + "step": 8935 + }, + { + "epoch": 1.0619132501485442, + "grad_norm": 0.3332539905264214, + "learning_rate": 1.0536903871404547e-05, + "loss": 0.8255, + "num_tokens": 37350406396.0, + "step": 8936 + }, + { + "epoch": 1.0620320855614973, + "grad_norm": 0.3369171695649806, + "learning_rate": 1.0535174208989436e-05, + "loss": 0.8184, + "num_tokens": 37354595034.0, + "step": 8937 + }, + { + "epoch": 1.0621509209744504, + "grad_norm": 0.36571706896466444, + "learning_rate": 1.0533444563788396e-05, + "loss": 0.8112, + "num_tokens": 37358780558.0, + "step": 8938 + }, + { + "epoch": 1.0622697563874035, + "grad_norm": 0.3841949295485757, + "learning_rate": 1.0531714935865485e-05, + "loss": 0.8256, + "num_tokens": 37362968010.0, + "step": 8939 + }, + { + "epoch": 1.0623885918003566, + "grad_norm": 0.2935883986134588, + "learning_rate": 1.0529985325284751e-05, + "loss": 0.821, + "num_tokens": 37367156535.0, + "step": 8940 + }, + { + "epoch": 1.0625074272133095, + "grad_norm": 0.3540963923346988, + "learning_rate": 1.0528255732110249e-05, + "loss": 0.8306, + "num_tokens": 37371322922.0, + "step": 8941 + }, + { + "epoch": 1.0626262626262626, + "grad_norm": 0.33205183481807843, + "learning_rate": 1.0526526156406029e-05, + "loss": 0.8264, + "num_tokens": 37375512983.0, + "step": 8942 + }, + { + "epoch": 1.0627450980392157, + "grad_norm": 0.3526004862829323, + "learning_rate": 1.0524796598236149e-05, + "loss": 0.8472, + "num_tokens": 37379692362.0, + "step": 8943 + }, + { + "epoch": 1.0628639334521688, + "grad_norm": 0.3365906981952217, + "learning_rate": 1.052306705766466e-05, + "loss": 0.8168, + "num_tokens": 37383882018.0, + "step": 8944 + }, + { + "epoch": 1.0629827688651219, + "grad_norm": 0.3417201067876701, + "learning_rate": 1.052133753475561e-05, + "loss": 0.8013, + "num_tokens": 37388071610.0, + "step": 8945 + }, + { + "epoch": 1.0631016042780748, + "grad_norm": 0.31798050008136847, + "learning_rate": 1.0519608029573048e-05, + "loss": 0.8294, + "num_tokens": 37392260259.0, + "step": 8946 + }, + { + "epoch": 1.0632204396910279, + "grad_norm": 0.3811837805113269, + "learning_rate": 1.0517878542181025e-05, + "loss": 0.8315, + "num_tokens": 37396449957.0, + "step": 8947 + }, + { + "epoch": 1.063339275103981, + "grad_norm": 0.3557110561510365, + "learning_rate": 1.0516149072643592e-05, + "loss": 0.8105, + "num_tokens": 37400634535.0, + "step": 8948 + }, + { + "epoch": 1.063458110516934, + "grad_norm": 0.34618177107493303, + "learning_rate": 1.0514419621024788e-05, + "loss": 0.8344, + "num_tokens": 37404799714.0, + "step": 8949 + }, + { + "epoch": 1.0635769459298872, + "grad_norm": 0.3720022171063581, + "learning_rate": 1.0512690187388671e-05, + "loss": 0.8255, + "num_tokens": 37408988758.0, + "step": 8950 + }, + { + "epoch": 1.0636957813428403, + "grad_norm": 0.3570818136647648, + "learning_rate": 1.0510960771799286e-05, + "loss": 0.8295, + "num_tokens": 37413152370.0, + "step": 8951 + }, + { + "epoch": 1.0638146167557931, + "grad_norm": 0.3813753691423585, + "learning_rate": 1.0509231374320679e-05, + "loss": 0.8029, + "num_tokens": 37417341553.0, + "step": 8952 + }, + { + "epoch": 1.0639334521687462, + "grad_norm": 0.323986010512384, + "learning_rate": 1.050750199501689e-05, + "loss": 0.8748, + "num_tokens": 37421529278.0, + "step": 8953 + }, + { + "epoch": 1.0640522875816993, + "grad_norm": 0.4009013320894373, + "learning_rate": 1.0505772633951967e-05, + "loss": 0.8324, + "num_tokens": 37425719244.0, + "step": 8954 + }, + { + "epoch": 1.0641711229946524, + "grad_norm": 0.34154486064548556, + "learning_rate": 1.0504043291189952e-05, + "loss": 0.8711, + "num_tokens": 37429905834.0, + "step": 8955 + }, + { + "epoch": 1.0642899584076055, + "grad_norm": 0.37760809209001206, + "learning_rate": 1.0502313966794893e-05, + "loss": 0.8465, + "num_tokens": 37434094454.0, + "step": 8956 + }, + { + "epoch": 1.0644087938205584, + "grad_norm": 0.29276429091743433, + "learning_rate": 1.0500584660830835e-05, + "loss": 0.774, + "num_tokens": 37438283547.0, + "step": 8957 + }, + { + "epoch": 1.0645276292335115, + "grad_norm": 0.4291513386246879, + "learning_rate": 1.0498855373361812e-05, + "loss": 0.7903, + "num_tokens": 37442471014.0, + "step": 8958 + }, + { + "epoch": 1.0646464646464646, + "grad_norm": 0.324521282778607, + "learning_rate": 1.0497126104451867e-05, + "loss": 0.8244, + "num_tokens": 37446657115.0, + "step": 8959 + }, + { + "epoch": 1.0647653000594177, + "grad_norm": 0.5151485417954539, + "learning_rate": 1.0495396854165045e-05, + "loss": 0.8301, + "num_tokens": 37450817148.0, + "step": 8960 + }, + { + "epoch": 1.0648841354723708, + "grad_norm": 0.34079192091126237, + "learning_rate": 1.0493667622565385e-05, + "loss": 0.8272, + "num_tokens": 37454997138.0, + "step": 8961 + }, + { + "epoch": 1.065002970885324, + "grad_norm": 0.5366632724884918, + "learning_rate": 1.049193840971692e-05, + "loss": 0.82, + "num_tokens": 37459186442.0, + "step": 8962 + }, + { + "epoch": 1.0651218062982768, + "grad_norm": 0.36117192382786323, + "learning_rate": 1.0490209215683697e-05, + "loss": 0.8237, + "num_tokens": 37463374442.0, + "step": 8963 + }, + { + "epoch": 1.06524064171123, + "grad_norm": 0.45734543466699873, + "learning_rate": 1.0488480040529749e-05, + "loss": 0.8008, + "num_tokens": 37467565152.0, + "step": 8964 + }, + { + "epoch": 1.065359477124183, + "grad_norm": 0.37872782720119813, + "learning_rate": 1.0486750884319118e-05, + "loss": 0.8103, + "num_tokens": 37471753490.0, + "step": 8965 + }, + { + "epoch": 1.065478312537136, + "grad_norm": 0.4969633384774387, + "learning_rate": 1.0485021747115834e-05, + "loss": 0.822, + "num_tokens": 37475913043.0, + "step": 8966 + }, + { + "epoch": 1.0655971479500892, + "grad_norm": 0.3938630476227264, + "learning_rate": 1.0483292628983937e-05, + "loss": 0.7922, + "num_tokens": 37480102828.0, + "step": 8967 + }, + { + "epoch": 1.065715983363042, + "grad_norm": 0.4923983789664628, + "learning_rate": 1.0481563529987458e-05, + "loss": 0.8323, + "num_tokens": 37484292044.0, + "step": 8968 + }, + { + "epoch": 1.0658348187759952, + "grad_norm": 0.4410206373274995, + "learning_rate": 1.0479834450190436e-05, + "loss": 0.8249, + "num_tokens": 37488472077.0, + "step": 8969 + }, + { + "epoch": 1.0659536541889483, + "grad_norm": 0.3951017744559477, + "learning_rate": 1.0478105389656905e-05, + "loss": 0.8392, + "num_tokens": 37492647551.0, + "step": 8970 + }, + { + "epoch": 1.0660724896019014, + "grad_norm": 0.42239000403101973, + "learning_rate": 1.0476376348450894e-05, + "loss": 0.8334, + "num_tokens": 37496815365.0, + "step": 8971 + }, + { + "epoch": 1.0661913250148545, + "grad_norm": 0.3469373812233118, + "learning_rate": 1.047464732663644e-05, + "loss": 0.8465, + "num_tokens": 37501004434.0, + "step": 8972 + }, + { + "epoch": 1.0663101604278076, + "grad_norm": 0.4196472275939929, + "learning_rate": 1.0472918324277569e-05, + "loss": 0.8224, + "num_tokens": 37505193744.0, + "step": 8973 + }, + { + "epoch": 1.0664289958407605, + "grad_norm": 0.3362518023083104, + "learning_rate": 1.0471189341438312e-05, + "loss": 0.8189, + "num_tokens": 37509382739.0, + "step": 8974 + }, + { + "epoch": 1.0665478312537136, + "grad_norm": 0.4001129696508956, + "learning_rate": 1.0469460378182707e-05, + "loss": 0.8153, + "num_tokens": 37513551750.0, + "step": 8975 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.3048814840019371, + "learning_rate": 1.0467731434574778e-05, + "loss": 0.8009, + "num_tokens": 37517715770.0, + "step": 8976 + }, + { + "epoch": 1.0667855020796198, + "grad_norm": 0.42167089913075706, + "learning_rate": 1.046600251067855e-05, + "loss": 0.7984, + "num_tokens": 37521904903.0, + "step": 8977 + }, + { + "epoch": 1.0669043374925729, + "grad_norm": 0.3103046782208884, + "learning_rate": 1.0464273606558056e-05, + "loss": 0.8707, + "num_tokens": 37526093158.0, + "step": 8978 + }, + { + "epoch": 1.0670231729055257, + "grad_norm": 0.37782487474103926, + "learning_rate": 1.0462544722277324e-05, + "loss": 0.8204, + "num_tokens": 37530282824.0, + "step": 8979 + }, + { + "epoch": 1.0671420083184788, + "grad_norm": 0.34537874749252123, + "learning_rate": 1.0460815857900374e-05, + "loss": 0.8049, + "num_tokens": 37534463488.0, + "step": 8980 + }, + { + "epoch": 1.067260843731432, + "grad_norm": 0.35978513267057327, + "learning_rate": 1.0459087013491236e-05, + "loss": 0.8432, + "num_tokens": 37538653919.0, + "step": 8981 + }, + { + "epoch": 1.067379679144385, + "grad_norm": 0.32462900114731913, + "learning_rate": 1.0457358189113935e-05, + "loss": 0.7867, + "num_tokens": 37542843283.0, + "step": 8982 + }, + { + "epoch": 1.0674985145573381, + "grad_norm": 0.37558583323905426, + "learning_rate": 1.0455629384832499e-05, + "loss": 0.8252, + "num_tokens": 37547009312.0, + "step": 8983 + }, + { + "epoch": 1.0676173499702912, + "grad_norm": 0.32947506943060134, + "learning_rate": 1.0453900600710947e-05, + "loss": 0.8454, + "num_tokens": 37551197835.0, + "step": 8984 + }, + { + "epoch": 1.067736185383244, + "grad_norm": 0.37020885783658564, + "learning_rate": 1.0452171836813299e-05, + "loss": 0.814, + "num_tokens": 37555387316.0, + "step": 8985 + }, + { + "epoch": 1.0678550207961972, + "grad_norm": 0.38714807458327927, + "learning_rate": 1.0450443093203579e-05, + "loss": 0.8514, + "num_tokens": 37559577289.0, + "step": 8986 + }, + { + "epoch": 1.0679738562091503, + "grad_norm": 0.2960594770597502, + "learning_rate": 1.0448714369945811e-05, + "loss": 0.8466, + "num_tokens": 37563759808.0, + "step": 8987 + }, + { + "epoch": 1.0680926916221034, + "grad_norm": 0.3671155588576987, + "learning_rate": 1.0446985667104016e-05, + "loss": 0.7894, + "num_tokens": 37567949144.0, + "step": 8988 + }, + { + "epoch": 1.0682115270350565, + "grad_norm": 0.3101795128282589, + "learning_rate": 1.044525698474221e-05, + "loss": 0.8045, + "num_tokens": 37572137015.0, + "step": 8989 + }, + { + "epoch": 1.0683303624480096, + "grad_norm": 0.3110365532120813, + "learning_rate": 1.0443528322924412e-05, + "loss": 0.8397, + "num_tokens": 37576311649.0, + "step": 8990 + }, + { + "epoch": 1.0684491978609625, + "grad_norm": 0.3543314089634232, + "learning_rate": 1.0441799681714643e-05, + "loss": 0.8268, + "num_tokens": 37580501227.0, + "step": 8991 + }, + { + "epoch": 1.0685680332739156, + "grad_norm": 0.3405620089137928, + "learning_rate": 1.0440071061176922e-05, + "loss": 0.837, + "num_tokens": 37584679365.0, + "step": 8992 + }, + { + "epoch": 1.0686868686868687, + "grad_norm": 0.3673829969375856, + "learning_rate": 1.0438342461375258e-05, + "loss": 0.8229, + "num_tokens": 37588868824.0, + "step": 8993 + }, + { + "epoch": 1.0688057040998218, + "grad_norm": 0.33304194720966157, + "learning_rate": 1.0436613882373672e-05, + "loss": 0.8663, + "num_tokens": 37593056939.0, + "step": 8994 + }, + { + "epoch": 1.0689245395127749, + "grad_norm": 0.30517293239572124, + "learning_rate": 1.0434885324236182e-05, + "loss": 0.8293, + "num_tokens": 37597245689.0, + "step": 8995 + }, + { + "epoch": 1.0690433749257278, + "grad_norm": 0.3884807244010332, + "learning_rate": 1.0433156787026797e-05, + "loss": 0.79, + "num_tokens": 37601434577.0, + "step": 8996 + }, + { + "epoch": 1.0691622103386809, + "grad_norm": 0.35274611038025544, + "learning_rate": 1.0431428270809533e-05, + "loss": 0.8163, + "num_tokens": 37605586203.0, + "step": 8997 + }, + { + "epoch": 1.069281045751634, + "grad_norm": 0.3202432372117104, + "learning_rate": 1.0429699775648404e-05, + "loss": 0.8256, + "num_tokens": 37609748580.0, + "step": 8998 + }, + { + "epoch": 1.069399881164587, + "grad_norm": 0.34492944330790554, + "learning_rate": 1.0427971301607417e-05, + "loss": 0.8437, + "num_tokens": 37613938347.0, + "step": 8999 + }, + { + "epoch": 1.0695187165775402, + "grad_norm": 0.3139456389237318, + "learning_rate": 1.042624284875059e-05, + "loss": 0.8071, + "num_tokens": 37618127760.0, + "step": 9000 + }, + { + "epoch": 1.0696375519904933, + "grad_norm": 0.41047256705324203, + "learning_rate": 1.0424514417141927e-05, + "loss": 0.8206, + "num_tokens": 37622318515.0, + "step": 9001 + }, + { + "epoch": 1.0697563874034461, + "grad_norm": 0.3290862348667071, + "learning_rate": 1.0422786006845447e-05, + "loss": 0.7901, + "num_tokens": 37626489746.0, + "step": 9002 + }, + { + "epoch": 1.0698752228163992, + "grad_norm": 0.31472264490542057, + "learning_rate": 1.0421057617925148e-05, + "loss": 0.8296, + "num_tokens": 37630659775.0, + "step": 9003 + }, + { + "epoch": 1.0699940582293523, + "grad_norm": 0.35770172523155175, + "learning_rate": 1.0419329250445043e-05, + "loss": 0.8302, + "num_tokens": 37634835201.0, + "step": 9004 + }, + { + "epoch": 1.0701128936423054, + "grad_norm": 0.3985342459478452, + "learning_rate": 1.041760090446914e-05, + "loss": 0.8256, + "num_tokens": 37639023729.0, + "step": 9005 + }, + { + "epoch": 1.0702317290552585, + "grad_norm": 0.3238712531412298, + "learning_rate": 1.0415872580061442e-05, + "loss": 0.8355, + "num_tokens": 37643163232.0, + "step": 9006 + }, + { + "epoch": 1.0703505644682116, + "grad_norm": 0.33259092517989275, + "learning_rate": 1.0414144277285962e-05, + "loss": 0.8135, + "num_tokens": 37647346282.0, + "step": 9007 + }, + { + "epoch": 1.0704693998811645, + "grad_norm": 0.38569846269826247, + "learning_rate": 1.0412415996206698e-05, + "loss": 0.8446, + "num_tokens": 37651534690.0, + "step": 9008 + }, + { + "epoch": 1.0705882352941176, + "grad_norm": 0.3473701646851625, + "learning_rate": 1.0410687736887657e-05, + "loss": 0.7866, + "num_tokens": 37655705904.0, + "step": 9009 + }, + { + "epoch": 1.0707070707070707, + "grad_norm": 0.34448665524288946, + "learning_rate": 1.0408959499392844e-05, + "loss": 0.8498, + "num_tokens": 37659866176.0, + "step": 9010 + }, + { + "epoch": 1.0708259061200238, + "grad_norm": 0.30059164459783455, + "learning_rate": 1.0407231283786255e-05, + "loss": 0.8459, + "num_tokens": 37664025831.0, + "step": 9011 + }, + { + "epoch": 1.070944741532977, + "grad_norm": 0.3436445878702961, + "learning_rate": 1.0405503090131894e-05, + "loss": 0.8122, + "num_tokens": 37668215993.0, + "step": 9012 + }, + { + "epoch": 1.0710635769459298, + "grad_norm": 0.3569691575919557, + "learning_rate": 1.0403774918493765e-05, + "loss": 0.8281, + "num_tokens": 37672399252.0, + "step": 9013 + }, + { + "epoch": 1.071182412358883, + "grad_norm": 0.33585227184833266, + "learning_rate": 1.040204676893587e-05, + "loss": 0.8094, + "num_tokens": 37676583112.0, + "step": 9014 + }, + { + "epoch": 1.071301247771836, + "grad_norm": 0.3395327892752614, + "learning_rate": 1.0400318641522204e-05, + "loss": 0.8193, + "num_tokens": 37680767721.0, + "step": 9015 + }, + { + "epoch": 1.071420083184789, + "grad_norm": 0.3525831352250506, + "learning_rate": 1.0398590536316766e-05, + "loss": 0.8045, + "num_tokens": 37684955302.0, + "step": 9016 + }, + { + "epoch": 1.0715389185977422, + "grad_norm": 0.3264705748209273, + "learning_rate": 1.0396862453383553e-05, + "loss": 0.8016, + "num_tokens": 37689144346.0, + "step": 9017 + }, + { + "epoch": 1.0716577540106953, + "grad_norm": 0.3288788046446992, + "learning_rate": 1.0395134392786563e-05, + "loss": 0.8313, + "num_tokens": 37693332269.0, + "step": 9018 + }, + { + "epoch": 1.0717765894236482, + "grad_norm": 0.34342788481118336, + "learning_rate": 1.0393406354589788e-05, + "loss": 0.7795, + "num_tokens": 37697522271.0, + "step": 9019 + }, + { + "epoch": 1.0718954248366013, + "grad_norm": 0.3152467912974016, + "learning_rate": 1.0391678338857232e-05, + "loss": 0.8277, + "num_tokens": 37701710223.0, + "step": 9020 + }, + { + "epoch": 1.0720142602495544, + "grad_norm": 0.3072721304477808, + "learning_rate": 1.0389950345652882e-05, + "loss": 0.8073, + "num_tokens": 37705883130.0, + "step": 9021 + }, + { + "epoch": 1.0721330956625075, + "grad_norm": 0.3610150219131305, + "learning_rate": 1.0388222375040732e-05, + "loss": 0.8109, + "num_tokens": 37710071295.0, + "step": 9022 + }, + { + "epoch": 1.0722519310754606, + "grad_norm": 0.3284687072608756, + "learning_rate": 1.0386494427084781e-05, + "loss": 0.8266, + "num_tokens": 37714260250.0, + "step": 9023 + }, + { + "epoch": 1.0723707664884135, + "grad_norm": 0.4067494183622208, + "learning_rate": 1.0384766501849011e-05, + "loss": 0.8059, + "num_tokens": 37718448236.0, + "step": 9024 + }, + { + "epoch": 1.0724896019013666, + "grad_norm": 0.3455868517633838, + "learning_rate": 1.0383038599397416e-05, + "loss": 0.8429, + "num_tokens": 37722637142.0, + "step": 9025 + }, + { + "epoch": 1.0726084373143197, + "grad_norm": 0.27285110361904374, + "learning_rate": 1.0381310719793991e-05, + "loss": 0.8266, + "num_tokens": 37726826131.0, + "step": 9026 + }, + { + "epoch": 1.0727272727272728, + "grad_norm": 0.35040824245853125, + "learning_rate": 1.037958286310272e-05, + "loss": 0.8029, + "num_tokens": 37731015758.0, + "step": 9027 + }, + { + "epoch": 1.0728461081402259, + "grad_norm": 0.36153779653212786, + "learning_rate": 1.0377855029387596e-05, + "loss": 0.8213, + "num_tokens": 37735203293.0, + "step": 9028 + }, + { + "epoch": 1.072964943553179, + "grad_norm": 0.31941342722675947, + "learning_rate": 1.0376127218712602e-05, + "loss": 0.8742, + "num_tokens": 37739344065.0, + "step": 9029 + }, + { + "epoch": 1.0730837789661318, + "grad_norm": 0.3256633825796963, + "learning_rate": 1.0374399431141726e-05, + "loss": 0.7986, + "num_tokens": 37743503461.0, + "step": 9030 + }, + { + "epoch": 1.073202614379085, + "grad_norm": 0.3396061681111021, + "learning_rate": 1.0372671666738954e-05, + "loss": 0.8166, + "num_tokens": 37747691975.0, + "step": 9031 + }, + { + "epoch": 1.073321449792038, + "grad_norm": 0.3075222620828505, + "learning_rate": 1.037094392556827e-05, + "loss": 0.8573, + "num_tokens": 37751880240.0, + "step": 9032 + }, + { + "epoch": 1.0734402852049911, + "grad_norm": 0.34851104655721, + "learning_rate": 1.0369216207693664e-05, + "loss": 0.834, + "num_tokens": 37756043900.0, + "step": 9033 + }, + { + "epoch": 1.0735591206179442, + "grad_norm": 0.3384662050837318, + "learning_rate": 1.0367488513179111e-05, + "loss": 0.8033, + "num_tokens": 37760233411.0, + "step": 9034 + }, + { + "epoch": 1.073677956030897, + "grad_norm": 0.3554575804246185, + "learning_rate": 1.0365760842088596e-05, + "loss": 0.819, + "num_tokens": 37764422240.0, + "step": 9035 + }, + { + "epoch": 1.0737967914438502, + "grad_norm": 0.3746181987940837, + "learning_rate": 1.0364033194486106e-05, + "loss": 0.8124, + "num_tokens": 37768611070.0, + "step": 9036 + }, + { + "epoch": 1.0739156268568033, + "grad_norm": 0.31434837682252154, + "learning_rate": 1.0362305570435615e-05, + "loss": 0.8041, + "num_tokens": 37772802076.0, + "step": 9037 + }, + { + "epoch": 1.0740344622697564, + "grad_norm": 0.3183431841016488, + "learning_rate": 1.0360577970001102e-05, + "loss": 0.8286, + "num_tokens": 37776962361.0, + "step": 9038 + }, + { + "epoch": 1.0741532976827095, + "grad_norm": 0.3556282247632405, + "learning_rate": 1.0358850393246552e-05, + "loss": 0.8086, + "num_tokens": 37781121631.0, + "step": 9039 + }, + { + "epoch": 1.0742721330956626, + "grad_norm": 0.30559494768124457, + "learning_rate": 1.0357122840235936e-05, + "loss": 0.7942, + "num_tokens": 37785301939.0, + "step": 9040 + }, + { + "epoch": 1.0743909685086155, + "grad_norm": 0.3192663344864926, + "learning_rate": 1.0355395311033241e-05, + "loss": 0.8421, + "num_tokens": 37789490472.0, + "step": 9041 + }, + { + "epoch": 1.0745098039215686, + "grad_norm": 0.3697314464999406, + "learning_rate": 1.0353667805702432e-05, + "loss": 0.8172, + "num_tokens": 37793634977.0, + "step": 9042 + }, + { + "epoch": 1.0746286393345217, + "grad_norm": 0.3491328923440963, + "learning_rate": 1.035194032430749e-05, + "loss": 0.797, + "num_tokens": 37797824651.0, + "step": 9043 + }, + { + "epoch": 1.0747474747474748, + "grad_norm": 0.32721746410618, + "learning_rate": 1.0350212866912388e-05, + "loss": 0.8304, + "num_tokens": 37801987514.0, + "step": 9044 + }, + { + "epoch": 1.0748663101604279, + "grad_norm": 0.2930690648646215, + "learning_rate": 1.0348485433581104e-05, + "loss": 0.8234, + "num_tokens": 37806153937.0, + "step": 9045 + }, + { + "epoch": 1.0749851455733808, + "grad_norm": 0.31485379244148104, + "learning_rate": 1.0346758024377606e-05, + "loss": 0.8423, + "num_tokens": 37810331828.0, + "step": 9046 + }, + { + "epoch": 1.0751039809863339, + "grad_norm": 0.33360300594556924, + "learning_rate": 1.0345030639365866e-05, + "loss": 0.7968, + "num_tokens": 37814520819.0, + "step": 9047 + }, + { + "epoch": 1.075222816399287, + "grad_norm": 0.3938566872875863, + "learning_rate": 1.0343303278609858e-05, + "loss": 0.8787, + "num_tokens": 37818709003.0, + "step": 9048 + }, + { + "epoch": 1.07534165181224, + "grad_norm": 0.2814980280785405, + "learning_rate": 1.0341575942173549e-05, + "loss": 0.8408, + "num_tokens": 37822897202.0, + "step": 9049 + }, + { + "epoch": 1.0754604872251932, + "grad_norm": 0.4589000187735482, + "learning_rate": 1.0339848630120908e-05, + "loss": 0.8409, + "num_tokens": 37827057017.0, + "step": 9050 + }, + { + "epoch": 1.0755793226381463, + "grad_norm": 0.35049331447369997, + "learning_rate": 1.0338121342515903e-05, + "loss": 0.8233, + "num_tokens": 37831246546.0, + "step": 9051 + }, + { + "epoch": 1.0756981580510991, + "grad_norm": 0.320711577379804, + "learning_rate": 1.0336394079422505e-05, + "loss": 0.7829, + "num_tokens": 37835433967.0, + "step": 9052 + }, + { + "epoch": 1.0758169934640522, + "grad_norm": 0.4561113333625083, + "learning_rate": 1.0334666840904677e-05, + "loss": 0.8161, + "num_tokens": 37839623949.0, + "step": 9053 + }, + { + "epoch": 1.0759358288770053, + "grad_norm": 0.33369784640705985, + "learning_rate": 1.0332939627026386e-05, + "loss": 0.7839, + "num_tokens": 37843815587.0, + "step": 9054 + }, + { + "epoch": 1.0760546642899584, + "grad_norm": 0.36550880374445455, + "learning_rate": 1.0331212437851596e-05, + "loss": 0.8213, + "num_tokens": 37847992641.0, + "step": 9055 + }, + { + "epoch": 1.0761734997029115, + "grad_norm": 0.37064973128652867, + "learning_rate": 1.032948527344427e-05, + "loss": 0.8442, + "num_tokens": 37852181648.0, + "step": 9056 + }, + { + "epoch": 1.0762923351158644, + "grad_norm": 0.3588084996453163, + "learning_rate": 1.0327758133868371e-05, + "loss": 0.7977, + "num_tokens": 37856372338.0, + "step": 9057 + }, + { + "epoch": 1.0764111705288175, + "grad_norm": 0.2913580903933559, + "learning_rate": 1.0326031019187862e-05, + "loss": 0.8139, + "num_tokens": 37860560786.0, + "step": 9058 + }, + { + "epoch": 1.0765300059417706, + "grad_norm": 0.3769450545124615, + "learning_rate": 1.0324303929466704e-05, + "loss": 0.8321, + "num_tokens": 37864734439.0, + "step": 9059 + }, + { + "epoch": 1.0766488413547237, + "grad_norm": 0.34396712503289356, + "learning_rate": 1.0322576864768854e-05, + "loss": 0.8289, + "num_tokens": 37868923906.0, + "step": 9060 + }, + { + "epoch": 1.0767676767676768, + "grad_norm": 0.30655873975996795, + "learning_rate": 1.0320849825158274e-05, + "loss": 0.8361, + "num_tokens": 37873110068.0, + "step": 9061 + }, + { + "epoch": 1.07688651218063, + "grad_norm": 0.365619582588401, + "learning_rate": 1.0319122810698925e-05, + "loss": 0.8153, + "num_tokens": 37877300524.0, + "step": 9062 + }, + { + "epoch": 1.0770053475935828, + "grad_norm": 0.3281461783968593, + "learning_rate": 1.0317395821454751e-05, + "loss": 0.8013, + "num_tokens": 37881489065.0, + "step": 9063 + }, + { + "epoch": 1.077124183006536, + "grad_norm": 0.3751963664943412, + "learning_rate": 1.0315668857489728e-05, + "loss": 0.8123, + "num_tokens": 37885677888.0, + "step": 9064 + }, + { + "epoch": 1.077243018419489, + "grad_norm": 0.3314920036317571, + "learning_rate": 1.0313941918867794e-05, + "loss": 0.8193, + "num_tokens": 37889867269.0, + "step": 9065 + }, + { + "epoch": 1.077361853832442, + "grad_norm": 0.33357082059914805, + "learning_rate": 1.0312215005652915e-05, + "loss": 0.8136, + "num_tokens": 37894038783.0, + "step": 9066 + }, + { + "epoch": 1.0774806892453952, + "grad_norm": 0.34379986165072, + "learning_rate": 1.0310488117909037e-05, + "loss": 0.7985, + "num_tokens": 37898202017.0, + "step": 9067 + }, + { + "epoch": 1.077599524658348, + "grad_norm": 0.3173946455850183, + "learning_rate": 1.0308761255700118e-05, + "loss": 0.812, + "num_tokens": 37902360907.0, + "step": 9068 + }, + { + "epoch": 1.0777183600713012, + "grad_norm": 0.3755151866564698, + "learning_rate": 1.0307034419090102e-05, + "loss": 0.8527, + "num_tokens": 37906482696.0, + "step": 9069 + }, + { + "epoch": 1.0778371954842543, + "grad_norm": 0.3119348457172799, + "learning_rate": 1.0305307608142948e-05, + "loss": 0.8184, + "num_tokens": 37910672430.0, + "step": 9070 + }, + { + "epoch": 1.0779560308972074, + "grad_norm": 0.3308049157420249, + "learning_rate": 1.03035808229226e-05, + "loss": 0.8155, + "num_tokens": 37914861310.0, + "step": 9071 + }, + { + "epoch": 1.0780748663101605, + "grad_norm": 0.3087813721178007, + "learning_rate": 1.0301854063493012e-05, + "loss": 0.8131, + "num_tokens": 37919051381.0, + "step": 9072 + }, + { + "epoch": 1.0781937017231136, + "grad_norm": 0.35031283511406497, + "learning_rate": 1.0300127329918125e-05, + "loss": 0.8294, + "num_tokens": 37923226355.0, + "step": 9073 + }, + { + "epoch": 1.0783125371360665, + "grad_norm": 0.3503725303520233, + "learning_rate": 1.0298400622261892e-05, + "loss": 0.8098, + "num_tokens": 37927364101.0, + "step": 9074 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 0.3136993093741502, + "learning_rate": 1.0296673940588255e-05, + "loss": 0.7938, + "num_tokens": 37931471770.0, + "step": 9075 + }, + { + "epoch": 1.0785502079619727, + "grad_norm": 0.3665677216491352, + "learning_rate": 1.0294947284961159e-05, + "loss": 0.8394, + "num_tokens": 37935661849.0, + "step": 9076 + }, + { + "epoch": 1.0786690433749258, + "grad_norm": 0.3552368555645882, + "learning_rate": 1.0293220655444548e-05, + "loss": 0.8329, + "num_tokens": 37939831520.0, + "step": 9077 + }, + { + "epoch": 1.0787878787878789, + "grad_norm": 0.33419704811913303, + "learning_rate": 1.0291494052102369e-05, + "loss": 0.8362, + "num_tokens": 37944007557.0, + "step": 9078 + }, + { + "epoch": 1.078906714200832, + "grad_norm": 0.3672209989728047, + "learning_rate": 1.028976747499856e-05, + "loss": 0.8004, + "num_tokens": 37948189296.0, + "step": 9079 + }, + { + "epoch": 1.0790255496137848, + "grad_norm": 0.3337919861016202, + "learning_rate": 1.0288040924197064e-05, + "loss": 0.8596, + "num_tokens": 37952379146.0, + "step": 9080 + }, + { + "epoch": 1.079144385026738, + "grad_norm": 0.321921161496348, + "learning_rate": 1.028631439976182e-05, + "loss": 0.7655, + "num_tokens": 37956568738.0, + "step": 9081 + }, + { + "epoch": 1.079263220439691, + "grad_norm": 0.37164201202315, + "learning_rate": 1.028458790175676e-05, + "loss": 0.7726, + "num_tokens": 37960758167.0, + "step": 9082 + }, + { + "epoch": 1.0793820558526441, + "grad_norm": 0.2857611437880359, + "learning_rate": 1.0282861430245835e-05, + "loss": 0.8134, + "num_tokens": 37964918913.0, + "step": 9083 + }, + { + "epoch": 1.0795008912655972, + "grad_norm": 0.38551601974655125, + "learning_rate": 1.0281134985292973e-05, + "loss": 0.8537, + "num_tokens": 37969102482.0, + "step": 9084 + }, + { + "epoch": 1.07961972667855, + "grad_norm": 0.310300086410222, + "learning_rate": 1.0279408566962117e-05, + "loss": 0.8491, + "num_tokens": 37973289713.0, + "step": 9085 + }, + { + "epoch": 1.0797385620915032, + "grad_norm": 0.3521354681863683, + "learning_rate": 1.0277682175317196e-05, + "loss": 0.8435, + "num_tokens": 37977460640.0, + "step": 9086 + }, + { + "epoch": 1.0798573975044563, + "grad_norm": 0.34138055979317544, + "learning_rate": 1.0275955810422145e-05, + "loss": 0.7842, + "num_tokens": 37981650451.0, + "step": 9087 + }, + { + "epoch": 1.0799762329174094, + "grad_norm": 0.2890336274460908, + "learning_rate": 1.0274229472340901e-05, + "loss": 0.8269, + "num_tokens": 37985821355.0, + "step": 9088 + }, + { + "epoch": 1.0800950683303625, + "grad_norm": 0.3431284571400749, + "learning_rate": 1.0272503161137388e-05, + "loss": 0.818, + "num_tokens": 37989992077.0, + "step": 9089 + }, + { + "epoch": 1.0802139037433156, + "grad_norm": 0.327612510118257, + "learning_rate": 1.027077687687555e-05, + "loss": 0.7837, + "num_tokens": 37994179306.0, + "step": 9090 + }, + { + "epoch": 1.0803327391562685, + "grad_norm": 0.3351533949097503, + "learning_rate": 1.0269050619619305e-05, + "loss": 0.8361, + "num_tokens": 37998337115.0, + "step": 9091 + }, + { + "epoch": 1.0804515745692216, + "grad_norm": 0.36789416108908424, + "learning_rate": 1.026732438943259e-05, + "loss": 0.8357, + "num_tokens": 38002526712.0, + "step": 9092 + }, + { + "epoch": 1.0805704099821747, + "grad_norm": 0.2920810289272758, + "learning_rate": 1.0265598186379329e-05, + "loss": 0.8164, + "num_tokens": 38006693195.0, + "step": 9093 + }, + { + "epoch": 1.0806892453951278, + "grad_norm": 0.3312144229289634, + "learning_rate": 1.026387201052345e-05, + "loss": 0.7825, + "num_tokens": 38010880351.0, + "step": 9094 + }, + { + "epoch": 1.0808080808080809, + "grad_norm": 0.31067355193652335, + "learning_rate": 1.0262145861928876e-05, + "loss": 0.838, + "num_tokens": 38015067815.0, + "step": 9095 + }, + { + "epoch": 1.080926916221034, + "grad_norm": 0.31888907816854883, + "learning_rate": 1.0260419740659539e-05, + "loss": 0.8193, + "num_tokens": 38019256225.0, + "step": 9096 + }, + { + "epoch": 1.0810457516339869, + "grad_norm": 0.32465231122548854, + "learning_rate": 1.0258693646779359e-05, + "loss": 0.8263, + "num_tokens": 38023426653.0, + "step": 9097 + }, + { + "epoch": 1.08116458704694, + "grad_norm": 0.32399114377130533, + "learning_rate": 1.0256967580352262e-05, + "loss": 0.8035, + "num_tokens": 38027587091.0, + "step": 9098 + }, + { + "epoch": 1.081283422459893, + "grad_norm": 0.33133094162919524, + "learning_rate": 1.0255241541442164e-05, + "loss": 0.8346, + "num_tokens": 38031744793.0, + "step": 9099 + }, + { + "epoch": 1.0814022578728462, + "grad_norm": 0.37780067594966316, + "learning_rate": 1.0253515530112992e-05, + "loss": 0.8116, + "num_tokens": 38035920561.0, + "step": 9100 + }, + { + "epoch": 1.0815210932857993, + "grad_norm": 0.31948043756213074, + "learning_rate": 1.0251789546428661e-05, + "loss": 0.8167, + "num_tokens": 38040080891.0, + "step": 9101 + }, + { + "epoch": 1.0816399286987521, + "grad_norm": 0.31980637357193914, + "learning_rate": 1.0250063590453094e-05, + "loss": 0.8492, + "num_tokens": 38044269570.0, + "step": 9102 + }, + { + "epoch": 1.0817587641117052, + "grad_norm": 0.30672108766245815, + "learning_rate": 1.0248337662250212e-05, + "loss": 0.8382, + "num_tokens": 38048459705.0, + "step": 9103 + }, + { + "epoch": 1.0818775995246583, + "grad_norm": 0.3114874663063606, + "learning_rate": 1.0246611761883925e-05, + "loss": 0.8159, + "num_tokens": 38052647180.0, + "step": 9104 + }, + { + "epoch": 1.0819964349376114, + "grad_norm": 0.30265670805984096, + "learning_rate": 1.024488588941815e-05, + "loss": 0.788, + "num_tokens": 38056817885.0, + "step": 9105 + }, + { + "epoch": 1.0821152703505645, + "grad_norm": 0.36891650254773894, + "learning_rate": 1.0243160044916805e-05, + "loss": 0.8367, + "num_tokens": 38061006529.0, + "step": 9106 + }, + { + "epoch": 1.0822341057635176, + "grad_norm": 0.3394001621102082, + "learning_rate": 1.0241434228443805e-05, + "loss": 0.8333, + "num_tokens": 38065196761.0, + "step": 9107 + }, + { + "epoch": 1.0823529411764705, + "grad_norm": 0.37057375363539136, + "learning_rate": 1.0239708440063057e-05, + "loss": 0.7978, + "num_tokens": 38069387647.0, + "step": 9108 + }, + { + "epoch": 1.0824717765894236, + "grad_norm": 0.35018811594837107, + "learning_rate": 1.0237982679838474e-05, + "loss": 0.8067, + "num_tokens": 38073560324.0, + "step": 9109 + }, + { + "epoch": 1.0825906120023767, + "grad_norm": 0.35130680589749586, + "learning_rate": 1.0236256947833973e-05, + "loss": 0.8386, + "num_tokens": 38077747199.0, + "step": 9110 + }, + { + "epoch": 1.0827094474153298, + "grad_norm": 0.3360014489095075, + "learning_rate": 1.0234531244113457e-05, + "loss": 0.8023, + "num_tokens": 38081924979.0, + "step": 9111 + }, + { + "epoch": 1.082828282828283, + "grad_norm": 0.36452263607950663, + "learning_rate": 1.023280556874084e-05, + "loss": 0.8469, + "num_tokens": 38086115080.0, + "step": 9112 + }, + { + "epoch": 1.0829471182412358, + "grad_norm": 0.3367127806756744, + "learning_rate": 1.0231079921780026e-05, + "loss": 0.8168, + "num_tokens": 38090304485.0, + "step": 9113 + }, + { + "epoch": 1.083065953654189, + "grad_norm": 0.3673662034186219, + "learning_rate": 1.0229354303294919e-05, + "loss": 0.8378, + "num_tokens": 38094490768.0, + "step": 9114 + }, + { + "epoch": 1.083184789067142, + "grad_norm": 0.3467856493021288, + "learning_rate": 1.0227628713349429e-05, + "loss": 0.8128, + "num_tokens": 38098662563.0, + "step": 9115 + }, + { + "epoch": 1.083303624480095, + "grad_norm": 0.3532372527823284, + "learning_rate": 1.0225903152007462e-05, + "loss": 0.8408, + "num_tokens": 38102852366.0, + "step": 9116 + }, + { + "epoch": 1.0834224598930482, + "grad_norm": 0.3239426167266222, + "learning_rate": 1.0224177619332918e-05, + "loss": 0.8469, + "num_tokens": 38107039758.0, + "step": 9117 + }, + { + "epoch": 1.0835412953060013, + "grad_norm": 0.38545988855355956, + "learning_rate": 1.0222452115389699e-05, + "loss": 0.8013, + "num_tokens": 38111230354.0, + "step": 9118 + }, + { + "epoch": 1.0836601307189542, + "grad_norm": 0.3230783401113218, + "learning_rate": 1.0220726640241708e-05, + "loss": 0.7927, + "num_tokens": 38115402337.0, + "step": 9119 + }, + { + "epoch": 1.0837789661319073, + "grad_norm": 0.3697105472069874, + "learning_rate": 1.0219001193952843e-05, + "loss": 0.8605, + "num_tokens": 38119555807.0, + "step": 9120 + }, + { + "epoch": 1.0838978015448604, + "grad_norm": 0.33395040340417276, + "learning_rate": 1.0217275776586999e-05, + "loss": 0.8165, + "num_tokens": 38123745269.0, + "step": 9121 + }, + { + "epoch": 1.0840166369578135, + "grad_norm": 0.3516088020583018, + "learning_rate": 1.0215550388208089e-05, + "loss": 0.8471, + "num_tokens": 38127933717.0, + "step": 9122 + }, + { + "epoch": 1.0841354723707666, + "grad_norm": 0.3423344602702224, + "learning_rate": 1.0213825028879995e-05, + "loss": 0.8262, + "num_tokens": 38132121759.0, + "step": 9123 + }, + { + "epoch": 1.0842543077837195, + "grad_norm": 0.41622988312403114, + "learning_rate": 1.021209969866662e-05, + "loss": 0.8053, + "num_tokens": 38136299027.0, + "step": 9124 + }, + { + "epoch": 1.0843731431966726, + "grad_norm": 0.2917457129718076, + "learning_rate": 1.0210374397631859e-05, + "loss": 0.8139, + "num_tokens": 38140488574.0, + "step": 9125 + }, + { + "epoch": 1.0844919786096257, + "grad_norm": 0.3623302868930765, + "learning_rate": 1.0208649125839602e-05, + "loss": 0.859, + "num_tokens": 38144670617.0, + "step": 9126 + }, + { + "epoch": 1.0846108140225788, + "grad_norm": 0.36006757437714865, + "learning_rate": 1.020692388335374e-05, + "loss": 0.8597, + "num_tokens": 38148845470.0, + "step": 9127 + }, + { + "epoch": 1.0847296494355319, + "grad_norm": 0.2916635974500607, + "learning_rate": 1.0205198670238172e-05, + "loss": 0.8443, + "num_tokens": 38153034884.0, + "step": 9128 + }, + { + "epoch": 1.084848484848485, + "grad_norm": 0.38043530214329446, + "learning_rate": 1.0203473486556783e-05, + "loss": 0.8528, + "num_tokens": 38157223967.0, + "step": 9129 + }, + { + "epoch": 1.0849673202614378, + "grad_norm": 0.3216668104696063, + "learning_rate": 1.0201748332373467e-05, + "loss": 0.81, + "num_tokens": 38161413111.0, + "step": 9130 + }, + { + "epoch": 1.085086155674391, + "grad_norm": 0.3077225654466545, + "learning_rate": 1.0200023207752105e-05, + "loss": 0.8025, + "num_tokens": 38165603273.0, + "step": 9131 + }, + { + "epoch": 1.085204991087344, + "grad_norm": 0.3535703337438313, + "learning_rate": 1.019829811275659e-05, + "loss": 0.8037, + "num_tokens": 38169791863.0, + "step": 9132 + }, + { + "epoch": 1.0853238265002971, + "grad_norm": 0.3155533178751943, + "learning_rate": 1.0196573047450805e-05, + "loss": 0.8212, + "num_tokens": 38173948111.0, + "step": 9133 + }, + { + "epoch": 1.0854426619132502, + "grad_norm": 0.31567015674511173, + "learning_rate": 1.019484801189864e-05, + "loss": 0.8442, + "num_tokens": 38178106525.0, + "step": 9134 + }, + { + "epoch": 1.085561497326203, + "grad_norm": 0.31564167575049124, + "learning_rate": 1.0193123006163974e-05, + "loss": 0.8326, + "num_tokens": 38182295269.0, + "step": 9135 + }, + { + "epoch": 1.0856803327391562, + "grad_norm": 0.33584116168584754, + "learning_rate": 1.0191398030310691e-05, + "loss": 0.8147, + "num_tokens": 38186484678.0, + "step": 9136 + }, + { + "epoch": 1.0857991681521093, + "grad_norm": 0.3038258697652057, + "learning_rate": 1.0189673084402672e-05, + "loss": 0.8369, + "num_tokens": 38190668472.0, + "step": 9137 + }, + { + "epoch": 1.0859180035650624, + "grad_norm": 0.3192490440717135, + "learning_rate": 1.0187948168503802e-05, + "loss": 0.7802, + "num_tokens": 38194857100.0, + "step": 9138 + }, + { + "epoch": 1.0860368389780155, + "grad_norm": 0.29652278277268385, + "learning_rate": 1.0186223282677955e-05, + "loss": 0.837, + "num_tokens": 38199020032.0, + "step": 9139 + }, + { + "epoch": 1.0861556743909686, + "grad_norm": 0.38458209324772796, + "learning_rate": 1.0184498426989007e-05, + "loss": 0.8454, + "num_tokens": 38203153096.0, + "step": 9140 + }, + { + "epoch": 1.0862745098039215, + "grad_norm": 0.3056170819120933, + "learning_rate": 1.0182773601500842e-05, + "loss": 0.8132, + "num_tokens": 38207341775.0, + "step": 9141 + }, + { + "epoch": 1.0863933452168746, + "grad_norm": 0.33386258521810824, + "learning_rate": 1.0181048806277336e-05, + "loss": 0.8246, + "num_tokens": 38211523203.0, + "step": 9142 + }, + { + "epoch": 1.0865121806298277, + "grad_norm": 0.37375253391945495, + "learning_rate": 1.0179324041382362e-05, + "loss": 0.7962, + "num_tokens": 38215712100.0, + "step": 9143 + }, + { + "epoch": 1.0866310160427808, + "grad_norm": 0.35298424527185823, + "learning_rate": 1.017759930687979e-05, + "loss": 0.8089, + "num_tokens": 38219901844.0, + "step": 9144 + }, + { + "epoch": 1.0867498514557339, + "grad_norm": 0.3075305959831001, + "learning_rate": 1.01758746028335e-05, + "loss": 0.8023, + "num_tokens": 38224060743.0, + "step": 9145 + }, + { + "epoch": 1.0868686868686868, + "grad_norm": 0.345239722059953, + "learning_rate": 1.0174149929307355e-05, + "loss": 0.8326, + "num_tokens": 38228250072.0, + "step": 9146 + }, + { + "epoch": 1.0869875222816399, + "grad_norm": 0.37511123073500297, + "learning_rate": 1.0172425286365234e-05, + "loss": 0.7833, + "num_tokens": 38232433468.0, + "step": 9147 + }, + { + "epoch": 1.087106357694593, + "grad_norm": 0.31151872821917515, + "learning_rate": 1.0170700674071004e-05, + "loss": 0.8086, + "num_tokens": 38236624025.0, + "step": 9148 + }, + { + "epoch": 1.087225193107546, + "grad_norm": 0.32908926317026876, + "learning_rate": 1.016897609248853e-05, + "loss": 0.8461, + "num_tokens": 38240813733.0, + "step": 9149 + }, + { + "epoch": 1.0873440285204992, + "grad_norm": 0.3651164626817634, + "learning_rate": 1.0167251541681679e-05, + "loss": 0.7988, + "num_tokens": 38244955649.0, + "step": 9150 + }, + { + "epoch": 1.0874628639334523, + "grad_norm": 0.308272567647145, + "learning_rate": 1.0165527021714323e-05, + "loss": 0.8232, + "num_tokens": 38249140550.0, + "step": 9151 + }, + { + "epoch": 1.0875816993464051, + "grad_norm": 0.3917022590331001, + "learning_rate": 1.0163802532650317e-05, + "loss": 0.8029, + "num_tokens": 38253313973.0, + "step": 9152 + }, + { + "epoch": 1.0877005347593582, + "grad_norm": 0.35015441227462496, + "learning_rate": 1.0162078074553536e-05, + "loss": 0.7942, + "num_tokens": 38257480161.0, + "step": 9153 + }, + { + "epoch": 1.0878193701723113, + "grad_norm": 0.35651156000736967, + "learning_rate": 1.0160353647487834e-05, + "loss": 0.8421, + "num_tokens": 38261668910.0, + "step": 9154 + }, + { + "epoch": 1.0879382055852644, + "grad_norm": 0.3197213279830652, + "learning_rate": 1.0158629251517078e-05, + "loss": 0.8386, + "num_tokens": 38265857275.0, + "step": 9155 + }, + { + "epoch": 1.0880570409982175, + "grad_norm": 0.3394549975761588, + "learning_rate": 1.0156904886705126e-05, + "loss": 0.8047, + "num_tokens": 38270049212.0, + "step": 9156 + }, + { + "epoch": 1.0881758764111704, + "grad_norm": 0.32531292079402957, + "learning_rate": 1.0155180553115836e-05, + "loss": 0.8174, + "num_tokens": 38274238822.0, + "step": 9157 + }, + { + "epoch": 1.0882947118241235, + "grad_norm": 0.4029530943017312, + "learning_rate": 1.0153456250813062e-05, + "loss": 0.8058, + "num_tokens": 38278411425.0, + "step": 9158 + }, + { + "epoch": 1.0884135472370766, + "grad_norm": 0.30823347038181026, + "learning_rate": 1.0151731979860669e-05, + "loss": 0.7631, + "num_tokens": 38282599899.0, + "step": 9159 + }, + { + "epoch": 1.0885323826500297, + "grad_norm": 0.3520201207837377, + "learning_rate": 1.0150007740322507e-05, + "loss": 0.8103, + "num_tokens": 38286784211.0, + "step": 9160 + }, + { + "epoch": 1.0886512180629828, + "grad_norm": 0.42441730639398784, + "learning_rate": 1.0148283532262437e-05, + "loss": 0.8563, + "num_tokens": 38290950583.0, + "step": 9161 + }, + { + "epoch": 1.088770053475936, + "grad_norm": 0.2707769099910913, + "learning_rate": 1.0146559355744304e-05, + "loss": 0.8117, + "num_tokens": 38295140046.0, + "step": 9162 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.40477009298249905, + "learning_rate": 1.0144835210831963e-05, + "loss": 0.8087, + "num_tokens": 38299328940.0, + "step": 9163 + }, + { + "epoch": 1.089007724301842, + "grad_norm": 0.3541877503987902, + "learning_rate": 1.014311109758927e-05, + "loss": 0.8069, + "num_tokens": 38303519278.0, + "step": 9164 + }, + { + "epoch": 1.089126559714795, + "grad_norm": 0.35919917397054035, + "learning_rate": 1.0141387016080062e-05, + "loss": 0.829, + "num_tokens": 38307708157.0, + "step": 9165 + }, + { + "epoch": 1.089245395127748, + "grad_norm": 0.3241496376441437, + "learning_rate": 1.0139662966368202e-05, + "loss": 0.7928, + "num_tokens": 38311896233.0, + "step": 9166 + }, + { + "epoch": 1.0893642305407012, + "grad_norm": 0.47520913027371314, + "learning_rate": 1.0137938948517529e-05, + "loss": 0.8331, + "num_tokens": 38316055837.0, + "step": 9167 + }, + { + "epoch": 1.089483065953654, + "grad_norm": 0.3434111106546273, + "learning_rate": 1.0136214962591893e-05, + "loss": 0.8238, + "num_tokens": 38320241282.0, + "step": 9168 + }, + { + "epoch": 1.0896019013666072, + "grad_norm": 0.44819192772755, + "learning_rate": 1.0134491008655138e-05, + "loss": 0.7961, + "num_tokens": 38324431877.0, + "step": 9169 + }, + { + "epoch": 1.0897207367795603, + "grad_norm": 0.40612403590525503, + "learning_rate": 1.0132767086771104e-05, + "loss": 0.8229, + "num_tokens": 38328620312.0, + "step": 9170 + }, + { + "epoch": 1.0898395721925134, + "grad_norm": 0.37161487285265304, + "learning_rate": 1.0131043197003639e-05, + "loss": 0.8078, + "num_tokens": 38332776699.0, + "step": 9171 + }, + { + "epoch": 1.0899584076054665, + "grad_norm": 0.39435478091979104, + "learning_rate": 1.0129319339416582e-05, + "loss": 0.818, + "num_tokens": 38336961841.0, + "step": 9172 + }, + { + "epoch": 1.0900772430184196, + "grad_norm": 0.37225537214786214, + "learning_rate": 1.0127595514073773e-05, + "loss": 0.8664, + "num_tokens": 38341150790.0, + "step": 9173 + }, + { + "epoch": 1.0901960784313725, + "grad_norm": 0.30263067904463625, + "learning_rate": 1.0125871721039056e-05, + "loss": 0.8061, + "num_tokens": 38345324858.0, + "step": 9174 + }, + { + "epoch": 1.0903149138443256, + "grad_norm": 0.5016408259159884, + "learning_rate": 1.0124147960376261e-05, + "loss": 0.8271, + "num_tokens": 38349491681.0, + "step": 9175 + }, + { + "epoch": 1.0904337492572787, + "grad_norm": 0.331616701780596, + "learning_rate": 1.0122424232149229e-05, + "loss": 0.8052, + "num_tokens": 38353681149.0, + "step": 9176 + }, + { + "epoch": 1.0905525846702318, + "grad_norm": 0.4270515107292905, + "learning_rate": 1.0120700536421794e-05, + "loss": 0.8177, + "num_tokens": 38357871787.0, + "step": 9177 + }, + { + "epoch": 1.0906714200831849, + "grad_norm": 0.3263193119726747, + "learning_rate": 1.0118976873257789e-05, + "loss": 0.8577, + "num_tokens": 38362060579.0, + "step": 9178 + }, + { + "epoch": 1.090790255496138, + "grad_norm": 0.38554786472009817, + "learning_rate": 1.0117253242721054e-05, + "loss": 0.8313, + "num_tokens": 38366250596.0, + "step": 9179 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 0.4222451802561926, + "learning_rate": 1.0115529644875413e-05, + "loss": 0.8418, + "num_tokens": 38370438616.0, + "step": 9180 + }, + { + "epoch": 1.091027926322044, + "grad_norm": 0.3337559140586458, + "learning_rate": 1.01138060797847e-05, + "loss": 0.8402, + "num_tokens": 38374629046.0, + "step": 9181 + }, + { + "epoch": 1.091146761734997, + "grad_norm": 0.39080567852279546, + "learning_rate": 1.011208254751275e-05, + "loss": 0.823, + "num_tokens": 38378798980.0, + "step": 9182 + }, + { + "epoch": 1.0912655971479501, + "grad_norm": 0.3679842783619926, + "learning_rate": 1.0110359048123379e-05, + "loss": 0.85, + "num_tokens": 38382986728.0, + "step": 9183 + }, + { + "epoch": 1.0913844325609032, + "grad_norm": 0.3065254042833062, + "learning_rate": 1.010863558168042e-05, + "loss": 0.8152, + "num_tokens": 38387176148.0, + "step": 9184 + }, + { + "epoch": 1.091503267973856, + "grad_norm": 0.43582392807164133, + "learning_rate": 1.0106912148247699e-05, + "loss": 0.8353, + "num_tokens": 38391341966.0, + "step": 9185 + }, + { + "epoch": 1.0916221033868092, + "grad_norm": 0.3332438278183638, + "learning_rate": 1.0105188747889043e-05, + "loss": 0.8175, + "num_tokens": 38395532514.0, + "step": 9186 + }, + { + "epoch": 1.0917409387997623, + "grad_norm": 0.35590133088855364, + "learning_rate": 1.0103465380668275e-05, + "loss": 0.8023, + "num_tokens": 38399661427.0, + "step": 9187 + }, + { + "epoch": 1.0918597742127154, + "grad_norm": 0.3627052430357718, + "learning_rate": 1.0101742046649211e-05, + "loss": 0.8158, + "num_tokens": 38403835418.0, + "step": 9188 + }, + { + "epoch": 1.0919786096256685, + "grad_norm": 0.3113516104146653, + "learning_rate": 1.010001874589568e-05, + "loss": 0.8452, + "num_tokens": 38408025251.0, + "step": 9189 + }, + { + "epoch": 1.0920974450386216, + "grad_norm": 0.3980262693462398, + "learning_rate": 1.0098295478471491e-05, + "loss": 0.8188, + "num_tokens": 38412215607.0, + "step": 9190 + }, + { + "epoch": 1.0922162804515745, + "grad_norm": 0.3149800271203304, + "learning_rate": 1.0096572244440474e-05, + "loss": 0.8125, + "num_tokens": 38416404054.0, + "step": 9191 + }, + { + "epoch": 1.0923351158645276, + "grad_norm": 0.29728624800199216, + "learning_rate": 1.0094849043866443e-05, + "loss": 0.8476, + "num_tokens": 38420591608.0, + "step": 9192 + }, + { + "epoch": 1.0924539512774807, + "grad_norm": 0.39263254527382635, + "learning_rate": 1.0093125876813213e-05, + "loss": 0.7949, + "num_tokens": 38424780822.0, + "step": 9193 + }, + { + "epoch": 1.0925727866904338, + "grad_norm": 0.3078854704588996, + "learning_rate": 1.0091402743344593e-05, + "loss": 0.8022, + "num_tokens": 38428970175.0, + "step": 9194 + }, + { + "epoch": 1.0926916221033869, + "grad_norm": 0.33907986207051355, + "learning_rate": 1.0089679643524408e-05, + "loss": 0.7981, + "num_tokens": 38433159008.0, + "step": 9195 + }, + { + "epoch": 1.09281045751634, + "grad_norm": 0.35120561060518585, + "learning_rate": 1.0087956577416458e-05, + "loss": 0.8127, + "num_tokens": 38437347151.0, + "step": 9196 + }, + { + "epoch": 1.0929292929292929, + "grad_norm": 0.32176436816405235, + "learning_rate": 1.008623354508456e-05, + "loss": 0.849, + "num_tokens": 38441516542.0, + "step": 9197 + }, + { + "epoch": 1.093048128342246, + "grad_norm": 0.3359205782714484, + "learning_rate": 1.0084510546592525e-05, + "loss": 0.8343, + "num_tokens": 38445689313.0, + "step": 9198 + }, + { + "epoch": 1.093166963755199, + "grad_norm": 0.33764691825820625, + "learning_rate": 1.0082787582004158e-05, + "loss": 0.8185, + "num_tokens": 38449880277.0, + "step": 9199 + }, + { + "epoch": 1.0932857991681522, + "grad_norm": 0.31457734361639006, + "learning_rate": 1.0081064651383272e-05, + "loss": 0.8254, + "num_tokens": 38454070658.0, + "step": 9200 + }, + { + "epoch": 1.0934046345811053, + "grad_norm": 0.4576612318748393, + "learning_rate": 1.0079341754793667e-05, + "loss": 0.8402, + "num_tokens": 38458185567.0, + "step": 9201 + }, + { + "epoch": 1.0935234699940581, + "grad_norm": 0.3382146161899952, + "learning_rate": 1.0077618892299148e-05, + "loss": 0.8265, + "num_tokens": 38462357140.0, + "step": 9202 + }, + { + "epoch": 1.0936423054070112, + "grad_norm": 0.4800715926507158, + "learning_rate": 1.0075896063963519e-05, + "loss": 0.8448, + "num_tokens": 38466545336.0, + "step": 9203 + }, + { + "epoch": 1.0937611408199643, + "grad_norm": 0.3613522827139974, + "learning_rate": 1.0074173269850584e-05, + "loss": 0.815, + "num_tokens": 38470734577.0, + "step": 9204 + }, + { + "epoch": 1.0938799762329174, + "grad_norm": 0.5445757092336865, + "learning_rate": 1.0072450510024147e-05, + "loss": 0.7996, + "num_tokens": 38474897817.0, + "step": 9205 + }, + { + "epoch": 1.0939988116458705, + "grad_norm": 0.3726511219527538, + "learning_rate": 1.0070727784548001e-05, + "loss": 0.8451, + "num_tokens": 38479054537.0, + "step": 9206 + }, + { + "epoch": 1.0941176470588236, + "grad_norm": 0.45303293470885575, + "learning_rate": 1.0069005093485949e-05, + "loss": 0.8483, + "num_tokens": 38483227790.0, + "step": 9207 + }, + { + "epoch": 1.0942364824717765, + "grad_norm": 0.35489187283150225, + "learning_rate": 1.0067282436901785e-05, + "loss": 0.8069, + "num_tokens": 38487417037.0, + "step": 9208 + }, + { + "epoch": 1.0943553178847296, + "grad_norm": 0.4435033201681633, + "learning_rate": 1.0065559814859306e-05, + "loss": 0.832, + "num_tokens": 38491606359.0, + "step": 9209 + }, + { + "epoch": 1.0944741532976827, + "grad_norm": 0.3971522541278718, + "learning_rate": 1.0063837227422303e-05, + "loss": 0.8199, + "num_tokens": 38495794502.0, + "step": 9210 + }, + { + "epoch": 1.0945929887106358, + "grad_norm": 0.4733639171945545, + "learning_rate": 1.0062114674654578e-05, + "loss": 0.8353, + "num_tokens": 38499983365.0, + "step": 9211 + }, + { + "epoch": 1.094711824123589, + "grad_norm": 0.4163845046832643, + "learning_rate": 1.0060392156619913e-05, + "loss": 0.8006, + "num_tokens": 38504165597.0, + "step": 9212 + }, + { + "epoch": 1.0948306595365418, + "grad_norm": 0.4408436720296686, + "learning_rate": 1.0058669673382107e-05, + "loss": 0.784, + "num_tokens": 38508338518.0, + "step": 9213 + }, + { + "epoch": 1.094949494949495, + "grad_norm": 0.4664215750281868, + "learning_rate": 1.0056947225004945e-05, + "loss": 0.8234, + "num_tokens": 38512527648.0, + "step": 9214 + }, + { + "epoch": 1.095068330362448, + "grad_norm": 0.3730782156741277, + "learning_rate": 1.0055224811552213e-05, + "loss": 0.8328, + "num_tokens": 38516716496.0, + "step": 9215 + }, + { + "epoch": 1.095187165775401, + "grad_norm": 0.41939669438454985, + "learning_rate": 1.00535024330877e-05, + "loss": 0.7929, + "num_tokens": 38520855004.0, + "step": 9216 + }, + { + "epoch": 1.0953060011883542, + "grad_norm": 0.3715963201467474, + "learning_rate": 1.0051780089675196e-05, + "loss": 0.8207, + "num_tokens": 38525043069.0, + "step": 9217 + }, + { + "epoch": 1.0954248366013073, + "grad_norm": 0.34015384660243836, + "learning_rate": 1.0050057781378481e-05, + "loss": 0.7942, + "num_tokens": 38529233694.0, + "step": 9218 + }, + { + "epoch": 1.0955436720142602, + "grad_norm": 0.37622207820707393, + "learning_rate": 1.0048335508261338e-05, + "loss": 0.8386, + "num_tokens": 38533424383.0, + "step": 9219 + }, + { + "epoch": 1.0956625074272133, + "grad_norm": 0.3012265427776035, + "learning_rate": 1.0046613270387549e-05, + "loss": 0.832, + "num_tokens": 38537598493.0, + "step": 9220 + }, + { + "epoch": 1.0957813428401664, + "grad_norm": 0.31481585018026126, + "learning_rate": 1.0044891067820893e-05, + "loss": 0.8403, + "num_tokens": 38541784793.0, + "step": 9221 + }, + { + "epoch": 1.0959001782531195, + "grad_norm": 0.36337786019242396, + "learning_rate": 1.0043168900625147e-05, + "loss": 0.8384, + "num_tokens": 38545955507.0, + "step": 9222 + }, + { + "epoch": 1.0960190136660726, + "grad_norm": 0.30445414395649173, + "learning_rate": 1.0041446768864098e-05, + "loss": 0.8332, + "num_tokens": 38550141279.0, + "step": 9223 + }, + { + "epoch": 1.0961378490790255, + "grad_norm": 0.33941662705644376, + "learning_rate": 1.0039724672601511e-05, + "loss": 0.8368, + "num_tokens": 38554330125.0, + "step": 9224 + }, + { + "epoch": 1.0962566844919786, + "grad_norm": 0.39428061629668326, + "learning_rate": 1.003800261190117e-05, + "loss": 0.8304, + "num_tokens": 38558490507.0, + "step": 9225 + }, + { + "epoch": 1.0963755199049317, + "grad_norm": 0.35689717830526896, + "learning_rate": 1.0036280586826843e-05, + "loss": 0.868, + "num_tokens": 38562659478.0, + "step": 9226 + }, + { + "epoch": 1.0964943553178848, + "grad_norm": 0.31497550993408785, + "learning_rate": 1.0034558597442305e-05, + "loss": 0.8376, + "num_tokens": 38566846297.0, + "step": 9227 + }, + { + "epoch": 1.0966131907308379, + "grad_norm": 0.3705315744222557, + "learning_rate": 1.0032836643811325e-05, + "loss": 0.8042, + "num_tokens": 38571035060.0, + "step": 9228 + }, + { + "epoch": 1.096732026143791, + "grad_norm": 0.2966557603402835, + "learning_rate": 1.0031114725997673e-05, + "loss": 0.7926, + "num_tokens": 38575224484.0, + "step": 9229 + }, + { + "epoch": 1.0968508615567438, + "grad_norm": 0.3322764958230417, + "learning_rate": 1.002939284406512e-05, + "loss": 0.7994, + "num_tokens": 38579413467.0, + "step": 9230 + }, + { + "epoch": 1.096969696969697, + "grad_norm": 0.3571245415226191, + "learning_rate": 1.0027670998077435e-05, + "loss": 0.8092, + "num_tokens": 38583599499.0, + "step": 9231 + }, + { + "epoch": 1.09708853238265, + "grad_norm": 0.30693798330370803, + "learning_rate": 1.0025949188098376e-05, + "loss": 0.8316, + "num_tokens": 38587740762.0, + "step": 9232 + }, + { + "epoch": 1.0972073677956031, + "grad_norm": 0.34581069927659835, + "learning_rate": 1.0024227414191713e-05, + "loss": 0.8483, + "num_tokens": 38591930027.0, + "step": 9233 + }, + { + "epoch": 1.0973262032085562, + "grad_norm": 0.35147945374555806, + "learning_rate": 1.0022505676421211e-05, + "loss": 0.7998, + "num_tokens": 38596117745.0, + "step": 9234 + }, + { + "epoch": 1.097445038621509, + "grad_norm": 0.2847624865997213, + "learning_rate": 1.0020783974850621e-05, + "loss": 0.8565, + "num_tokens": 38600301679.0, + "step": 9235 + }, + { + "epoch": 1.0975638740344622, + "grad_norm": 0.3563008736539791, + "learning_rate": 1.001906230954372e-05, + "loss": 0.7885, + "num_tokens": 38604491119.0, + "step": 9236 + }, + { + "epoch": 1.0976827094474153, + "grad_norm": 0.2874446693426678, + "learning_rate": 1.0017340680564254e-05, + "loss": 0.8393, + "num_tokens": 38608681090.0, + "step": 9237 + }, + { + "epoch": 1.0978015448603684, + "grad_norm": 0.35992073309341754, + "learning_rate": 1.0015619087975986e-05, + "loss": 0.8154, + "num_tokens": 38612870643.0, + "step": 9238 + }, + { + "epoch": 1.0979203802733215, + "grad_norm": 0.310540117108523, + "learning_rate": 1.0013897531842671e-05, + "loss": 0.8114, + "num_tokens": 38617008461.0, + "step": 9239 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 0.33557428572112885, + "learning_rate": 1.0012176012228066e-05, + "loss": 0.829, + "num_tokens": 38621198988.0, + "step": 9240 + }, + { + "epoch": 1.0981580510992275, + "grad_norm": 0.3580287606130018, + "learning_rate": 1.0010454529195919e-05, + "loss": 0.8079, + "num_tokens": 38625388439.0, + "step": 9241 + }, + { + "epoch": 1.0982768865121806, + "grad_norm": 0.3412746770509522, + "learning_rate": 1.000873308280999e-05, + "loss": 0.8248, + "num_tokens": 38629570891.0, + "step": 9242 + }, + { + "epoch": 1.0983957219251337, + "grad_norm": 0.4110119590227618, + "learning_rate": 1.0007011673134026e-05, + "loss": 0.7861, + "num_tokens": 38633760117.0, + "step": 9243 + }, + { + "epoch": 1.0985145573380868, + "grad_norm": 0.3271479114996622, + "learning_rate": 1.0005290300231775e-05, + "loss": 0.8121, + "num_tokens": 38637949297.0, + "step": 9244 + }, + { + "epoch": 1.0986333927510399, + "grad_norm": 0.45640190962641664, + "learning_rate": 1.0003568964166992e-05, + "loss": 0.8258, + "num_tokens": 38642091284.0, + "step": 9245 + }, + { + "epoch": 1.0987522281639928, + "grad_norm": 0.37757141576528486, + "learning_rate": 1.0001847665003417e-05, + "loss": 0.8214, + "num_tokens": 38646261805.0, + "step": 9246 + }, + { + "epoch": 1.0988710635769459, + "grad_norm": 0.4143056076350584, + "learning_rate": 1.0000126402804797e-05, + "loss": 0.85, + "num_tokens": 38650450232.0, + "step": 9247 + }, + { + "epoch": 1.098989898989899, + "grad_norm": 0.3657674474693703, + "learning_rate": 9.998405177634875e-06, + "loss": 0.8036, + "num_tokens": 38654638819.0, + "step": 9248 + }, + { + "epoch": 1.099108734402852, + "grad_norm": 0.38273087501889264, + "learning_rate": 9.996683989557398e-06, + "loss": 0.8166, + "num_tokens": 38658825475.0, + "step": 9249 + }, + { + "epoch": 1.0992275698158052, + "grad_norm": 0.3442366255519467, + "learning_rate": 9.994962838636109e-06, + "loss": 0.8007, + "num_tokens": 38662973085.0, + "step": 9250 + }, + { + "epoch": 1.0993464052287583, + "grad_norm": 0.38463016121069316, + "learning_rate": 9.993241724934739e-06, + "loss": 0.8046, + "num_tokens": 38667162836.0, + "step": 9251 + }, + { + "epoch": 1.0994652406417111, + "grad_norm": 0.32768891432006114, + "learning_rate": 9.991520648517034e-06, + "loss": 0.8005, + "num_tokens": 38671350400.0, + "step": 9252 + }, + { + "epoch": 1.0995840760546642, + "grad_norm": 0.4043058863290835, + "learning_rate": 9.98979960944673e-06, + "loss": 0.8162, + "num_tokens": 38675533903.0, + "step": 9253 + }, + { + "epoch": 1.0997029114676173, + "grad_norm": 0.34061465742734853, + "learning_rate": 9.988078607787556e-06, + "loss": 0.796, + "num_tokens": 38679692132.0, + "step": 9254 + }, + { + "epoch": 1.0998217468805704, + "grad_norm": 0.3732128454213554, + "learning_rate": 9.986357643603259e-06, + "loss": 0.8346, + "num_tokens": 38683873805.0, + "step": 9255 + }, + { + "epoch": 1.0999405822935235, + "grad_norm": 0.31184400436145976, + "learning_rate": 9.984636716957563e-06, + "loss": 0.7834, + "num_tokens": 38688045051.0, + "step": 9256 + }, + { + "epoch": 1.1000594177064764, + "grad_norm": 0.36496864326713496, + "learning_rate": 9.982915827914204e-06, + "loss": 0.8272, + "num_tokens": 38692232728.0, + "step": 9257 + }, + { + "epoch": 1.1001782531194295, + "grad_norm": 0.3113786052631124, + "learning_rate": 9.981194976536913e-06, + "loss": 0.8216, + "num_tokens": 38696422395.0, + "step": 9258 + }, + { + "epoch": 1.1002970885323826, + "grad_norm": 0.3570232824162704, + "learning_rate": 9.979474162889413e-06, + "loss": 0.7932, + "num_tokens": 38700610514.0, + "step": 9259 + }, + { + "epoch": 1.1004159239453357, + "grad_norm": 0.31605755275532854, + "learning_rate": 9.977753387035433e-06, + "loss": 0.8158, + "num_tokens": 38704801101.0, + "step": 9260 + }, + { + "epoch": 1.1005347593582888, + "grad_norm": 0.3369086360394239, + "learning_rate": 9.976032649038704e-06, + "loss": 0.8183, + "num_tokens": 38708990637.0, + "step": 9261 + }, + { + "epoch": 1.100653594771242, + "grad_norm": 0.2763108232788998, + "learning_rate": 9.97431194896295e-06, + "loss": 0.8124, + "num_tokens": 38713163597.0, + "step": 9262 + }, + { + "epoch": 1.1007724301841948, + "grad_norm": 0.37398715995476745, + "learning_rate": 9.97259128687189e-06, + "loss": 0.8169, + "num_tokens": 38717324495.0, + "step": 9263 + }, + { + "epoch": 1.100891265597148, + "grad_norm": 0.299039558731866, + "learning_rate": 9.970870662829249e-06, + "loss": 0.8318, + "num_tokens": 38721513844.0, + "step": 9264 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 0.32167317261286227, + "learning_rate": 9.969150076898748e-06, + "loss": 0.8179, + "num_tokens": 38725702078.0, + "step": 9265 + }, + { + "epoch": 1.101128936423054, + "grad_norm": 0.3245854593600604, + "learning_rate": 9.967429529144106e-06, + "loss": 0.835, + "num_tokens": 38729811010.0, + "step": 9266 + }, + { + "epoch": 1.1012477718360072, + "grad_norm": 0.31944690413302, + "learning_rate": 9.965709019629033e-06, + "loss": 0.8233, + "num_tokens": 38733974165.0, + "step": 9267 + }, + { + "epoch": 1.10136660724896, + "grad_norm": 0.3203052741287054, + "learning_rate": 9.963988548417259e-06, + "loss": 0.8143, + "num_tokens": 38738151962.0, + "step": 9268 + }, + { + "epoch": 1.1014854426619132, + "grad_norm": 0.29765466299245924, + "learning_rate": 9.96226811557249e-06, + "loss": 0.8075, + "num_tokens": 38742340819.0, + "step": 9269 + }, + { + "epoch": 1.1016042780748663, + "grad_norm": 0.36199365078927287, + "learning_rate": 9.96054772115844e-06, + "loss": 0.8101, + "num_tokens": 38746522470.0, + "step": 9270 + }, + { + "epoch": 1.1017231134878194, + "grad_norm": 0.3098669026490225, + "learning_rate": 9.958827365238825e-06, + "loss": 0.8243, + "num_tokens": 38750712943.0, + "step": 9271 + }, + { + "epoch": 1.1018419489007725, + "grad_norm": 0.3094459015522485, + "learning_rate": 9.957107047877352e-06, + "loss": 0.8073, + "num_tokens": 38754838795.0, + "step": 9272 + }, + { + "epoch": 1.1019607843137256, + "grad_norm": 0.34865482125309233, + "learning_rate": 9.955386769137729e-06, + "loss": 0.8203, + "num_tokens": 38759028960.0, + "step": 9273 + }, + { + "epoch": 1.1020796197266784, + "grad_norm": 0.3277759585716001, + "learning_rate": 9.953666529083668e-06, + "loss": 0.866, + "num_tokens": 38763216457.0, + "step": 9274 + }, + { + "epoch": 1.1021984551396315, + "grad_norm": 0.3119221725199175, + "learning_rate": 9.951946327778872e-06, + "loss": 0.8249, + "num_tokens": 38767369255.0, + "step": 9275 + }, + { + "epoch": 1.1023172905525846, + "grad_norm": 0.33569980035569236, + "learning_rate": 9.95022616528705e-06, + "loss": 0.8245, + "num_tokens": 38771559997.0, + "step": 9276 + }, + { + "epoch": 1.1024361259655377, + "grad_norm": 0.3239058431811422, + "learning_rate": 9.9485060416719e-06, + "loss": 0.8027, + "num_tokens": 38775708418.0, + "step": 9277 + }, + { + "epoch": 1.1025549613784908, + "grad_norm": 0.3019593963755523, + "learning_rate": 9.946785956997124e-06, + "loss": 0.7951, + "num_tokens": 38779878880.0, + "step": 9278 + }, + { + "epoch": 1.102673796791444, + "grad_norm": 0.3364817007718158, + "learning_rate": 9.94506591132643e-06, + "loss": 0.8439, + "num_tokens": 38784066014.0, + "step": 9279 + }, + { + "epoch": 1.1027926322043968, + "grad_norm": 0.3251439101381994, + "learning_rate": 9.943345904723504e-06, + "loss": 0.8507, + "num_tokens": 38788255233.0, + "step": 9280 + }, + { + "epoch": 1.10291146761735, + "grad_norm": 0.307275266180141, + "learning_rate": 9.941625937252061e-06, + "loss": 0.8069, + "num_tokens": 38792442799.0, + "step": 9281 + }, + { + "epoch": 1.103030303030303, + "grad_norm": 0.3400156649605421, + "learning_rate": 9.939906008975784e-06, + "loss": 0.8239, + "num_tokens": 38796600647.0, + "step": 9282 + }, + { + "epoch": 1.1031491384432561, + "grad_norm": 0.38671870576570067, + "learning_rate": 9.938186119958372e-06, + "loss": 0.8493, + "num_tokens": 38800789761.0, + "step": 9283 + }, + { + "epoch": 1.1032679738562092, + "grad_norm": 0.2779739446801777, + "learning_rate": 9.93646627026352e-06, + "loss": 0.8223, + "num_tokens": 38804978586.0, + "step": 9284 + }, + { + "epoch": 1.1033868092691623, + "grad_norm": 0.3535148888740646, + "learning_rate": 9.934746459954915e-06, + "loss": 0.8238, + "num_tokens": 38809168307.0, + "step": 9285 + }, + { + "epoch": 1.1035056446821152, + "grad_norm": 0.31927764384182433, + "learning_rate": 9.93302668909625e-06, + "loss": 0.8192, + "num_tokens": 38813357540.0, + "step": 9286 + }, + { + "epoch": 1.1036244800950683, + "grad_norm": 0.3056321350809453, + "learning_rate": 9.931306957751216e-06, + "loss": 0.8376, + "num_tokens": 38817528364.0, + "step": 9287 + }, + { + "epoch": 1.1037433155080214, + "grad_norm": 0.32137605481887516, + "learning_rate": 9.929587265983497e-06, + "loss": 0.8266, + "num_tokens": 38821717259.0, + "step": 9288 + }, + { + "epoch": 1.1038621509209745, + "grad_norm": 0.35558623044797644, + "learning_rate": 9.927867613856786e-06, + "loss": 0.8141, + "num_tokens": 38825905084.0, + "step": 9289 + }, + { + "epoch": 1.1039809863339276, + "grad_norm": 0.307784015100488, + "learning_rate": 9.926148001434756e-06, + "loss": 0.8407, + "num_tokens": 38830083266.0, + "step": 9290 + }, + { + "epoch": 1.1040998217468805, + "grad_norm": 0.3847722747713379, + "learning_rate": 9.924428428781101e-06, + "loss": 0.8221, + "num_tokens": 38834272409.0, + "step": 9291 + }, + { + "epoch": 1.1042186571598336, + "grad_norm": 0.3389782969897782, + "learning_rate": 9.922708895959492e-06, + "loss": 0.8045, + "num_tokens": 38838453586.0, + "step": 9292 + }, + { + "epoch": 1.1043374925727867, + "grad_norm": 0.3253536279166395, + "learning_rate": 9.920989403033618e-06, + "loss": 0.8459, + "num_tokens": 38842590835.0, + "step": 9293 + }, + { + "epoch": 1.1044563279857398, + "grad_norm": 0.38309009782864745, + "learning_rate": 9.919269950067158e-06, + "loss": 0.8379, + "num_tokens": 38846695832.0, + "step": 9294 + }, + { + "epoch": 1.1045751633986929, + "grad_norm": 0.29606000152906614, + "learning_rate": 9.917550537123781e-06, + "loss": 0.8407, + "num_tokens": 38850873038.0, + "step": 9295 + }, + { + "epoch": 1.104693998811646, + "grad_norm": 0.38951901203207034, + "learning_rate": 9.915831164267169e-06, + "loss": 0.7907, + "num_tokens": 38855063591.0, + "step": 9296 + }, + { + "epoch": 1.1048128342245989, + "grad_norm": 0.3183859756051199, + "learning_rate": 9.914111831560997e-06, + "loss": 0.8185, + "num_tokens": 38859206430.0, + "step": 9297 + }, + { + "epoch": 1.104931669637552, + "grad_norm": 0.3675235163063829, + "learning_rate": 9.912392539068934e-06, + "loss": 0.8257, + "num_tokens": 38863395612.0, + "step": 9298 + }, + { + "epoch": 1.105050505050505, + "grad_norm": 0.35071242986932233, + "learning_rate": 9.91067328685465e-06, + "loss": 0.7874, + "num_tokens": 38867584900.0, + "step": 9299 + }, + { + "epoch": 1.1051693404634582, + "grad_norm": 0.3503977674410302, + "learning_rate": 9.908954074981818e-06, + "loss": 0.7717, + "num_tokens": 38871774748.0, + "step": 9300 + }, + { + "epoch": 1.1052881758764113, + "grad_norm": 0.3604288674260963, + "learning_rate": 9.907234903514107e-06, + "loss": 0.8342, + "num_tokens": 38875958695.0, + "step": 9301 + }, + { + "epoch": 1.1054070112893641, + "grad_norm": 0.3655111115302162, + "learning_rate": 9.905515772515184e-06, + "loss": 0.8318, + "num_tokens": 38880146593.0, + "step": 9302 + }, + { + "epoch": 1.1055258467023172, + "grad_norm": 0.31583540553533707, + "learning_rate": 9.903796682048709e-06, + "loss": 0.8358, + "num_tokens": 38884336230.0, + "step": 9303 + }, + { + "epoch": 1.1056446821152703, + "grad_norm": 0.39256686995692297, + "learning_rate": 9.902077632178353e-06, + "loss": 0.8077, + "num_tokens": 38888523478.0, + "step": 9304 + }, + { + "epoch": 1.1057635175282234, + "grad_norm": 0.3121573289477786, + "learning_rate": 9.900358622967767e-06, + "loss": 0.8276, + "num_tokens": 38892713995.0, + "step": 9305 + }, + { + "epoch": 1.1058823529411765, + "grad_norm": 0.37987247246299766, + "learning_rate": 9.898639654480626e-06, + "loss": 0.842, + "num_tokens": 38896903304.0, + "step": 9306 + }, + { + "epoch": 1.1060011883541296, + "grad_norm": 0.35974083928608014, + "learning_rate": 9.896920726780584e-06, + "loss": 0.8139, + "num_tokens": 38901091403.0, + "step": 9307 + }, + { + "epoch": 1.1061200237670825, + "grad_norm": 0.34419468290493027, + "learning_rate": 9.895201839931292e-06, + "loss": 0.8485, + "num_tokens": 38905280695.0, + "step": 9308 + }, + { + "epoch": 1.1062388591800356, + "grad_norm": 0.36719461184720026, + "learning_rate": 9.893482993996413e-06, + "loss": 0.8385, + "num_tokens": 38909468996.0, + "step": 9309 + }, + { + "epoch": 1.1063576945929887, + "grad_norm": 0.2940431034520997, + "learning_rate": 9.8917641890396e-06, + "loss": 0.8443, + "num_tokens": 38913656687.0, + "step": 9310 + }, + { + "epoch": 1.1064765300059418, + "grad_norm": 0.34377142531724697, + "learning_rate": 9.890045425124503e-06, + "loss": 0.85, + "num_tokens": 38917845028.0, + "step": 9311 + }, + { + "epoch": 1.106595365418895, + "grad_norm": 0.29965747606839577, + "learning_rate": 9.88832670231478e-06, + "loss": 0.7749, + "num_tokens": 38922017885.0, + "step": 9312 + }, + { + "epoch": 1.1067142008318478, + "grad_norm": 0.323434143377644, + "learning_rate": 9.88660802067408e-06, + "loss": 0.7863, + "num_tokens": 38926147280.0, + "step": 9313 + }, + { + "epoch": 1.106833036244801, + "grad_norm": 0.30791687305420895, + "learning_rate": 9.884889380266045e-06, + "loss": 0.8303, + "num_tokens": 38930295021.0, + "step": 9314 + }, + { + "epoch": 1.106951871657754, + "grad_norm": 0.2918463529215142, + "learning_rate": 9.883170781154331e-06, + "loss": 0.8363, + "num_tokens": 38934452870.0, + "step": 9315 + }, + { + "epoch": 1.107070707070707, + "grad_norm": 0.36469945186185004, + "learning_rate": 9.881452223402577e-06, + "loss": 0.8335, + "num_tokens": 38938631682.0, + "step": 9316 + }, + { + "epoch": 1.1071895424836602, + "grad_norm": 0.3187529775371754, + "learning_rate": 9.879733707074429e-06, + "loss": 0.8325, + "num_tokens": 38942800530.0, + "step": 9317 + }, + { + "epoch": 1.1073083778966133, + "grad_norm": 0.3415659281068235, + "learning_rate": 9.878015232233527e-06, + "loss": 0.8049, + "num_tokens": 38946989984.0, + "step": 9318 + }, + { + "epoch": 1.1074272133095662, + "grad_norm": 0.32667191401866663, + "learning_rate": 9.876296798943517e-06, + "loss": 0.8503, + "num_tokens": 38951166208.0, + "step": 9319 + }, + { + "epoch": 1.1075460487225193, + "grad_norm": 0.3268372688276311, + "learning_rate": 9.874578407268036e-06, + "loss": 0.8045, + "num_tokens": 38955354574.0, + "step": 9320 + }, + { + "epoch": 1.1076648841354724, + "grad_norm": 0.3306102750075251, + "learning_rate": 9.87286005727072e-06, + "loss": 0.8156, + "num_tokens": 38959544199.0, + "step": 9321 + }, + { + "epoch": 1.1077837195484255, + "grad_norm": 0.3736206210396257, + "learning_rate": 9.87114174901521e-06, + "loss": 0.8118, + "num_tokens": 38963733730.0, + "step": 9322 + }, + { + "epoch": 1.1079025549613786, + "grad_norm": 0.2974676811104245, + "learning_rate": 9.86942348256514e-06, + "loss": 0.8546, + "num_tokens": 38967907566.0, + "step": 9323 + }, + { + "epoch": 1.1080213903743314, + "grad_norm": 0.3681672812844157, + "learning_rate": 9.86770525798413e-06, + "loss": 0.8005, + "num_tokens": 38972079202.0, + "step": 9324 + }, + { + "epoch": 1.1081402257872845, + "grad_norm": 0.31052457757644913, + "learning_rate": 9.865987075335831e-06, + "loss": 0.7906, + "num_tokens": 38976270468.0, + "step": 9325 + }, + { + "epoch": 1.1082590612002376, + "grad_norm": 0.40872662986008396, + "learning_rate": 9.864268934683865e-06, + "loss": 0.8022, + "num_tokens": 38980459430.0, + "step": 9326 + }, + { + "epoch": 1.1083778966131907, + "grad_norm": 0.28737138143167373, + "learning_rate": 9.86255083609186e-06, + "loss": 0.8304, + "num_tokens": 38984613035.0, + "step": 9327 + }, + { + "epoch": 1.1084967320261438, + "grad_norm": 0.3760899267969455, + "learning_rate": 9.860832779623444e-06, + "loss": 0.842, + "num_tokens": 38988782997.0, + "step": 9328 + }, + { + "epoch": 1.108615567439097, + "grad_norm": 0.32285500403339945, + "learning_rate": 9.859114765342239e-06, + "loss": 0.7875, + "num_tokens": 38992974083.0, + "step": 9329 + }, + { + "epoch": 1.1087344028520498, + "grad_norm": 0.3681402812857367, + "learning_rate": 9.85739679331187e-06, + "loss": 0.8303, + "num_tokens": 38997163218.0, + "step": 9330 + }, + { + "epoch": 1.108853238265003, + "grad_norm": 0.30707864672495455, + "learning_rate": 9.855678863595966e-06, + "loss": 0.8477, + "num_tokens": 39001354343.0, + "step": 9331 + }, + { + "epoch": 1.108972073677956, + "grad_norm": 0.44918606929262495, + "learning_rate": 9.853960976258138e-06, + "loss": 0.8405, + "num_tokens": 39005543860.0, + "step": 9332 + }, + { + "epoch": 1.1090909090909091, + "grad_norm": 0.3249959308774681, + "learning_rate": 9.852243131362016e-06, + "loss": 0.8232, + "num_tokens": 39009715228.0, + "step": 9333 + }, + { + "epoch": 1.1092097445038622, + "grad_norm": 0.40623441235646396, + "learning_rate": 9.850525328971208e-06, + "loss": 0.8368, + "num_tokens": 39013855993.0, + "step": 9334 + }, + { + "epoch": 1.109328579916815, + "grad_norm": 0.361302764805906, + "learning_rate": 9.848807569149332e-06, + "loss": 0.8582, + "num_tokens": 39018040539.0, + "step": 9335 + }, + { + "epoch": 1.1094474153297682, + "grad_norm": 0.3890530431598459, + "learning_rate": 9.847089851960008e-06, + "loss": 0.8134, + "num_tokens": 39022229951.0, + "step": 9336 + }, + { + "epoch": 1.1095662507427213, + "grad_norm": 0.32701519761498593, + "learning_rate": 9.845372177466839e-06, + "loss": 0.8243, + "num_tokens": 39026419162.0, + "step": 9337 + }, + { + "epoch": 1.1096850861556744, + "grad_norm": 0.3765942764497685, + "learning_rate": 9.843654545733448e-06, + "loss": 0.7947, + "num_tokens": 39030608757.0, + "step": 9338 + }, + { + "epoch": 1.1098039215686275, + "grad_norm": 0.32963100514755594, + "learning_rate": 9.841936956823435e-06, + "loss": 0.8146, + "num_tokens": 39034798357.0, + "step": 9339 + }, + { + "epoch": 1.1099227569815806, + "grad_norm": 0.3273056115553339, + "learning_rate": 9.840219410800413e-06, + "loss": 0.8321, + "num_tokens": 39038987326.0, + "step": 9340 + }, + { + "epoch": 1.1100415923945335, + "grad_norm": 0.3392229163197035, + "learning_rate": 9.83850190772799e-06, + "loss": 0.8069, + "num_tokens": 39043177340.0, + "step": 9341 + }, + { + "epoch": 1.1101604278074866, + "grad_norm": 0.33255400887490394, + "learning_rate": 9.836784447669765e-06, + "loss": 0.801, + "num_tokens": 39047348811.0, + "step": 9342 + }, + { + "epoch": 1.1102792632204397, + "grad_norm": 0.40532461396841346, + "learning_rate": 9.835067030689345e-06, + "loss": 0.8054, + "num_tokens": 39051536139.0, + "step": 9343 + }, + { + "epoch": 1.1103980986333928, + "grad_norm": 0.3276635343091244, + "learning_rate": 9.833349656850332e-06, + "loss": 0.8568, + "num_tokens": 39055724643.0, + "step": 9344 + }, + { + "epoch": 1.1105169340463459, + "grad_norm": 0.3508734679944272, + "learning_rate": 9.831632326216327e-06, + "loss": 0.8314, + "num_tokens": 39059913861.0, + "step": 9345 + }, + { + "epoch": 1.1106357694592988, + "grad_norm": 0.4087188004990305, + "learning_rate": 9.829915038850928e-06, + "loss": 0.8617, + "num_tokens": 39064101094.0, + "step": 9346 + }, + { + "epoch": 1.1107546048722519, + "grad_norm": 0.3172667202750143, + "learning_rate": 9.82819779481773e-06, + "loss": 0.8191, + "num_tokens": 39068289260.0, + "step": 9347 + }, + { + "epoch": 1.110873440285205, + "grad_norm": 0.3923720232472573, + "learning_rate": 9.82648059418033e-06, + "loss": 0.8222, + "num_tokens": 39072478673.0, + "step": 9348 + }, + { + "epoch": 1.110992275698158, + "grad_norm": 0.30812533927531455, + "learning_rate": 9.824763437002319e-06, + "loss": 0.7907, + "num_tokens": 39076658824.0, + "step": 9349 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.38244121318652463, + "learning_rate": 9.823046323347294e-06, + "loss": 0.8355, + "num_tokens": 39080847701.0, + "step": 9350 + }, + { + "epoch": 1.1112299465240643, + "grad_norm": 0.3407948924284398, + "learning_rate": 9.821329253278845e-06, + "loss": 0.8274, + "num_tokens": 39085036162.0, + "step": 9351 + }, + { + "epoch": 1.1113487819370171, + "grad_norm": 0.36366613700641304, + "learning_rate": 9.819612226860556e-06, + "loss": 0.8235, + "num_tokens": 39089212071.0, + "step": 9352 + }, + { + "epoch": 1.1114676173499702, + "grad_norm": 0.3881016044980381, + "learning_rate": 9.817895244156016e-06, + "loss": 0.8572, + "num_tokens": 39093401562.0, + "step": 9353 + }, + { + "epoch": 1.1115864527629233, + "grad_norm": 0.37352062828054383, + "learning_rate": 9.816178305228815e-06, + "loss": 0.853, + "num_tokens": 39097566620.0, + "step": 9354 + }, + { + "epoch": 1.1117052881758764, + "grad_norm": 0.3781032752249264, + "learning_rate": 9.814461410142532e-06, + "loss": 0.8423, + "num_tokens": 39101756262.0, + "step": 9355 + }, + { + "epoch": 1.1118241235888295, + "grad_norm": 0.31827450265682866, + "learning_rate": 9.812744558960752e-06, + "loss": 0.812, + "num_tokens": 39105945140.0, + "step": 9356 + }, + { + "epoch": 1.1119429590017824, + "grad_norm": 0.4107723445877098, + "learning_rate": 9.811027751747054e-06, + "loss": 0.8478, + "num_tokens": 39110133867.0, + "step": 9357 + }, + { + "epoch": 1.1120617944147355, + "grad_norm": 0.35910625432403076, + "learning_rate": 9.809310988565019e-06, + "loss": 0.8158, + "num_tokens": 39114295801.0, + "step": 9358 + }, + { + "epoch": 1.1121806298276886, + "grad_norm": 0.39393311714210993, + "learning_rate": 9.807594269478224e-06, + "loss": 0.8592, + "num_tokens": 39118456208.0, + "step": 9359 + }, + { + "epoch": 1.1122994652406417, + "grad_norm": 0.38011213147863937, + "learning_rate": 9.805877594550248e-06, + "loss": 0.8323, + "num_tokens": 39122645268.0, + "step": 9360 + }, + { + "epoch": 1.1124183006535948, + "grad_norm": 0.4135679595927608, + "learning_rate": 9.804160963844658e-06, + "loss": 0.8357, + "num_tokens": 39126833371.0, + "step": 9361 + }, + { + "epoch": 1.112537136066548, + "grad_norm": 0.3447623120815891, + "learning_rate": 9.802444377425032e-06, + "loss": 0.8512, + "num_tokens": 39131023345.0, + "step": 9362 + }, + { + "epoch": 1.1126559714795008, + "grad_norm": 0.4453567579071938, + "learning_rate": 9.80072783535494e-06, + "loss": 0.8595, + "num_tokens": 39135190414.0, + "step": 9363 + }, + { + "epoch": 1.112774806892454, + "grad_norm": 0.37322241129338246, + "learning_rate": 9.799011337697952e-06, + "loss": 0.7919, + "num_tokens": 39139378973.0, + "step": 9364 + }, + { + "epoch": 1.112893642305407, + "grad_norm": 0.47622955585065263, + "learning_rate": 9.797294884517632e-06, + "loss": 0.86, + "num_tokens": 39143538223.0, + "step": 9365 + }, + { + "epoch": 1.11301247771836, + "grad_norm": 0.3956616073226233, + "learning_rate": 9.795578475877551e-06, + "loss": 0.8008, + "num_tokens": 39147727950.0, + "step": 9366 + }, + { + "epoch": 1.1131313131313132, + "grad_norm": 0.4437810751529796, + "learning_rate": 9.79386211184127e-06, + "loss": 0.8119, + "num_tokens": 39151917956.0, + "step": 9367 + }, + { + "epoch": 1.1132501485442663, + "grad_norm": 0.3740164223468583, + "learning_rate": 9.792145792472355e-06, + "loss": 0.8046, + "num_tokens": 39156107316.0, + "step": 9368 + }, + { + "epoch": 1.1133689839572192, + "grad_norm": 0.42092395204901184, + "learning_rate": 9.790429517834363e-06, + "loss": 0.7968, + "num_tokens": 39160287155.0, + "step": 9369 + }, + { + "epoch": 1.1134878193701723, + "grad_norm": 0.37203428376105435, + "learning_rate": 9.788713287990857e-06, + "loss": 0.7659, + "num_tokens": 39164475826.0, + "step": 9370 + }, + { + "epoch": 1.1136066547831254, + "grad_norm": 0.39862158889443144, + "learning_rate": 9.786997103005393e-06, + "loss": 0.7825, + "num_tokens": 39168659943.0, + "step": 9371 + }, + { + "epoch": 1.1137254901960785, + "grad_norm": 0.37653757153853734, + "learning_rate": 9.785280962941528e-06, + "loss": 0.8299, + "num_tokens": 39172850119.0, + "step": 9372 + }, + { + "epoch": 1.1138443256090316, + "grad_norm": 0.42339418111671084, + "learning_rate": 9.783564867862816e-06, + "loss": 0.8572, + "num_tokens": 39177022690.0, + "step": 9373 + }, + { + "epoch": 1.1139631610219844, + "grad_norm": 0.362074303732406, + "learning_rate": 9.781848817832811e-06, + "loss": 0.8414, + "num_tokens": 39181205665.0, + "step": 9374 + }, + { + "epoch": 1.1140819964349375, + "grad_norm": 0.4296286401554878, + "learning_rate": 9.780132812915057e-06, + "loss": 0.8442, + "num_tokens": 39185390605.0, + "step": 9375 + }, + { + "epoch": 1.1142008318478906, + "grad_norm": 0.39154324008636454, + "learning_rate": 9.778416853173114e-06, + "loss": 0.8298, + "num_tokens": 39189578032.0, + "step": 9376 + }, + { + "epoch": 1.1143196672608437, + "grad_norm": 0.42877225284911835, + "learning_rate": 9.776700938670527e-06, + "loss": 0.8182, + "num_tokens": 39193765345.0, + "step": 9377 + }, + { + "epoch": 1.1144385026737968, + "grad_norm": 0.3844217896190223, + "learning_rate": 9.77498506947084e-06, + "loss": 0.8519, + "num_tokens": 39197937315.0, + "step": 9378 + }, + { + "epoch": 1.11455733808675, + "grad_norm": 0.37575852116727493, + "learning_rate": 9.773269245637595e-06, + "loss": 0.823, + "num_tokens": 39202127473.0, + "step": 9379 + }, + { + "epoch": 1.1146761734997028, + "grad_norm": 0.3492366287375475, + "learning_rate": 9.771553467234339e-06, + "loss": 0.7966, + "num_tokens": 39206318105.0, + "step": 9380 + }, + { + "epoch": 1.114795008912656, + "grad_norm": 0.3971450655124219, + "learning_rate": 9.769837734324608e-06, + "loss": 0.8242, + "num_tokens": 39210476951.0, + "step": 9381 + }, + { + "epoch": 1.114913844325609, + "grad_norm": 0.355074556511705, + "learning_rate": 9.768122046971949e-06, + "loss": 0.8272, + "num_tokens": 39214664454.0, + "step": 9382 + }, + { + "epoch": 1.1150326797385621, + "grad_norm": 0.3932818718669861, + "learning_rate": 9.766406405239896e-06, + "loss": 0.7691, + "num_tokens": 39218853781.0, + "step": 9383 + }, + { + "epoch": 1.1151515151515152, + "grad_norm": 0.3711347713938806, + "learning_rate": 9.764690809191983e-06, + "loss": 0.8099, + "num_tokens": 39223043313.0, + "step": 9384 + }, + { + "epoch": 1.1152703505644683, + "grad_norm": 0.4599086446456247, + "learning_rate": 9.762975258891745e-06, + "loss": 0.8228, + "num_tokens": 39227231354.0, + "step": 9385 + }, + { + "epoch": 1.1153891859774212, + "grad_norm": 0.3872360889087131, + "learning_rate": 9.761259754402719e-06, + "loss": 0.8473, + "num_tokens": 39231421069.0, + "step": 9386 + }, + { + "epoch": 1.1155080213903743, + "grad_norm": 0.4037117115131431, + "learning_rate": 9.75954429578843e-06, + "loss": 0.8501, + "num_tokens": 39235610447.0, + "step": 9387 + }, + { + "epoch": 1.1156268568033274, + "grad_norm": 0.3706755098644681, + "learning_rate": 9.75782888311241e-06, + "loss": 0.7843, + "num_tokens": 39239785487.0, + "step": 9388 + }, + { + "epoch": 1.1157456922162805, + "grad_norm": 0.3875759303087992, + "learning_rate": 9.756113516438185e-06, + "loss": 0.7943, + "num_tokens": 39243951443.0, + "step": 9389 + }, + { + "epoch": 1.1158645276292336, + "grad_norm": 0.36350093443531883, + "learning_rate": 9.754398195829286e-06, + "loss": 0.8701, + "num_tokens": 39248120818.0, + "step": 9390 + }, + { + "epoch": 1.1159833630421865, + "grad_norm": 0.37501509514952475, + "learning_rate": 9.752682921349233e-06, + "loss": 0.8522, + "num_tokens": 39252292950.0, + "step": 9391 + }, + { + "epoch": 1.1161021984551396, + "grad_norm": 0.3263831049029492, + "learning_rate": 9.750967693061546e-06, + "loss": 0.8266, + "num_tokens": 39256481316.0, + "step": 9392 + }, + { + "epoch": 1.1162210338680927, + "grad_norm": 0.34139902995566573, + "learning_rate": 9.74925251102975e-06, + "loss": 0.7998, + "num_tokens": 39260671094.0, + "step": 9393 + }, + { + "epoch": 1.1163398692810458, + "grad_norm": 0.2764966847579725, + "learning_rate": 9.74753737531736e-06, + "loss": 0.8002, + "num_tokens": 39264861533.0, + "step": 9394 + }, + { + "epoch": 1.1164587046939989, + "grad_norm": 0.3326507659788377, + "learning_rate": 9.745822285987896e-06, + "loss": 0.776, + "num_tokens": 39269050171.0, + "step": 9395 + }, + { + "epoch": 1.116577540106952, + "grad_norm": 0.3461394204177718, + "learning_rate": 9.744107243104876e-06, + "loss": 0.8086, + "num_tokens": 39273238619.0, + "step": 9396 + }, + { + "epoch": 1.1166963755199049, + "grad_norm": 0.37467105438295567, + "learning_rate": 9.74239224673181e-06, + "loss": 0.8051, + "num_tokens": 39277427675.0, + "step": 9397 + }, + { + "epoch": 1.116815210932858, + "grad_norm": 0.2916288302945085, + "learning_rate": 9.740677296932211e-06, + "loss": 0.819, + "num_tokens": 39281598632.0, + "step": 9398 + }, + { + "epoch": 1.116934046345811, + "grad_norm": 0.4103334667017666, + "learning_rate": 9.738962393769592e-06, + "loss": 0.7984, + "num_tokens": 39285766934.0, + "step": 9399 + }, + { + "epoch": 1.1170528817587642, + "grad_norm": 0.3452311012319792, + "learning_rate": 9.737247537307453e-06, + "loss": 0.8339, + "num_tokens": 39289950462.0, + "step": 9400 + }, + { + "epoch": 1.1171717171717173, + "grad_norm": 0.3502336404688059, + "learning_rate": 9.73553272760931e-06, + "loss": 0.8302, + "num_tokens": 39294083511.0, + "step": 9401 + }, + { + "epoch": 1.1172905525846701, + "grad_norm": 0.3880052368700058, + "learning_rate": 9.733817964738668e-06, + "loss": 0.8298, + "num_tokens": 39298271972.0, + "step": 9402 + }, + { + "epoch": 1.1174093879976232, + "grad_norm": 0.3477759777486034, + "learning_rate": 9.732103248759026e-06, + "loss": 0.8248, + "num_tokens": 39302433837.0, + "step": 9403 + }, + { + "epoch": 1.1175282234105763, + "grad_norm": 0.34386539942981986, + "learning_rate": 9.730388579733889e-06, + "loss": 0.8204, + "num_tokens": 39306589998.0, + "step": 9404 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.3111275097838947, + "learning_rate": 9.728673957726753e-06, + "loss": 0.8068, + "num_tokens": 39310769997.0, + "step": 9405 + }, + { + "epoch": 1.1177658942364825, + "grad_norm": 0.28698073079935577, + "learning_rate": 9.726959382801123e-06, + "loss": 0.8144, + "num_tokens": 39314928100.0, + "step": 9406 + }, + { + "epoch": 1.1178847296494356, + "grad_norm": 0.3491268235672046, + "learning_rate": 9.725244855020487e-06, + "loss": 0.8275, + "num_tokens": 39319116959.0, + "step": 9407 + }, + { + "epoch": 1.1180035650623885, + "grad_norm": 0.2934620010201988, + "learning_rate": 9.723530374448343e-06, + "loss": 0.7836, + "num_tokens": 39323305731.0, + "step": 9408 + }, + { + "epoch": 1.1181224004753416, + "grad_norm": 0.32444160427233115, + "learning_rate": 9.72181594114819e-06, + "loss": 0.8179, + "num_tokens": 39327494004.0, + "step": 9409 + }, + { + "epoch": 1.1182412358882947, + "grad_norm": 0.3336581801548868, + "learning_rate": 9.72010155518351e-06, + "loss": 0.8232, + "num_tokens": 39331639038.0, + "step": 9410 + }, + { + "epoch": 1.1183600713012478, + "grad_norm": 0.3383407822948116, + "learning_rate": 9.7183872166178e-06, + "loss": 0.8068, + "num_tokens": 39335827306.0, + "step": 9411 + }, + { + "epoch": 1.118478906714201, + "grad_norm": 0.33824052647288844, + "learning_rate": 9.716672925514542e-06, + "loss": 0.8415, + "num_tokens": 39340016657.0, + "step": 9412 + }, + { + "epoch": 1.1185977421271538, + "grad_norm": 0.3509438357520945, + "learning_rate": 9.714958681937223e-06, + "loss": 0.8005, + "num_tokens": 39344206590.0, + "step": 9413 + }, + { + "epoch": 1.118716577540107, + "grad_norm": 0.35229158640759584, + "learning_rate": 9.713244485949333e-06, + "loss": 0.8322, + "num_tokens": 39348358284.0, + "step": 9414 + }, + { + "epoch": 1.11883541295306, + "grad_norm": 0.29389460975507314, + "learning_rate": 9.71153033761435e-06, + "loss": 0.7901, + "num_tokens": 39352547965.0, + "step": 9415 + }, + { + "epoch": 1.118954248366013, + "grad_norm": 0.3462585925663254, + "learning_rate": 9.709816236995755e-06, + "loss": 0.862, + "num_tokens": 39356694074.0, + "step": 9416 + }, + { + "epoch": 1.1190730837789662, + "grad_norm": 0.3022228910055204, + "learning_rate": 9.708102184157026e-06, + "loss": 0.8339, + "num_tokens": 39360856703.0, + "step": 9417 + }, + { + "epoch": 1.1191919191919193, + "grad_norm": 0.3678838542432856, + "learning_rate": 9.70638817916164e-06, + "loss": 0.8666, + "num_tokens": 39365044597.0, + "step": 9418 + }, + { + "epoch": 1.1193107546048722, + "grad_norm": 0.3267111625413054, + "learning_rate": 9.704674222073075e-06, + "loss": 0.8158, + "num_tokens": 39369234346.0, + "step": 9419 + }, + { + "epoch": 1.1194295900178253, + "grad_norm": 0.4271511726289496, + "learning_rate": 9.702960312954802e-06, + "loss": 0.8177, + "num_tokens": 39373424379.0, + "step": 9420 + }, + { + "epoch": 1.1195484254307784, + "grad_norm": 0.3322130205004261, + "learning_rate": 9.701246451870294e-06, + "loss": 0.8376, + "num_tokens": 39377613542.0, + "step": 9421 + }, + { + "epoch": 1.1196672608437315, + "grad_norm": 0.39657736201627175, + "learning_rate": 9.699532638883025e-06, + "loss": 0.8044, + "num_tokens": 39381802960.0, + "step": 9422 + }, + { + "epoch": 1.1197860962566846, + "grad_norm": 0.33978820319441116, + "learning_rate": 9.697818874056454e-06, + "loss": 0.8187, + "num_tokens": 39385987452.0, + "step": 9423 + }, + { + "epoch": 1.1199049316696374, + "grad_norm": 0.38961668551938183, + "learning_rate": 9.696105157454057e-06, + "loss": 0.8104, + "num_tokens": 39390153849.0, + "step": 9424 + }, + { + "epoch": 1.1200237670825905, + "grad_norm": 0.3340116789055425, + "learning_rate": 9.694391489139293e-06, + "loss": 0.858, + "num_tokens": 39394319732.0, + "step": 9425 + }, + { + "epoch": 1.1201426024955436, + "grad_norm": 0.37658951506262195, + "learning_rate": 9.692677869175622e-06, + "loss": 0.7901, + "num_tokens": 39398483324.0, + "step": 9426 + }, + { + "epoch": 1.1202614379084967, + "grad_norm": 0.31104587766752767, + "learning_rate": 9.690964297626514e-06, + "loss": 0.8113, + "num_tokens": 39402672650.0, + "step": 9427 + }, + { + "epoch": 1.1203802733214498, + "grad_norm": 0.41209299229017016, + "learning_rate": 9.689250774555425e-06, + "loss": 0.8066, + "num_tokens": 39406860624.0, + "step": 9428 + }, + { + "epoch": 1.120499108734403, + "grad_norm": 0.32354566705304355, + "learning_rate": 9.687537300025812e-06, + "loss": 0.8043, + "num_tokens": 39411050689.0, + "step": 9429 + }, + { + "epoch": 1.1206179441473558, + "grad_norm": 0.4428964087541905, + "learning_rate": 9.68582387410113e-06, + "loss": 0.8309, + "num_tokens": 39415241242.0, + "step": 9430 + }, + { + "epoch": 1.120736779560309, + "grad_norm": 0.3434555047628161, + "learning_rate": 9.684110496844832e-06, + "loss": 0.8231, + "num_tokens": 39419417529.0, + "step": 9431 + }, + { + "epoch": 1.120855614973262, + "grad_norm": 0.4302064503131208, + "learning_rate": 9.68239716832037e-06, + "loss": 0.8526, + "num_tokens": 39423574012.0, + "step": 9432 + }, + { + "epoch": 1.1209744503862151, + "grad_norm": 0.4200913986766032, + "learning_rate": 9.680683888591196e-06, + "loss": 0.8099, + "num_tokens": 39427761948.0, + "step": 9433 + }, + { + "epoch": 1.1210932857991682, + "grad_norm": 0.39141735374702447, + "learning_rate": 9.67897065772076e-06, + "loss": 0.7959, + "num_tokens": 39431951063.0, + "step": 9434 + }, + { + "epoch": 1.121212121212121, + "grad_norm": 0.4782069186146784, + "learning_rate": 9.677257475772508e-06, + "loss": 0.7998, + "num_tokens": 39436109771.0, + "step": 9435 + }, + { + "epoch": 1.1213309566250742, + "grad_norm": 0.3490199943277635, + "learning_rate": 9.675544342809885e-06, + "loss": 0.8198, + "num_tokens": 39440298648.0, + "step": 9436 + }, + { + "epoch": 1.1214497920380273, + "grad_norm": 0.4778245202310405, + "learning_rate": 9.673831258896329e-06, + "loss": 0.7995, + "num_tokens": 39444450708.0, + "step": 9437 + }, + { + "epoch": 1.1215686274509804, + "grad_norm": 0.4124319929398048, + "learning_rate": 9.672118224095288e-06, + "loss": 0.7988, + "num_tokens": 39448640140.0, + "step": 9438 + }, + { + "epoch": 1.1216874628639335, + "grad_norm": 0.4279420336588376, + "learning_rate": 9.670405238470193e-06, + "loss": 0.8308, + "num_tokens": 39452828476.0, + "step": 9439 + }, + { + "epoch": 1.1218062982768866, + "grad_norm": 0.40622405406101425, + "learning_rate": 9.668692302084495e-06, + "loss": 0.8529, + "num_tokens": 39457015386.0, + "step": 9440 + }, + { + "epoch": 1.1219251336898395, + "grad_norm": 0.3863096888769269, + "learning_rate": 9.66697941500162e-06, + "loss": 0.8222, + "num_tokens": 39461203349.0, + "step": 9441 + }, + { + "epoch": 1.1220439691027926, + "grad_norm": 0.3528685247255354, + "learning_rate": 9.665266577285003e-06, + "loss": 0.828, + "num_tokens": 39465393267.0, + "step": 9442 + }, + { + "epoch": 1.1221628045157457, + "grad_norm": 0.38197188902916007, + "learning_rate": 9.66355378899808e-06, + "loss": 0.8003, + "num_tokens": 39469583306.0, + "step": 9443 + }, + { + "epoch": 1.1222816399286988, + "grad_norm": 0.3452852549662644, + "learning_rate": 9.661841050204276e-06, + "loss": 0.8438, + "num_tokens": 39473773795.0, + "step": 9444 + }, + { + "epoch": 1.1224004753416519, + "grad_norm": 0.37440215877952626, + "learning_rate": 9.660128360967023e-06, + "loss": 0.7472, + "num_tokens": 39477964106.0, + "step": 9445 + }, + { + "epoch": 1.1225193107546048, + "grad_norm": 0.39853336519388044, + "learning_rate": 9.658415721349748e-06, + "loss": 0.8202, + "num_tokens": 39482152875.0, + "step": 9446 + }, + { + "epoch": 1.1226381461675579, + "grad_norm": 0.3204384565993133, + "learning_rate": 9.656703131415874e-06, + "loss": 0.7916, + "num_tokens": 39486309525.0, + "step": 9447 + }, + { + "epoch": 1.122756981580511, + "grad_norm": 0.5002390578852716, + "learning_rate": 9.654990591228825e-06, + "loss": 0.8215, + "num_tokens": 39490498004.0, + "step": 9448 + }, + { + "epoch": 1.122875816993464, + "grad_norm": 0.38529734416339934, + "learning_rate": 9.653278100852023e-06, + "loss": 0.8378, + "num_tokens": 39494687459.0, + "step": 9449 + }, + { + "epoch": 1.1229946524064172, + "grad_norm": 0.4622840006099246, + "learning_rate": 9.651565660348887e-06, + "loss": 0.7935, + "num_tokens": 39498859758.0, + "step": 9450 + }, + { + "epoch": 1.1231134878193703, + "grad_norm": 0.411015283460898, + "learning_rate": 9.64985326978283e-06, + "loss": 0.8235, + "num_tokens": 39503039383.0, + "step": 9451 + }, + { + "epoch": 1.1232323232323231, + "grad_norm": 0.4350091556974199, + "learning_rate": 9.648140929217274e-06, + "loss": 0.8061, + "num_tokens": 39507185170.0, + "step": 9452 + }, + { + "epoch": 1.1233511586452762, + "grad_norm": 0.38530656907425437, + "learning_rate": 9.64642863871563e-06, + "loss": 0.8374, + "num_tokens": 39511373318.0, + "step": 9453 + }, + { + "epoch": 1.1234699940582293, + "grad_norm": 0.39125213229923594, + "learning_rate": 9.644716398341313e-06, + "loss": 0.8161, + "num_tokens": 39515563811.0, + "step": 9454 + }, + { + "epoch": 1.1235888294711824, + "grad_norm": 0.3826239393186516, + "learning_rate": 9.643004208157725e-06, + "loss": 0.8309, + "num_tokens": 39519717998.0, + "step": 9455 + }, + { + "epoch": 1.1237076648841355, + "grad_norm": 0.39402010480145633, + "learning_rate": 9.641292068228282e-06, + "loss": 0.8319, + "num_tokens": 39523908089.0, + "step": 9456 + }, + { + "epoch": 1.1238265002970884, + "grad_norm": 0.38324992148171755, + "learning_rate": 9.639579978616388e-06, + "loss": 0.7951, + "num_tokens": 39528098023.0, + "step": 9457 + }, + { + "epoch": 1.1239453357100415, + "grad_norm": 0.344709901653849, + "learning_rate": 9.637867939385444e-06, + "loss": 0.8325, + "num_tokens": 39532267825.0, + "step": 9458 + }, + { + "epoch": 1.1240641711229946, + "grad_norm": 0.45138045572618385, + "learning_rate": 9.636155950598857e-06, + "loss": 0.8113, + "num_tokens": 39536456075.0, + "step": 9459 + }, + { + "epoch": 1.1241830065359477, + "grad_norm": 0.3900464729859959, + "learning_rate": 9.634444012320027e-06, + "loss": 0.8301, + "num_tokens": 39540646131.0, + "step": 9460 + }, + { + "epoch": 1.1243018419489008, + "grad_norm": 0.5134069395985962, + "learning_rate": 9.632732124612354e-06, + "loss": 0.7934, + "num_tokens": 39544806969.0, + "step": 9461 + }, + { + "epoch": 1.124420677361854, + "grad_norm": 0.42789143478252595, + "learning_rate": 9.631020287539232e-06, + "loss": 0.842, + "num_tokens": 39548996128.0, + "step": 9462 + }, + { + "epoch": 1.1245395127748068, + "grad_norm": 0.5223840700136051, + "learning_rate": 9.629308501164056e-06, + "loss": 0.8336, + "num_tokens": 39553167531.0, + "step": 9463 + }, + { + "epoch": 1.12465834818776, + "grad_norm": 0.48593858190773237, + "learning_rate": 9.62759676555022e-06, + "loss": 0.8467, + "num_tokens": 39557324409.0, + "step": 9464 + }, + { + "epoch": 1.124777183600713, + "grad_norm": 0.4310217699001562, + "learning_rate": 9.625885080761116e-06, + "loss": 0.8211, + "num_tokens": 39561513857.0, + "step": 9465 + }, + { + "epoch": 1.124896019013666, + "grad_norm": 0.4557542119109567, + "learning_rate": 9.624173446860139e-06, + "loss": 0.8249, + "num_tokens": 39565656973.0, + "step": 9466 + }, + { + "epoch": 1.1250148544266192, + "grad_norm": 0.4356884193734658, + "learning_rate": 9.622461863910666e-06, + "loss": 0.8438, + "num_tokens": 39569846020.0, + "step": 9467 + }, + { + "epoch": 1.125133689839572, + "grad_norm": 0.4166694596735148, + "learning_rate": 9.62075033197609e-06, + "loss": 0.8138, + "num_tokens": 39574010806.0, + "step": 9468 + }, + { + "epoch": 1.1252525252525252, + "grad_norm": 0.42056326018339824, + "learning_rate": 9.619038851119793e-06, + "loss": 0.8278, + "num_tokens": 39578177197.0, + "step": 9469 + }, + { + "epoch": 1.1253713606654783, + "grad_norm": 0.3927753258949744, + "learning_rate": 9.61732742140515e-06, + "loss": 0.8093, + "num_tokens": 39582350161.0, + "step": 9470 + }, + { + "epoch": 1.1254901960784314, + "grad_norm": 0.3959461204804612, + "learning_rate": 9.615616042895555e-06, + "loss": 0.8128, + "num_tokens": 39586540219.0, + "step": 9471 + }, + { + "epoch": 1.1256090314913845, + "grad_norm": 0.3706158913332788, + "learning_rate": 9.613904715654373e-06, + "loss": 0.8006, + "num_tokens": 39590725595.0, + "step": 9472 + }, + { + "epoch": 1.1257278669043376, + "grad_norm": 0.4317265788522778, + "learning_rate": 9.61219343974499e-06, + "loss": 0.8281, + "num_tokens": 39594915273.0, + "step": 9473 + }, + { + "epoch": 1.1258467023172907, + "grad_norm": 0.31908783093306564, + "learning_rate": 9.610482215230778e-06, + "loss": 0.7996, + "num_tokens": 39599095974.0, + "step": 9474 + }, + { + "epoch": 1.1259655377302435, + "grad_norm": 0.4611159372433412, + "learning_rate": 9.608771042175105e-06, + "loss": 0.794, + "num_tokens": 39603278508.0, + "step": 9475 + }, + { + "epoch": 1.1260843731431966, + "grad_norm": 0.3447105123877168, + "learning_rate": 9.607059920641345e-06, + "loss": 0.7667, + "num_tokens": 39607468170.0, + "step": 9476 + }, + { + "epoch": 1.1262032085561497, + "grad_norm": 0.42435519832210017, + "learning_rate": 9.605348850692862e-06, + "loss": 0.8011, + "num_tokens": 39611639941.0, + "step": 9477 + }, + { + "epoch": 1.1263220439691028, + "grad_norm": 0.37833336499713716, + "learning_rate": 9.603637832393028e-06, + "loss": 0.8573, + "num_tokens": 39615822295.0, + "step": 9478 + }, + { + "epoch": 1.126440879382056, + "grad_norm": 0.3597418496612223, + "learning_rate": 9.60192686580521e-06, + "loss": 0.8198, + "num_tokens": 39620009026.0, + "step": 9479 + }, + { + "epoch": 1.1265597147950088, + "grad_norm": 0.3581091606437025, + "learning_rate": 9.600215950992765e-06, + "loss": 0.8088, + "num_tokens": 39624172141.0, + "step": 9480 + }, + { + "epoch": 1.126678550207962, + "grad_norm": 0.3519530073428541, + "learning_rate": 9.598505088019054e-06, + "loss": 0.8338, + "num_tokens": 39628320987.0, + "step": 9481 + }, + { + "epoch": 1.126797385620915, + "grad_norm": 0.3294521503396705, + "learning_rate": 9.59679427694744e-06, + "loss": 0.8612, + "num_tokens": 39632511336.0, + "step": 9482 + }, + { + "epoch": 1.1269162210338681, + "grad_norm": 0.35151549429183787, + "learning_rate": 9.595083517841277e-06, + "loss": 0.8364, + "num_tokens": 39636700598.0, + "step": 9483 + }, + { + "epoch": 1.1270350564468212, + "grad_norm": 0.32878879426380964, + "learning_rate": 9.593372810763924e-06, + "loss": 0.8471, + "num_tokens": 39640889635.0, + "step": 9484 + }, + { + "epoch": 1.1271538918597743, + "grad_norm": 0.2896782961255378, + "learning_rate": 9.59166215577873e-06, + "loss": 0.8245, + "num_tokens": 39645050659.0, + "step": 9485 + }, + { + "epoch": 1.1272727272727272, + "grad_norm": 0.3658773784195406, + "learning_rate": 9.589951552949048e-06, + "loss": 0.8288, + "num_tokens": 39649239690.0, + "step": 9486 + }, + { + "epoch": 1.1273915626856803, + "grad_norm": 0.3448467306384251, + "learning_rate": 9.588241002338227e-06, + "loss": 0.8491, + "num_tokens": 39653427228.0, + "step": 9487 + }, + { + "epoch": 1.1275103980986334, + "grad_norm": 0.27659475893368, + "learning_rate": 9.586530504009619e-06, + "loss": 0.8519, + "num_tokens": 39657615360.0, + "step": 9488 + }, + { + "epoch": 1.1276292335115865, + "grad_norm": 0.34240987744871076, + "learning_rate": 9.584820058026561e-06, + "loss": 0.8177, + "num_tokens": 39661804449.0, + "step": 9489 + }, + { + "epoch": 1.1277480689245396, + "grad_norm": 0.40596370328605436, + "learning_rate": 9.583109664452401e-06, + "loss": 0.8432, + "num_tokens": 39665994192.0, + "step": 9490 + }, + { + "epoch": 1.1278669043374925, + "grad_norm": 0.3067646713414572, + "learning_rate": 9.581399323350482e-06, + "loss": 0.8388, + "num_tokens": 39670184182.0, + "step": 9491 + }, + { + "epoch": 1.1279857397504456, + "grad_norm": 0.3576075194113202, + "learning_rate": 9.579689034784144e-06, + "loss": 0.8128, + "num_tokens": 39674329570.0, + "step": 9492 + }, + { + "epoch": 1.1281045751633987, + "grad_norm": 0.40009214228265483, + "learning_rate": 9.577978798816724e-06, + "loss": 0.8416, + "num_tokens": 39678520764.0, + "step": 9493 + }, + { + "epoch": 1.1282234105763518, + "grad_norm": 0.28685682139666013, + "learning_rate": 9.576268615511552e-06, + "loss": 0.8105, + "num_tokens": 39682696292.0, + "step": 9494 + }, + { + "epoch": 1.1283422459893049, + "grad_norm": 0.38131738287359374, + "learning_rate": 9.574558484931971e-06, + "loss": 0.8472, + "num_tokens": 39686885813.0, + "step": 9495 + }, + { + "epoch": 1.128461081402258, + "grad_norm": 0.3711209427296969, + "learning_rate": 9.572848407141306e-06, + "loss": 0.8293, + "num_tokens": 39691075137.0, + "step": 9496 + }, + { + "epoch": 1.1285799168152109, + "grad_norm": 0.3291522564375883, + "learning_rate": 9.571138382202892e-06, + "loss": 0.8387, + "num_tokens": 39695263843.0, + "step": 9497 + }, + { + "epoch": 1.128698752228164, + "grad_norm": 0.39255167413089526, + "learning_rate": 9.569428410180055e-06, + "loss": 0.8461, + "num_tokens": 39699448196.0, + "step": 9498 + }, + { + "epoch": 1.128817587641117, + "grad_norm": 0.33665139958328033, + "learning_rate": 9.56771849113612e-06, + "loss": 0.828, + "num_tokens": 39703637436.0, + "step": 9499 + }, + { + "epoch": 1.1289364230540702, + "grad_norm": 0.3034159481656335, + "learning_rate": 9.566008625134413e-06, + "loss": 0.8316, + "num_tokens": 39707825563.0, + "step": 9500 + }, + { + "epoch": 1.1290552584670233, + "grad_norm": 0.42255518195984215, + "learning_rate": 9.564298812238256e-06, + "loss": 0.7907, + "num_tokens": 39712014544.0, + "step": 9501 + }, + { + "epoch": 1.1291740938799761, + "grad_norm": 0.3412855808403358, + "learning_rate": 9.562589052510963e-06, + "loss": 0.8602, + "num_tokens": 39716203423.0, + "step": 9502 + }, + { + "epoch": 1.1292929292929292, + "grad_norm": 0.34459090265938774, + "learning_rate": 9.56087934601586e-06, + "loss": 0.8672, + "num_tokens": 39720392942.0, + "step": 9503 + }, + { + "epoch": 1.1294117647058823, + "grad_norm": 0.31595265578392834, + "learning_rate": 9.559169692816264e-06, + "loss": 0.823, + "num_tokens": 39724568602.0, + "step": 9504 + }, + { + "epoch": 1.1295306001188354, + "grad_norm": 0.3444165198811399, + "learning_rate": 9.557460092975481e-06, + "loss": 0.7959, + "num_tokens": 39728742226.0, + "step": 9505 + }, + { + "epoch": 1.1296494355317885, + "grad_norm": 0.3252065006657142, + "learning_rate": 9.555750546556833e-06, + "loss": 0.7888, + "num_tokens": 39732931601.0, + "step": 9506 + }, + { + "epoch": 1.1297682709447416, + "grad_norm": 0.32592452347089096, + "learning_rate": 9.554041053623622e-06, + "loss": 0.8141, + "num_tokens": 39737097464.0, + "step": 9507 + }, + { + "epoch": 1.1298871063576945, + "grad_norm": 0.30692532730660843, + "learning_rate": 9.552331614239159e-06, + "loss": 0.843, + "num_tokens": 39741270122.0, + "step": 9508 + }, + { + "epoch": 1.1300059417706476, + "grad_norm": 0.34362558656911985, + "learning_rate": 9.550622228466751e-06, + "loss": 0.829, + "num_tokens": 39745451603.0, + "step": 9509 + }, + { + "epoch": 1.1301247771836007, + "grad_norm": 0.3641052376232078, + "learning_rate": 9.548912896369705e-06, + "loss": 0.8418, + "num_tokens": 39749621399.0, + "step": 9510 + }, + { + "epoch": 1.1302436125965538, + "grad_norm": 0.2852238938653478, + "learning_rate": 9.54720361801132e-06, + "loss": 0.7978, + "num_tokens": 39753777816.0, + "step": 9511 + }, + { + "epoch": 1.130362448009507, + "grad_norm": 0.353974969529905, + "learning_rate": 9.545494393454897e-06, + "loss": 0.8051, + "num_tokens": 39757924238.0, + "step": 9512 + }, + { + "epoch": 1.1304812834224598, + "grad_norm": 0.2910327856172789, + "learning_rate": 9.543785222763733e-06, + "loss": 0.8405, + "num_tokens": 39762111257.0, + "step": 9513 + }, + { + "epoch": 1.130600118835413, + "grad_norm": 0.3251159853715275, + "learning_rate": 9.542076106001127e-06, + "loss": 0.7756, + "num_tokens": 39766299660.0, + "step": 9514 + }, + { + "epoch": 1.130718954248366, + "grad_norm": 0.3036490694490864, + "learning_rate": 9.54036704323037e-06, + "loss": 0.8448, + "num_tokens": 39770422128.0, + "step": 9515 + }, + { + "epoch": 1.130837789661319, + "grad_norm": 0.3345791625585364, + "learning_rate": 9.538658034514759e-06, + "loss": 0.8401, + "num_tokens": 39774611232.0, + "step": 9516 + }, + { + "epoch": 1.1309566250742722, + "grad_norm": 0.30627632267374705, + "learning_rate": 9.536949079917583e-06, + "loss": 0.8077, + "num_tokens": 39778780595.0, + "step": 9517 + }, + { + "epoch": 1.1310754604872253, + "grad_norm": 0.31308979194761016, + "learning_rate": 9.535240179502127e-06, + "loss": 0.8268, + "num_tokens": 39782970971.0, + "step": 9518 + }, + { + "epoch": 1.1311942959001782, + "grad_norm": 0.3349850983549235, + "learning_rate": 9.53353133333168e-06, + "loss": 0.8023, + "num_tokens": 39787136355.0, + "step": 9519 + }, + { + "epoch": 1.1313131313131313, + "grad_norm": 0.2900209773427419, + "learning_rate": 9.531822541469526e-06, + "loss": 0.8612, + "num_tokens": 39791317680.0, + "step": 9520 + }, + { + "epoch": 1.1314319667260844, + "grad_norm": 0.32147522507320314, + "learning_rate": 9.530113803978946e-06, + "loss": 0.808, + "num_tokens": 39795457620.0, + "step": 9521 + }, + { + "epoch": 1.1315508021390375, + "grad_norm": 0.38104786123458345, + "learning_rate": 9.528405120923224e-06, + "loss": 0.7905, + "num_tokens": 39799646044.0, + "step": 9522 + }, + { + "epoch": 1.1316696375519906, + "grad_norm": 0.346320709717194, + "learning_rate": 9.526696492365635e-06, + "loss": 0.7908, + "num_tokens": 39803809577.0, + "step": 9523 + }, + { + "epoch": 1.1317884729649434, + "grad_norm": 0.3201900065278196, + "learning_rate": 9.524987918369458e-06, + "loss": 0.8453, + "num_tokens": 39807972624.0, + "step": 9524 + }, + { + "epoch": 1.1319073083778965, + "grad_norm": 0.35764648067513294, + "learning_rate": 9.523279398997961e-06, + "loss": 0.8267, + "num_tokens": 39812162582.0, + "step": 9525 + }, + { + "epoch": 1.1320261437908496, + "grad_norm": 0.3512320939903049, + "learning_rate": 9.521570934314424e-06, + "loss": 0.8142, + "num_tokens": 39816352233.0, + "step": 9526 + }, + { + "epoch": 1.1321449792038027, + "grad_norm": 0.2887262213986323, + "learning_rate": 9.519862524382115e-06, + "loss": 0.8521, + "num_tokens": 39820543548.0, + "step": 9527 + }, + { + "epoch": 1.1322638146167558, + "grad_norm": 0.3533699152652718, + "learning_rate": 9.518154169264297e-06, + "loss": 0.8561, + "num_tokens": 39824716518.0, + "step": 9528 + }, + { + "epoch": 1.132382650029709, + "grad_norm": 0.3442430432586423, + "learning_rate": 9.516445869024244e-06, + "loss": 0.8203, + "num_tokens": 39828872811.0, + "step": 9529 + }, + { + "epoch": 1.1325014854426618, + "grad_norm": 0.3035329623246699, + "learning_rate": 9.514737623725216e-06, + "loss": 0.8112, + "num_tokens": 39833062760.0, + "step": 9530 + }, + { + "epoch": 1.132620320855615, + "grad_norm": 0.3686940191059174, + "learning_rate": 9.513029433430474e-06, + "loss": 0.843, + "num_tokens": 39837253852.0, + "step": 9531 + }, + { + "epoch": 1.132739156268568, + "grad_norm": 0.28445837208991814, + "learning_rate": 9.511321298203284e-06, + "loss": 0.7983, + "num_tokens": 39841385853.0, + "step": 9532 + }, + { + "epoch": 1.1328579916815211, + "grad_norm": 0.32280150201818775, + "learning_rate": 9.509613218106896e-06, + "loss": 0.8025, + "num_tokens": 39845551819.0, + "step": 9533 + }, + { + "epoch": 1.1329768270944742, + "grad_norm": 0.30461038747693003, + "learning_rate": 9.507905193204568e-06, + "loss": 0.8203, + "num_tokens": 39849739794.0, + "step": 9534 + }, + { + "epoch": 1.133095662507427, + "grad_norm": 0.28605996850941745, + "learning_rate": 9.506197223559556e-06, + "loss": 0.7996, + "num_tokens": 39853923216.0, + "step": 9535 + }, + { + "epoch": 1.1332144979203802, + "grad_norm": 0.30792320094796627, + "learning_rate": 9.504489309235114e-06, + "loss": 0.7951, + "num_tokens": 39858113183.0, + "step": 9536 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.3388007320997698, + "learning_rate": 9.50278145029449e-06, + "loss": 0.8188, + "num_tokens": 39862303183.0, + "step": 9537 + }, + { + "epoch": 1.1334521687462864, + "grad_norm": 0.31441595081747437, + "learning_rate": 9.501073646800929e-06, + "loss": 0.8021, + "num_tokens": 39866491938.0, + "step": 9538 + }, + { + "epoch": 1.1335710041592395, + "grad_norm": 0.3277129047885582, + "learning_rate": 9.49936589881768e-06, + "loss": 0.8403, + "num_tokens": 39870679634.0, + "step": 9539 + }, + { + "epoch": 1.1336898395721926, + "grad_norm": 0.3440471236357926, + "learning_rate": 9.497658206407983e-06, + "loss": 0.7879, + "num_tokens": 39874847271.0, + "step": 9540 + }, + { + "epoch": 1.1338086749851455, + "grad_norm": 0.3519061446059776, + "learning_rate": 9.495950569635083e-06, + "loss": 0.8408, + "num_tokens": 39879037304.0, + "step": 9541 + }, + { + "epoch": 1.1339275103980986, + "grad_norm": 0.30766044027444583, + "learning_rate": 9.494242988562223e-06, + "loss": 0.8136, + "num_tokens": 39883225668.0, + "step": 9542 + }, + { + "epoch": 1.1340463458110517, + "grad_norm": 0.33640866401547403, + "learning_rate": 9.492535463252632e-06, + "loss": 0.7925, + "num_tokens": 39887404631.0, + "step": 9543 + }, + { + "epoch": 1.1341651812240048, + "grad_norm": 0.3282409621825515, + "learning_rate": 9.490827993769552e-06, + "loss": 0.8128, + "num_tokens": 39891589654.0, + "step": 9544 + }, + { + "epoch": 1.1342840166369579, + "grad_norm": 0.3528109669223978, + "learning_rate": 9.489120580176215e-06, + "loss": 0.8252, + "num_tokens": 39895762171.0, + "step": 9545 + }, + { + "epoch": 1.1344028520499108, + "grad_norm": 0.35543562575486165, + "learning_rate": 9.487413222535849e-06, + "loss": 0.8228, + "num_tokens": 39899952183.0, + "step": 9546 + }, + { + "epoch": 1.1345216874628639, + "grad_norm": 0.38141941737681184, + "learning_rate": 9.485705920911681e-06, + "loss": 0.8475, + "num_tokens": 39904129491.0, + "step": 9547 + }, + { + "epoch": 1.134640522875817, + "grad_norm": 0.32476675244109016, + "learning_rate": 9.483998675366947e-06, + "loss": 0.8288, + "num_tokens": 39908261089.0, + "step": 9548 + }, + { + "epoch": 1.13475935828877, + "grad_norm": 0.32824927417444477, + "learning_rate": 9.48229148596487e-06, + "loss": 0.8013, + "num_tokens": 39912419983.0, + "step": 9549 + }, + { + "epoch": 1.1348781937017232, + "grad_norm": 0.3600740199739685, + "learning_rate": 9.48058435276867e-06, + "loss": 0.8327, + "num_tokens": 39916609258.0, + "step": 9550 + }, + { + "epoch": 1.1349970291146763, + "grad_norm": 0.29658472885427106, + "learning_rate": 9.478877275841567e-06, + "loss": 0.8102, + "num_tokens": 39920780059.0, + "step": 9551 + }, + { + "epoch": 1.1351158645276291, + "grad_norm": 0.354847975271213, + "learning_rate": 9.477170255246781e-06, + "loss": 0.8337, + "num_tokens": 39924959707.0, + "step": 9552 + }, + { + "epoch": 1.1352346999405822, + "grad_norm": 0.34431980264681716, + "learning_rate": 9.475463291047529e-06, + "loss": 0.8009, + "num_tokens": 39929139340.0, + "step": 9553 + }, + { + "epoch": 1.1353535353535353, + "grad_norm": 0.3063052976394073, + "learning_rate": 9.473756383307027e-06, + "loss": 0.8143, + "num_tokens": 39933328844.0, + "step": 9554 + }, + { + "epoch": 1.1354723707664884, + "grad_norm": 0.31419751202535484, + "learning_rate": 9.472049532088487e-06, + "loss": 0.7992, + "num_tokens": 39937512732.0, + "step": 9555 + }, + { + "epoch": 1.1355912061794415, + "grad_norm": 0.3127742258245993, + "learning_rate": 9.47034273745512e-06, + "loss": 0.8401, + "num_tokens": 39941702029.0, + "step": 9556 + }, + { + "epoch": 1.1357100415923944, + "grad_norm": 0.2965857991316688, + "learning_rate": 9.46863599947013e-06, + "loss": 0.8221, + "num_tokens": 39945878846.0, + "step": 9557 + }, + { + "epoch": 1.1358288770053475, + "grad_norm": 0.3405251051995562, + "learning_rate": 9.46692931819673e-06, + "loss": 0.8106, + "num_tokens": 39950067838.0, + "step": 9558 + }, + { + "epoch": 1.1359477124183006, + "grad_norm": 0.4134679770290998, + "learning_rate": 9.465222693698117e-06, + "loss": 0.8415, + "num_tokens": 39954225391.0, + "step": 9559 + }, + { + "epoch": 1.1360665478312537, + "grad_norm": 0.33009062249052334, + "learning_rate": 9.463516126037502e-06, + "loss": 0.7792, + "num_tokens": 39958413666.0, + "step": 9560 + }, + { + "epoch": 1.1361853832442068, + "grad_norm": 0.3719218760349707, + "learning_rate": 9.461809615278075e-06, + "loss": 0.7867, + "num_tokens": 39962602961.0, + "step": 9561 + }, + { + "epoch": 1.13630421865716, + "grad_norm": 0.281501872669342, + "learning_rate": 9.460103161483043e-06, + "loss": 0.8309, + "num_tokens": 39966765966.0, + "step": 9562 + }, + { + "epoch": 1.136423054070113, + "grad_norm": 0.40356193963043935, + "learning_rate": 9.458396764715597e-06, + "loss": 0.8199, + "num_tokens": 39970954996.0, + "step": 9563 + }, + { + "epoch": 1.136541889483066, + "grad_norm": 0.2987424959939145, + "learning_rate": 9.456690425038929e-06, + "loss": 0.817, + "num_tokens": 39975136009.0, + "step": 9564 + }, + { + "epoch": 1.136660724896019, + "grad_norm": 0.355583742090648, + "learning_rate": 9.45498414251623e-06, + "loss": 0.7932, + "num_tokens": 39979296876.0, + "step": 9565 + }, + { + "epoch": 1.136779560308972, + "grad_norm": 0.3149245146553371, + "learning_rate": 9.453277917210694e-06, + "loss": 0.8049, + "num_tokens": 39983485566.0, + "step": 9566 + }, + { + "epoch": 1.1368983957219252, + "grad_norm": 0.33124635842228617, + "learning_rate": 9.451571749185507e-06, + "loss": 0.8355, + "num_tokens": 39987672733.0, + "step": 9567 + }, + { + "epoch": 1.137017231134878, + "grad_norm": 0.32452951058665847, + "learning_rate": 9.449865638503854e-06, + "loss": 0.8437, + "num_tokens": 39991862579.0, + "step": 9568 + }, + { + "epoch": 1.1371360665478312, + "grad_norm": 0.31123710699065205, + "learning_rate": 9.44815958522892e-06, + "loss": 0.794, + "num_tokens": 39996051619.0, + "step": 9569 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 0.35119168124737865, + "learning_rate": 9.446453589423878e-06, + "loss": 0.8251, + "num_tokens": 40000238574.0, + "step": 9570 + }, + { + "epoch": 1.1373737373737374, + "grad_norm": 0.3203477886416217, + "learning_rate": 9.444747651151918e-06, + "loss": 0.7994, + "num_tokens": 40004393628.0, + "step": 9571 + }, + { + "epoch": 1.1374925727866905, + "grad_norm": 0.3269996573447957, + "learning_rate": 9.443041770476205e-06, + "loss": 0.8342, + "num_tokens": 40008582151.0, + "step": 9572 + }, + { + "epoch": 1.1376114081996436, + "grad_norm": 0.30488697101856377, + "learning_rate": 9.441335947459924e-06, + "loss": 0.8372, + "num_tokens": 40012769713.0, + "step": 9573 + }, + { + "epoch": 1.1377302436125967, + "grad_norm": 0.33278220378977824, + "learning_rate": 9.43963018216624e-06, + "loss": 0.8, + "num_tokens": 40016959774.0, + "step": 9574 + }, + { + "epoch": 1.1378490790255495, + "grad_norm": 0.3151451689238281, + "learning_rate": 9.437924474658326e-06, + "loss": 0.8769, + "num_tokens": 40021146635.0, + "step": 9575 + }, + { + "epoch": 1.1379679144385026, + "grad_norm": 0.3433911813849565, + "learning_rate": 9.436218824999354e-06, + "loss": 0.821, + "num_tokens": 40025317331.0, + "step": 9576 + }, + { + "epoch": 1.1380867498514557, + "grad_norm": 0.3218966487453889, + "learning_rate": 9.434513233252485e-06, + "loss": 0.7825, + "num_tokens": 40029476409.0, + "step": 9577 + }, + { + "epoch": 1.1382055852644088, + "grad_norm": 0.37044673391766164, + "learning_rate": 9.43280769948088e-06, + "loss": 0.8142, + "num_tokens": 40033664182.0, + "step": 9578 + }, + { + "epoch": 1.138324420677362, + "grad_norm": 0.2938467292373317, + "learning_rate": 9.431102223747706e-06, + "loss": 0.8236, + "num_tokens": 40037852730.0, + "step": 9579 + }, + { + "epoch": 1.1384432560903148, + "grad_norm": 0.34559230975370175, + "learning_rate": 9.429396806116122e-06, + "loss": 0.8153, + "num_tokens": 40042040234.0, + "step": 9580 + }, + { + "epoch": 1.138562091503268, + "grad_norm": 0.28789108564769045, + "learning_rate": 9.427691446649287e-06, + "loss": 0.8186, + "num_tokens": 40046207065.0, + "step": 9581 + }, + { + "epoch": 1.138680926916221, + "grad_norm": 0.3508586502831294, + "learning_rate": 9.42598614541035e-06, + "loss": 0.7924, + "num_tokens": 40050396418.0, + "step": 9582 + }, + { + "epoch": 1.1387997623291741, + "grad_norm": 0.347010901711274, + "learning_rate": 9.424280902462472e-06, + "loss": 0.8262, + "num_tokens": 40054586456.0, + "step": 9583 + }, + { + "epoch": 1.1389185977421272, + "grad_norm": 0.3537675741282131, + "learning_rate": 9.422575717868798e-06, + "loss": 0.8212, + "num_tokens": 40058757839.0, + "step": 9584 + }, + { + "epoch": 1.1390374331550803, + "grad_norm": 0.35191947740853396, + "learning_rate": 9.420870591692474e-06, + "loss": 0.8467, + "num_tokens": 40062945103.0, + "step": 9585 + }, + { + "epoch": 1.1391562685680332, + "grad_norm": 0.3718785474781868, + "learning_rate": 9.419165523996659e-06, + "loss": 0.8146, + "num_tokens": 40067127559.0, + "step": 9586 + }, + { + "epoch": 1.1392751039809863, + "grad_norm": 0.3427203968145105, + "learning_rate": 9.417460514844486e-06, + "loss": 0.8073, + "num_tokens": 40071295987.0, + "step": 9587 + }, + { + "epoch": 1.1393939393939394, + "grad_norm": 0.3369078166644527, + "learning_rate": 9.4157555642991e-06, + "loss": 0.8159, + "num_tokens": 40075485763.0, + "step": 9588 + }, + { + "epoch": 1.1395127748068925, + "grad_norm": 0.3691367550650936, + "learning_rate": 9.414050672423644e-06, + "loss": 0.8316, + "num_tokens": 40079674855.0, + "step": 9589 + }, + { + "epoch": 1.1396316102198456, + "grad_norm": 0.3070275429749288, + "learning_rate": 9.412345839281253e-06, + "loss": 0.8269, + "num_tokens": 40083849507.0, + "step": 9590 + }, + { + "epoch": 1.1397504456327985, + "grad_norm": 0.33421604235401065, + "learning_rate": 9.41064106493506e-06, + "loss": 0.8118, + "num_tokens": 40088018101.0, + "step": 9591 + }, + { + "epoch": 1.1398692810457516, + "grad_norm": 0.3348689456673201, + "learning_rate": 9.408936349448205e-06, + "loss": 0.8326, + "num_tokens": 40092199909.0, + "step": 9592 + }, + { + "epoch": 1.1399881164587047, + "grad_norm": 0.30348760146050763, + "learning_rate": 9.407231692883818e-06, + "loss": 0.812, + "num_tokens": 40096388876.0, + "step": 9593 + }, + { + "epoch": 1.1401069518716578, + "grad_norm": 0.33995967232953717, + "learning_rate": 9.405527095305027e-06, + "loss": 0.8033, + "num_tokens": 40100551496.0, + "step": 9594 + }, + { + "epoch": 1.1402257872846109, + "grad_norm": 0.31876176038100285, + "learning_rate": 9.403822556774958e-06, + "loss": 0.8259, + "num_tokens": 40104740200.0, + "step": 9595 + }, + { + "epoch": 1.140344622697564, + "grad_norm": 0.29619512422320626, + "learning_rate": 9.402118077356734e-06, + "loss": 0.8073, + "num_tokens": 40108901440.0, + "step": 9596 + }, + { + "epoch": 1.1404634581105169, + "grad_norm": 0.3345129193595888, + "learning_rate": 9.40041365711348e-06, + "loss": 0.8679, + "num_tokens": 40113091607.0, + "step": 9597 + }, + { + "epoch": 1.14058229352347, + "grad_norm": 0.3229016683960852, + "learning_rate": 9.398709296108319e-06, + "loss": 0.8008, + "num_tokens": 40117278689.0, + "step": 9598 + }, + { + "epoch": 1.140701128936423, + "grad_norm": 0.30405186613670965, + "learning_rate": 9.397004994404367e-06, + "loss": 0.8038, + "num_tokens": 40121451918.0, + "step": 9599 + }, + { + "epoch": 1.1408199643493762, + "grad_norm": 0.3237210209079534, + "learning_rate": 9.395300752064738e-06, + "loss": 0.8292, + "num_tokens": 40125621917.0, + "step": 9600 + }, + { + "epoch": 1.1409387997623293, + "grad_norm": 0.35460617531942024, + "learning_rate": 9.39359656915255e-06, + "loss": 0.8395, + "num_tokens": 40129810548.0, + "step": 9601 + }, + { + "epoch": 1.1410576351752821, + "grad_norm": 0.33499675055586475, + "learning_rate": 9.391892445730913e-06, + "loss": 0.7903, + "num_tokens": 40134000006.0, + "step": 9602 + }, + { + "epoch": 1.1411764705882352, + "grad_norm": 0.31166428165400956, + "learning_rate": 9.390188381862935e-06, + "loss": 0.8239, + "num_tokens": 40138174721.0, + "step": 9603 + }, + { + "epoch": 1.1412953060011883, + "grad_norm": 0.337317570460865, + "learning_rate": 9.388484377611719e-06, + "loss": 0.8009, + "num_tokens": 40142358278.0, + "step": 9604 + }, + { + "epoch": 1.1414141414141414, + "grad_norm": 0.354753064749244, + "learning_rate": 9.386780433040382e-06, + "loss": 0.8055, + "num_tokens": 40146546507.0, + "step": 9605 + }, + { + "epoch": 1.1415329768270945, + "grad_norm": 0.29582832883321536, + "learning_rate": 9.385076548212016e-06, + "loss": 0.7927, + "num_tokens": 40150691467.0, + "step": 9606 + }, + { + "epoch": 1.1416518122400476, + "grad_norm": 0.32581350282291166, + "learning_rate": 9.383372723189728e-06, + "loss": 0.8296, + "num_tokens": 40154859705.0, + "step": 9607 + }, + { + "epoch": 1.1417706476530005, + "grad_norm": 0.3077988926168239, + "learning_rate": 9.381668958036614e-06, + "loss": 0.8128, + "num_tokens": 40159049796.0, + "step": 9608 + }, + { + "epoch": 1.1418894830659536, + "grad_norm": 0.33361430299881734, + "learning_rate": 9.379965252815768e-06, + "loss": 0.8166, + "num_tokens": 40163239308.0, + "step": 9609 + }, + { + "epoch": 1.1420083184789067, + "grad_norm": 0.3112279166046637, + "learning_rate": 9.378261607590287e-06, + "loss": 0.7987, + "num_tokens": 40167429149.0, + "step": 9610 + }, + { + "epoch": 1.1421271538918598, + "grad_norm": 0.3809777096704196, + "learning_rate": 9.37655802242326e-06, + "loss": 0.7812, + "num_tokens": 40171614843.0, + "step": 9611 + }, + { + "epoch": 1.142245989304813, + "grad_norm": 0.2874720146480385, + "learning_rate": 9.374854497377784e-06, + "loss": 0.8222, + "num_tokens": 40175804961.0, + "step": 9612 + }, + { + "epoch": 1.1423648247177658, + "grad_norm": 0.44242993258750646, + "learning_rate": 9.373151032516935e-06, + "loss": 0.8104, + "num_tokens": 40179994234.0, + "step": 9613 + }, + { + "epoch": 1.142483660130719, + "grad_norm": 0.32736670501971, + "learning_rate": 9.37144762790381e-06, + "loss": 0.7741, + "num_tokens": 40184143312.0, + "step": 9614 + }, + { + "epoch": 1.142602495543672, + "grad_norm": 0.47853399528321855, + "learning_rate": 9.36974428360148e-06, + "loss": 0.8388, + "num_tokens": 40188304341.0, + "step": 9615 + }, + { + "epoch": 1.142721330956625, + "grad_norm": 0.4143921857918844, + "learning_rate": 9.368040999673033e-06, + "loss": 0.828, + "num_tokens": 40192493807.0, + "step": 9616 + }, + { + "epoch": 1.1428401663695782, + "grad_norm": 0.41576239563156214, + "learning_rate": 9.366337776181545e-06, + "loss": 0.8424, + "num_tokens": 40196621099.0, + "step": 9617 + }, + { + "epoch": 1.1429590017825313, + "grad_norm": 0.3862920876377372, + "learning_rate": 9.364634613190093e-06, + "loss": 0.8288, + "num_tokens": 40200810984.0, + "step": 9618 + }, + { + "epoch": 1.1430778371954842, + "grad_norm": 0.3288328866016613, + "learning_rate": 9.362931510761753e-06, + "loss": 0.8064, + "num_tokens": 40205001323.0, + "step": 9619 + }, + { + "epoch": 1.1431966726084373, + "grad_norm": 0.3230987737781103, + "learning_rate": 9.36122846895959e-06, + "loss": 0.7573, + "num_tokens": 40209191164.0, + "step": 9620 + }, + { + "epoch": 1.1433155080213904, + "grad_norm": 0.38133089840555284, + "learning_rate": 9.359525487846682e-06, + "loss": 0.8414, + "num_tokens": 40213379364.0, + "step": 9621 + }, + { + "epoch": 1.1434343434343435, + "grad_norm": 0.313343556261842, + "learning_rate": 9.357822567486088e-06, + "loss": 0.812, + "num_tokens": 40217567000.0, + "step": 9622 + }, + { + "epoch": 1.1435531788472966, + "grad_norm": 0.35917279528645435, + "learning_rate": 9.356119707940876e-06, + "loss": 0.8156, + "num_tokens": 40221755316.0, + "step": 9623 + }, + { + "epoch": 1.1436720142602494, + "grad_norm": 0.3370076586422454, + "learning_rate": 9.354416909274113e-06, + "loss": 0.8043, + "num_tokens": 40225915279.0, + "step": 9624 + }, + { + "epoch": 1.1437908496732025, + "grad_norm": 0.32128721266162547, + "learning_rate": 9.352714171548854e-06, + "loss": 0.809, + "num_tokens": 40230105129.0, + "step": 9625 + }, + { + "epoch": 1.1439096850861556, + "grad_norm": 0.3234716183367476, + "learning_rate": 9.35101149482816e-06, + "loss": 0.8038, + "num_tokens": 40234293684.0, + "step": 9626 + }, + { + "epoch": 1.1440285204991087, + "grad_norm": 0.35583946400427635, + "learning_rate": 9.349308879175085e-06, + "loss": 0.799, + "num_tokens": 40238482828.0, + "step": 9627 + }, + { + "epoch": 1.1441473559120618, + "grad_norm": 0.33177600262319773, + "learning_rate": 9.347606324652684e-06, + "loss": 0.8129, + "num_tokens": 40242666388.0, + "step": 9628 + }, + { + "epoch": 1.144266191325015, + "grad_norm": 0.3003356583161238, + "learning_rate": 9.345903831324004e-06, + "loss": 0.8213, + "num_tokens": 40246855504.0, + "step": 9629 + }, + { + "epoch": 1.1443850267379678, + "grad_norm": 0.3637686027495213, + "learning_rate": 9.344201399252099e-06, + "loss": 0.8271, + "num_tokens": 40251044572.0, + "step": 9630 + }, + { + "epoch": 1.144503862150921, + "grad_norm": 0.38455031541985973, + "learning_rate": 9.342499028500019e-06, + "loss": 0.8292, + "num_tokens": 40255234871.0, + "step": 9631 + }, + { + "epoch": 1.144622697563874, + "grad_norm": 0.34138219854390306, + "learning_rate": 9.3407967191308e-06, + "loss": 0.8535, + "num_tokens": 40259405181.0, + "step": 9632 + }, + { + "epoch": 1.1447415329768271, + "grad_norm": 0.36844750701313433, + "learning_rate": 9.33909447120749e-06, + "loss": 0.7748, + "num_tokens": 40263594284.0, + "step": 9633 + }, + { + "epoch": 1.1448603683897802, + "grad_norm": 0.3988793813448177, + "learning_rate": 9.337392284793127e-06, + "loss": 0.782, + "num_tokens": 40267784211.0, + "step": 9634 + }, + { + "epoch": 1.144979203802733, + "grad_norm": 0.34071018123460883, + "learning_rate": 9.33569015995075e-06, + "loss": 0.8068, + "num_tokens": 40271973015.0, + "step": 9635 + }, + { + "epoch": 1.1450980392156862, + "grad_norm": 0.38209291358562314, + "learning_rate": 9.33398809674339e-06, + "loss": 0.8046, + "num_tokens": 40276162725.0, + "step": 9636 + }, + { + "epoch": 1.1452168746286393, + "grad_norm": 0.3283999530682231, + "learning_rate": 9.332286095234087e-06, + "loss": 0.7997, + "num_tokens": 40280352261.0, + "step": 9637 + }, + { + "epoch": 1.1453357100415924, + "grad_norm": 0.33677884820039294, + "learning_rate": 9.330584155485867e-06, + "loss": 0.8229, + "num_tokens": 40284504046.0, + "step": 9638 + }, + { + "epoch": 1.1454545454545455, + "grad_norm": 0.3097985484284245, + "learning_rate": 9.328882277561762e-06, + "loss": 0.7978, + "num_tokens": 40288693764.0, + "step": 9639 + }, + { + "epoch": 1.1455733808674986, + "grad_norm": 0.3591731853243182, + "learning_rate": 9.327180461524797e-06, + "loss": 0.7978, + "num_tokens": 40292852364.0, + "step": 9640 + }, + { + "epoch": 1.1456922162804515, + "grad_norm": 0.28778897118986074, + "learning_rate": 9.325478707437993e-06, + "loss": 0.8205, + "num_tokens": 40297039596.0, + "step": 9641 + }, + { + "epoch": 1.1458110516934046, + "grad_norm": 0.31415408405425127, + "learning_rate": 9.323777015364372e-06, + "loss": 0.8738, + "num_tokens": 40301218432.0, + "step": 9642 + }, + { + "epoch": 1.1459298871063577, + "grad_norm": 0.31757633393831064, + "learning_rate": 9.322075385366958e-06, + "loss": 0.775, + "num_tokens": 40305409713.0, + "step": 9643 + }, + { + "epoch": 1.1460487225193108, + "grad_norm": 0.3175672967954918, + "learning_rate": 9.320373817508768e-06, + "loss": 0.8348, + "num_tokens": 40309597827.0, + "step": 9644 + }, + { + "epoch": 1.1461675579322639, + "grad_norm": 0.2849079947687398, + "learning_rate": 9.318672311852814e-06, + "loss": 0.8509, + "num_tokens": 40313786364.0, + "step": 9645 + }, + { + "epoch": 1.1462863933452168, + "grad_norm": 0.330590198116819, + "learning_rate": 9.316970868462107e-06, + "loss": 0.8069, + "num_tokens": 40317959592.0, + "step": 9646 + }, + { + "epoch": 1.1464052287581699, + "grad_norm": 0.29128538819698, + "learning_rate": 9.315269487399664e-06, + "loss": 0.8416, + "num_tokens": 40322103133.0, + "step": 9647 + }, + { + "epoch": 1.146524064171123, + "grad_norm": 0.29812067720372276, + "learning_rate": 9.313568168728478e-06, + "loss": 0.8329, + "num_tokens": 40326278677.0, + "step": 9648 + }, + { + "epoch": 1.146642899584076, + "grad_norm": 0.359210495142457, + "learning_rate": 9.31186691251157e-06, + "loss": 0.7997, + "num_tokens": 40330465515.0, + "step": 9649 + }, + { + "epoch": 1.1467617349970292, + "grad_norm": 0.290078477370351, + "learning_rate": 9.310165718811941e-06, + "loss": 0.8207, + "num_tokens": 40334654842.0, + "step": 9650 + }, + { + "epoch": 1.1468805704099823, + "grad_norm": 0.3557762018925027, + "learning_rate": 9.308464587692588e-06, + "loss": 0.7794, + "num_tokens": 40338844530.0, + "step": 9651 + }, + { + "epoch": 1.1469994058229354, + "grad_norm": 0.33486126697031243, + "learning_rate": 9.306763519216508e-06, + "loss": 0.8097, + "num_tokens": 40343033220.0, + "step": 9652 + }, + { + "epoch": 1.1471182412358882, + "grad_norm": 0.3590372924445172, + "learning_rate": 9.305062513446701e-06, + "loss": 0.8317, + "num_tokens": 40347197473.0, + "step": 9653 + }, + { + "epoch": 1.1472370766488413, + "grad_norm": 0.323411653698304, + "learning_rate": 9.303361570446161e-06, + "loss": 0.8407, + "num_tokens": 40351378656.0, + "step": 9654 + }, + { + "epoch": 1.1473559120617944, + "grad_norm": 0.3779571237163483, + "learning_rate": 9.301660690277874e-06, + "loss": 0.8323, + "num_tokens": 40355566754.0, + "step": 9655 + }, + { + "epoch": 1.1474747474747475, + "grad_norm": 0.3032759407304565, + "learning_rate": 9.299959873004838e-06, + "loss": 0.861, + "num_tokens": 40359755251.0, + "step": 9656 + }, + { + "epoch": 1.1475935828877004, + "grad_norm": 0.36762890668646264, + "learning_rate": 9.298259118690036e-06, + "loss": 0.8381, + "num_tokens": 40363943586.0, + "step": 9657 + }, + { + "epoch": 1.1477124183006535, + "grad_norm": 0.28975908035107545, + "learning_rate": 9.296558427396452e-06, + "loss": 0.8227, + "num_tokens": 40368122858.0, + "step": 9658 + }, + { + "epoch": 1.1478312537136066, + "grad_norm": 0.34914204564595247, + "learning_rate": 9.294857799187067e-06, + "loss": 0.8206, + "num_tokens": 40372312772.0, + "step": 9659 + }, + { + "epoch": 1.1479500891265597, + "grad_norm": 0.3291293069859636, + "learning_rate": 9.293157234124867e-06, + "loss": 0.8197, + "num_tokens": 40376453849.0, + "step": 9660 + }, + { + "epoch": 1.1480689245395128, + "grad_norm": 0.3337425700252343, + "learning_rate": 9.29145673227282e-06, + "loss": 0.8474, + "num_tokens": 40380642819.0, + "step": 9661 + }, + { + "epoch": 1.148187759952466, + "grad_norm": 0.3161061165543772, + "learning_rate": 9.28975629369391e-06, + "loss": 0.8287, + "num_tokens": 40384807674.0, + "step": 9662 + }, + { + "epoch": 1.148306595365419, + "grad_norm": 0.3005147801046064, + "learning_rate": 9.288055918451108e-06, + "loss": 0.7713, + "num_tokens": 40388992179.0, + "step": 9663 + }, + { + "epoch": 1.148425430778372, + "grad_norm": 0.3463292335971532, + "learning_rate": 9.286355606607382e-06, + "loss": 0.8291, + "num_tokens": 40393168738.0, + "step": 9664 + }, + { + "epoch": 1.148544266191325, + "grad_norm": 0.30128794918575336, + "learning_rate": 9.284655358225703e-06, + "loss": 0.8228, + "num_tokens": 40397345019.0, + "step": 9665 + }, + { + "epoch": 1.148663101604278, + "grad_norm": 0.3352743044738669, + "learning_rate": 9.282955173369034e-06, + "loss": 0.8336, + "num_tokens": 40401535271.0, + "step": 9666 + }, + { + "epoch": 1.1487819370172312, + "grad_norm": 0.3453120430766762, + "learning_rate": 9.281255052100342e-06, + "loss": 0.7934, + "num_tokens": 40405714390.0, + "step": 9667 + }, + { + "epoch": 1.1489007724301843, + "grad_norm": 0.3258131032160566, + "learning_rate": 9.279554994482585e-06, + "loss": 0.7985, + "num_tokens": 40409882443.0, + "step": 9668 + }, + { + "epoch": 1.1490196078431372, + "grad_norm": 0.39367027579376507, + "learning_rate": 9.277855000578723e-06, + "loss": 0.8265, + "num_tokens": 40414050242.0, + "step": 9669 + }, + { + "epoch": 1.1491384432560903, + "grad_norm": 0.31610237389523876, + "learning_rate": 9.276155070451718e-06, + "loss": 0.777, + "num_tokens": 40418229278.0, + "step": 9670 + }, + { + "epoch": 1.1492572786690434, + "grad_norm": 0.3365580306109463, + "learning_rate": 9.274455204164515e-06, + "loss": 0.7915, + "num_tokens": 40422387687.0, + "step": 9671 + }, + { + "epoch": 1.1493761140819965, + "grad_norm": 0.3052802429769662, + "learning_rate": 9.27275540178007e-06, + "loss": 0.8096, + "num_tokens": 40426576347.0, + "step": 9672 + }, + { + "epoch": 1.1494949494949496, + "grad_norm": 0.3749793439427979, + "learning_rate": 9.271055663361337e-06, + "loss": 0.835, + "num_tokens": 40430764607.0, + "step": 9673 + }, + { + "epoch": 1.1496137849079027, + "grad_norm": 0.302637998882389, + "learning_rate": 9.26935598897125e-06, + "loss": 0.8161, + "num_tokens": 40434951638.0, + "step": 9674 + }, + { + "epoch": 1.1497326203208555, + "grad_norm": 0.37787752577996364, + "learning_rate": 9.26765637867277e-06, + "loss": 0.8293, + "num_tokens": 40439141152.0, + "step": 9675 + }, + { + "epoch": 1.1498514557338086, + "grad_norm": 0.31997829618771617, + "learning_rate": 9.265956832528826e-06, + "loss": 0.8221, + "num_tokens": 40443285203.0, + "step": 9676 + }, + { + "epoch": 1.1499702911467617, + "grad_norm": 0.37972520163643214, + "learning_rate": 9.264257350602366e-06, + "loss": 0.8365, + "num_tokens": 40447474108.0, + "step": 9677 + }, + { + "epoch": 1.1500891265597148, + "grad_norm": 0.3133956529516571, + "learning_rate": 9.262557932956326e-06, + "loss": 0.8249, + "num_tokens": 40451663617.0, + "step": 9678 + }, + { + "epoch": 1.150207961972668, + "grad_norm": 0.3809131510248879, + "learning_rate": 9.26085857965364e-06, + "loss": 0.8289, + "num_tokens": 40455809584.0, + "step": 9679 + }, + { + "epoch": 1.1503267973856208, + "grad_norm": 0.3225838052604376, + "learning_rate": 9.259159290757238e-06, + "loss": 0.8117, + "num_tokens": 40459995837.0, + "step": 9680 + }, + { + "epoch": 1.150445632798574, + "grad_norm": 0.3680269694205231, + "learning_rate": 9.257460066330055e-06, + "loss": 0.8253, + "num_tokens": 40464185110.0, + "step": 9681 + }, + { + "epoch": 1.150564468211527, + "grad_norm": 0.35451140496213496, + "learning_rate": 9.255760906435018e-06, + "loss": 0.8498, + "num_tokens": 40468334276.0, + "step": 9682 + }, + { + "epoch": 1.1506833036244801, + "grad_norm": 0.32165076915843593, + "learning_rate": 9.254061811135054e-06, + "loss": 0.8349, + "num_tokens": 40472522435.0, + "step": 9683 + }, + { + "epoch": 1.1508021390374332, + "grad_norm": 0.3115126612752667, + "learning_rate": 9.25236278049308e-06, + "loss": 0.821, + "num_tokens": 40476679016.0, + "step": 9684 + }, + { + "epoch": 1.1509209744503863, + "grad_norm": 0.7535233151013653, + "learning_rate": 9.250663814572022e-06, + "loss": 0.8351, + "num_tokens": 40480861923.0, + "step": 9685 + }, + { + "epoch": 1.1510398098633392, + "grad_norm": 0.41962474615203527, + "learning_rate": 9.248964913434801e-06, + "loss": 0.8162, + "num_tokens": 40485051919.0, + "step": 9686 + }, + { + "epoch": 1.1511586452762923, + "grad_norm": 0.36094010777176583, + "learning_rate": 9.247266077144326e-06, + "loss": 0.8531, + "num_tokens": 40489241891.0, + "step": 9687 + }, + { + "epoch": 1.1512774806892454, + "grad_norm": 0.34288794637038894, + "learning_rate": 9.245567305763517e-06, + "loss": 0.8605, + "num_tokens": 40493428743.0, + "step": 9688 + }, + { + "epoch": 1.1513963161021985, + "grad_norm": 0.3898348706084818, + "learning_rate": 9.243868599355281e-06, + "loss": 0.8224, + "num_tokens": 40497601960.0, + "step": 9689 + }, + { + "epoch": 1.1515151515151516, + "grad_norm": 0.32986868535207103, + "learning_rate": 9.242169957982528e-06, + "loss": 0.8431, + "num_tokens": 40501791533.0, + "step": 9690 + }, + { + "epoch": 1.1516339869281045, + "grad_norm": 0.33375979652610016, + "learning_rate": 9.240471381708167e-06, + "loss": 0.8049, + "num_tokens": 40505980761.0, + "step": 9691 + }, + { + "epoch": 1.1517528223410576, + "grad_norm": 0.3868600156732859, + "learning_rate": 9.238772870595099e-06, + "loss": 0.8429, + "num_tokens": 40510147092.0, + "step": 9692 + }, + { + "epoch": 1.1518716577540107, + "grad_norm": 0.31828852861611046, + "learning_rate": 9.237074424706225e-06, + "loss": 0.8111, + "num_tokens": 40514336997.0, + "step": 9693 + }, + { + "epoch": 1.1519904931669638, + "grad_norm": 0.2969165513979166, + "learning_rate": 9.235376044104446e-06, + "loss": 0.8357, + "num_tokens": 40518494094.0, + "step": 9694 + }, + { + "epoch": 1.1521093285799169, + "grad_norm": 0.36337973995349354, + "learning_rate": 9.233677728852659e-06, + "loss": 0.8322, + "num_tokens": 40522664167.0, + "step": 9695 + }, + { + "epoch": 1.15222816399287, + "grad_norm": 0.30169681303536, + "learning_rate": 9.231979479013757e-06, + "loss": 0.8021, + "num_tokens": 40526833172.0, + "step": 9696 + }, + { + "epoch": 1.1523469994058229, + "grad_norm": 0.34502357651621596, + "learning_rate": 9.230281294650634e-06, + "loss": 0.7584, + "num_tokens": 40531023489.0, + "step": 9697 + }, + { + "epoch": 1.152465834818776, + "grad_norm": 0.31696807396367904, + "learning_rate": 9.228583175826178e-06, + "loss": 0.8507, + "num_tokens": 40535192018.0, + "step": 9698 + }, + { + "epoch": 1.152584670231729, + "grad_norm": 0.31753544237571063, + "learning_rate": 9.226885122603273e-06, + "loss": 0.7799, + "num_tokens": 40539380301.0, + "step": 9699 + }, + { + "epoch": 1.1527035056446822, + "grad_norm": 0.3201997395610484, + "learning_rate": 9.225187135044811e-06, + "loss": 0.81, + "num_tokens": 40543542527.0, + "step": 9700 + }, + { + "epoch": 1.1528223410576353, + "grad_norm": 0.3355180072026388, + "learning_rate": 9.22348921321367e-06, + "loss": 0.8276, + "num_tokens": 40547701666.0, + "step": 9701 + }, + { + "epoch": 1.1529411764705881, + "grad_norm": 0.3068607229111954, + "learning_rate": 9.221791357172729e-06, + "loss": 0.8286, + "num_tokens": 40551861233.0, + "step": 9702 + }, + { + "epoch": 1.1530600118835412, + "grad_norm": 0.3149657566366952, + "learning_rate": 9.220093566984867e-06, + "loss": 0.85, + "num_tokens": 40556044757.0, + "step": 9703 + }, + { + "epoch": 1.1531788472964943, + "grad_norm": 0.2813752609860136, + "learning_rate": 9.21839584271296e-06, + "loss": 0.81, + "num_tokens": 40560224652.0, + "step": 9704 + }, + { + "epoch": 1.1532976827094474, + "grad_norm": 0.3579282664062531, + "learning_rate": 9.216698184419875e-06, + "loss": 0.8244, + "num_tokens": 40564414114.0, + "step": 9705 + }, + { + "epoch": 1.1534165181224005, + "grad_norm": 0.29420074526682066, + "learning_rate": 9.215000592168484e-06, + "loss": 0.8341, + "num_tokens": 40568603461.0, + "step": 9706 + }, + { + "epoch": 1.1535353535353536, + "grad_norm": 0.3250338043625462, + "learning_rate": 9.21330306602166e-06, + "loss": 0.8053, + "num_tokens": 40572789788.0, + "step": 9707 + }, + { + "epoch": 1.1536541889483065, + "grad_norm": 0.7917137442717801, + "learning_rate": 9.211605606042262e-06, + "loss": 0.8356, + "num_tokens": 40576945704.0, + "step": 9708 + }, + { + "epoch": 1.1537730243612596, + "grad_norm": 0.396177395766831, + "learning_rate": 9.20990821229316e-06, + "loss": 0.7923, + "num_tokens": 40581120314.0, + "step": 9709 + }, + { + "epoch": 1.1538918597742127, + "grad_norm": 0.3472737345904707, + "learning_rate": 9.208210884837205e-06, + "loss": 0.7668, + "num_tokens": 40585309176.0, + "step": 9710 + }, + { + "epoch": 1.1540106951871658, + "grad_norm": 0.370764949180795, + "learning_rate": 9.20651362373726e-06, + "loss": 0.8375, + "num_tokens": 40589498076.0, + "step": 9711 + }, + { + "epoch": 1.154129530600119, + "grad_norm": 0.31733773123209996, + "learning_rate": 9.204816429056178e-06, + "loss": 0.7966, + "num_tokens": 40593688653.0, + "step": 9712 + }, + { + "epoch": 1.1542483660130718, + "grad_norm": 0.29876895839652445, + "learning_rate": 9.203119300856815e-06, + "loss": 0.8297, + "num_tokens": 40597878090.0, + "step": 9713 + }, + { + "epoch": 1.154367201426025, + "grad_norm": 0.3230087186375443, + "learning_rate": 9.201422239202023e-06, + "loss": 0.8327, + "num_tokens": 40602059961.0, + "step": 9714 + }, + { + "epoch": 1.154486036838978, + "grad_norm": 0.31720961524067454, + "learning_rate": 9.199725244154643e-06, + "loss": 0.839, + "num_tokens": 40606250808.0, + "step": 9715 + }, + { + "epoch": 1.154604872251931, + "grad_norm": 0.33592779182813143, + "learning_rate": 9.198028315777522e-06, + "loss": 0.8469, + "num_tokens": 40610409703.0, + "step": 9716 + }, + { + "epoch": 1.1547237076648842, + "grad_norm": 0.3010852743258021, + "learning_rate": 9.19633145413351e-06, + "loss": 0.8364, + "num_tokens": 40614598509.0, + "step": 9717 + }, + { + "epoch": 1.1548425430778373, + "grad_norm": 0.3082813761638695, + "learning_rate": 9.194634659285435e-06, + "loss": 0.8415, + "num_tokens": 40618786879.0, + "step": 9718 + }, + { + "epoch": 1.1549613784907902, + "grad_norm": 0.33458184081290554, + "learning_rate": 9.192937931296152e-06, + "loss": 0.7903, + "num_tokens": 40622921773.0, + "step": 9719 + }, + { + "epoch": 1.1550802139037433, + "grad_norm": 0.3152767000461962, + "learning_rate": 9.191241270228482e-06, + "loss": 0.7877, + "num_tokens": 40627111781.0, + "step": 9720 + }, + { + "epoch": 1.1551990493166964, + "grad_norm": 0.29843498455536194, + "learning_rate": 9.189544676145264e-06, + "loss": 0.823, + "num_tokens": 40631300609.0, + "step": 9721 + }, + { + "epoch": 1.1553178847296495, + "grad_norm": 0.3108272098256191, + "learning_rate": 9.187848149109329e-06, + "loss": 0.8385, + "num_tokens": 40635489996.0, + "step": 9722 + }, + { + "epoch": 1.1554367201426026, + "grad_norm": 0.29530338320032745, + "learning_rate": 9.186151689183505e-06, + "loss": 0.8467, + "num_tokens": 40639647151.0, + "step": 9723 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 0.309034189774746, + "learning_rate": 9.184455296430615e-06, + "loss": 0.8455, + "num_tokens": 40643835626.0, + "step": 9724 + }, + { + "epoch": 1.1556743909685085, + "grad_norm": 0.31082766423226926, + "learning_rate": 9.18275897091348e-06, + "loss": 0.7987, + "num_tokens": 40648025200.0, + "step": 9725 + }, + { + "epoch": 1.1557932263814616, + "grad_norm": 0.27073325400391945, + "learning_rate": 9.181062712694928e-06, + "loss": 0.8003, + "num_tokens": 40652172179.0, + "step": 9726 + }, + { + "epoch": 1.1559120617944147, + "grad_norm": 0.3143438636436784, + "learning_rate": 9.179366521837775e-06, + "loss": 0.8372, + "num_tokens": 40656361293.0, + "step": 9727 + }, + { + "epoch": 1.1560308972073678, + "grad_norm": 0.30910691590936024, + "learning_rate": 9.177670398404835e-06, + "loss": 0.7863, + "num_tokens": 40660551744.0, + "step": 9728 + }, + { + "epoch": 1.156149732620321, + "grad_norm": 0.2902497681093304, + "learning_rate": 9.175974342458923e-06, + "loss": 0.8067, + "num_tokens": 40664710338.0, + "step": 9729 + }, + { + "epoch": 1.1562685680332738, + "grad_norm": 0.29655749912235896, + "learning_rate": 9.174278354062844e-06, + "loss": 0.8183, + "num_tokens": 40668858408.0, + "step": 9730 + }, + { + "epoch": 1.156387403446227, + "grad_norm": 0.32760281028665644, + "learning_rate": 9.172582433279411e-06, + "loss": 0.8146, + "num_tokens": 40673047362.0, + "step": 9731 + }, + { + "epoch": 1.15650623885918, + "grad_norm": 0.3068497163465103, + "learning_rate": 9.170886580171432e-06, + "loss": 0.8121, + "num_tokens": 40677213914.0, + "step": 9732 + }, + { + "epoch": 1.1566250742721331, + "grad_norm": 0.3519855091641257, + "learning_rate": 9.169190794801707e-06, + "loss": 0.8565, + "num_tokens": 40681403609.0, + "step": 9733 + }, + { + "epoch": 1.1567439096850862, + "grad_norm": 0.33556064898732596, + "learning_rate": 9.167495077233038e-06, + "loss": 0.8167, + "num_tokens": 40685592419.0, + "step": 9734 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 0.32895723401638244, + "learning_rate": 9.165799427528218e-06, + "loss": 0.8358, + "num_tokens": 40689782871.0, + "step": 9735 + }, + { + "epoch": 1.1569815805109922, + "grad_norm": 0.3233151032769251, + "learning_rate": 9.164103845750052e-06, + "loss": 0.7929, + "num_tokens": 40693972378.0, + "step": 9736 + }, + { + "epoch": 1.1571004159239453, + "grad_norm": 0.31944903767239496, + "learning_rate": 9.162408331961322e-06, + "loss": 0.8095, + "num_tokens": 40698161046.0, + "step": 9737 + }, + { + "epoch": 1.1572192513368984, + "grad_norm": 0.33697739342480526, + "learning_rate": 9.160712886224828e-06, + "loss": 0.7855, + "num_tokens": 40702348722.0, + "step": 9738 + }, + { + "epoch": 1.1573380867498515, + "grad_norm": 0.34550592793778484, + "learning_rate": 9.159017508603354e-06, + "loss": 0.7972, + "num_tokens": 40706509846.0, + "step": 9739 + }, + { + "epoch": 1.1574569221628046, + "grad_norm": 0.35273120879815223, + "learning_rate": 9.157322199159685e-06, + "loss": 0.8516, + "num_tokens": 40710697928.0, + "step": 9740 + }, + { + "epoch": 1.1575757575757575, + "grad_norm": 0.33406190646223155, + "learning_rate": 9.15562695795661e-06, + "loss": 0.8084, + "num_tokens": 40714887304.0, + "step": 9741 + }, + { + "epoch": 1.1576945929887106, + "grad_norm": 0.3359009597226671, + "learning_rate": 9.1539317850569e-06, + "loss": 0.8146, + "num_tokens": 40719058373.0, + "step": 9742 + }, + { + "epoch": 1.1578134284016637, + "grad_norm": 0.32977877977054976, + "learning_rate": 9.152236680523338e-06, + "loss": 0.8102, + "num_tokens": 40723247280.0, + "step": 9743 + }, + { + "epoch": 1.1579322638146168, + "grad_norm": 0.3603445827748465, + "learning_rate": 9.150541644418701e-06, + "loss": 0.8243, + "num_tokens": 40727436659.0, + "step": 9744 + }, + { + "epoch": 1.1580510992275699, + "grad_norm": 0.3417770184506429, + "learning_rate": 9.148846676805758e-06, + "loss": 0.8322, + "num_tokens": 40731597541.0, + "step": 9745 + }, + { + "epoch": 1.1581699346405228, + "grad_norm": 0.3155117654816788, + "learning_rate": 9.147151777747286e-06, + "loss": 0.8463, + "num_tokens": 40735785687.0, + "step": 9746 + }, + { + "epoch": 1.1582887700534759, + "grad_norm": 0.37339368460558103, + "learning_rate": 9.145456947306041e-06, + "loss": 0.846, + "num_tokens": 40739975756.0, + "step": 9747 + }, + { + "epoch": 1.158407605466429, + "grad_norm": 0.29522988871445666, + "learning_rate": 9.1437621855448e-06, + "loss": 0.8068, + "num_tokens": 40744143843.0, + "step": 9748 + }, + { + "epoch": 1.158526440879382, + "grad_norm": 0.3875925349740572, + "learning_rate": 9.142067492526323e-06, + "loss": 0.8229, + "num_tokens": 40748333394.0, + "step": 9749 + }, + { + "epoch": 1.1586452762923352, + "grad_norm": 0.33277943728150394, + "learning_rate": 9.140372868313366e-06, + "loss": 0.8084, + "num_tokens": 40752521007.0, + "step": 9750 + }, + { + "epoch": 1.1587641117052883, + "grad_norm": 0.4364628086157625, + "learning_rate": 9.13867831296869e-06, + "loss": 0.8463, + "num_tokens": 40756674700.0, + "step": 9751 + }, + { + "epoch": 1.1588829471182414, + "grad_norm": 0.3019050559293005, + "learning_rate": 9.136983826555047e-06, + "loss": 0.8275, + "num_tokens": 40760862190.0, + "step": 9752 + }, + { + "epoch": 1.1590017825311942, + "grad_norm": 0.3864893044961412, + "learning_rate": 9.135289409135195e-06, + "loss": 0.8126, + "num_tokens": 40765051927.0, + "step": 9753 + }, + { + "epoch": 1.1591206179441473, + "grad_norm": 0.3300769425899869, + "learning_rate": 9.133595060771883e-06, + "loss": 0.8349, + "num_tokens": 40769231496.0, + "step": 9754 + }, + { + "epoch": 1.1592394533571004, + "grad_norm": 0.3453299175182728, + "learning_rate": 9.131900781527852e-06, + "loss": 0.7949, + "num_tokens": 40773420036.0, + "step": 9755 + }, + { + "epoch": 1.1593582887700535, + "grad_norm": 0.3702022091916504, + "learning_rate": 9.130206571465849e-06, + "loss": 0.8363, + "num_tokens": 40777598719.0, + "step": 9756 + }, + { + "epoch": 1.1594771241830064, + "grad_norm": 0.37039851398848694, + "learning_rate": 9.128512430648623e-06, + "loss": 0.8296, + "num_tokens": 40781787411.0, + "step": 9757 + }, + { + "epoch": 1.1595959595959595, + "grad_norm": 0.3713199974025909, + "learning_rate": 9.126818359138906e-06, + "loss": 0.8303, + "num_tokens": 40785976821.0, + "step": 9758 + }, + { + "epoch": 1.1597147950089126, + "grad_norm": 0.40857449770908116, + "learning_rate": 9.125124356999445e-06, + "loss": 0.8437, + "num_tokens": 40790166618.0, + "step": 9759 + }, + { + "epoch": 1.1598336304218657, + "grad_norm": 0.37220262360735556, + "learning_rate": 9.123430424292966e-06, + "loss": 0.8205, + "num_tokens": 40794356879.0, + "step": 9760 + }, + { + "epoch": 1.1599524658348188, + "grad_norm": 0.3360719182158104, + "learning_rate": 9.1217365610822e-06, + "loss": 0.851, + "num_tokens": 40798525562.0, + "step": 9761 + }, + { + "epoch": 1.160071301247772, + "grad_norm": 0.47983292903638625, + "learning_rate": 9.120042767429883e-06, + "loss": 0.8297, + "num_tokens": 40802713398.0, + "step": 9762 + }, + { + "epoch": 1.160190136660725, + "grad_norm": 0.3593871635694528, + "learning_rate": 9.11834904339873e-06, + "loss": 0.8437, + "num_tokens": 40806901358.0, + "step": 9763 + }, + { + "epoch": 1.160308972073678, + "grad_norm": 0.47306094117555325, + "learning_rate": 9.116655389051483e-06, + "loss": 0.8024, + "num_tokens": 40811074111.0, + "step": 9764 + }, + { + "epoch": 1.160427807486631, + "grad_norm": 0.36950107095035944, + "learning_rate": 9.114961804450853e-06, + "loss": 0.8111, + "num_tokens": 40815263867.0, + "step": 9765 + }, + { + "epoch": 1.160546642899584, + "grad_norm": 0.5107625156173096, + "learning_rate": 9.113268289659561e-06, + "loss": 0.8187, + "num_tokens": 40819453247.0, + "step": 9766 + }, + { + "epoch": 1.1606654783125372, + "grad_norm": 0.3991978812933918, + "learning_rate": 9.111574844740324e-06, + "loss": 0.8016, + "num_tokens": 40823642591.0, + "step": 9767 + }, + { + "epoch": 1.1607843137254903, + "grad_norm": 0.4502887844931089, + "learning_rate": 9.109881469755855e-06, + "loss": 0.7908, + "num_tokens": 40827831433.0, + "step": 9768 + }, + { + "epoch": 1.1609031491384432, + "grad_norm": 0.3937257803920428, + "learning_rate": 9.108188164768861e-06, + "loss": 0.7803, + "num_tokens": 40832007447.0, + "step": 9769 + }, + { + "epoch": 1.1610219845513963, + "grad_norm": 0.4109402677867984, + "learning_rate": 9.106494929842058e-06, + "loss": 0.8047, + "num_tokens": 40836182047.0, + "step": 9770 + }, + { + "epoch": 1.1611408199643494, + "grad_norm": 0.3625957810164411, + "learning_rate": 9.104801765038151e-06, + "loss": 0.82, + "num_tokens": 40840357382.0, + "step": 9771 + }, + { + "epoch": 1.1612596553773025, + "grad_norm": 0.4334162210105866, + "learning_rate": 9.103108670419845e-06, + "loss": 0.7995, + "num_tokens": 40844532811.0, + "step": 9772 + }, + { + "epoch": 1.1613784907902556, + "grad_norm": 0.331000713324505, + "learning_rate": 9.101415646049835e-06, + "loss": 0.8291, + "num_tokens": 40848717631.0, + "step": 9773 + }, + { + "epoch": 1.1614973262032087, + "grad_norm": 0.44701778512803914, + "learning_rate": 9.099722691990824e-06, + "loss": 0.8481, + "num_tokens": 40852906777.0, + "step": 9774 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 0.4016145107887866, + "learning_rate": 9.098029808305511e-06, + "loss": 0.8218, + "num_tokens": 40857077225.0, + "step": 9775 + }, + { + "epoch": 1.1617349970291146, + "grad_norm": 0.41375826788051395, + "learning_rate": 9.096336995056576e-06, + "loss": 0.8264, + "num_tokens": 40861247861.0, + "step": 9776 + }, + { + "epoch": 1.1618538324420677, + "grad_norm": 0.4189827832164728, + "learning_rate": 9.094644252306725e-06, + "loss": 0.8554, + "num_tokens": 40865437450.0, + "step": 9777 + }, + { + "epoch": 1.1619726678550208, + "grad_norm": 0.37845744255435954, + "learning_rate": 9.092951580118639e-06, + "loss": 0.8319, + "num_tokens": 40869619157.0, + "step": 9778 + }, + { + "epoch": 1.162091503267974, + "grad_norm": 0.3975438680710618, + "learning_rate": 9.091258978555e-06, + "loss": 0.8387, + "num_tokens": 40873807583.0, + "step": 9779 + }, + { + "epoch": 1.1622103386809268, + "grad_norm": 0.3858417903161389, + "learning_rate": 9.089566447678499e-06, + "loss": 0.8421, + "num_tokens": 40877997714.0, + "step": 9780 + }, + { + "epoch": 1.16232917409388, + "grad_norm": 0.3535515884332668, + "learning_rate": 9.087873987551811e-06, + "loss": 0.8364, + "num_tokens": 40882186099.0, + "step": 9781 + }, + { + "epoch": 1.162448009506833, + "grad_norm": 0.34384189706678486, + "learning_rate": 9.086181598237611e-06, + "loss": 0.8126, + "num_tokens": 40886364095.0, + "step": 9782 + }, + { + "epoch": 1.1625668449197861, + "grad_norm": 0.33461497484323566, + "learning_rate": 9.08448927979858e-06, + "loss": 0.8309, + "num_tokens": 40890553311.0, + "step": 9783 + }, + { + "epoch": 1.1626856803327392, + "grad_norm": 0.39515039866090357, + "learning_rate": 9.082797032297386e-06, + "loss": 0.8008, + "num_tokens": 40894671982.0, + "step": 9784 + }, + { + "epoch": 1.1628045157456923, + "grad_norm": 0.3270268951366013, + "learning_rate": 9.081104855796704e-06, + "loss": 0.8481, + "num_tokens": 40898861254.0, + "step": 9785 + }, + { + "epoch": 1.1629233511586452, + "grad_norm": 0.4273785925487552, + "learning_rate": 9.079412750359193e-06, + "loss": 0.8002, + "num_tokens": 40903050837.0, + "step": 9786 + }, + { + "epoch": 1.1630421865715983, + "grad_norm": 0.31575240540762783, + "learning_rate": 9.077720716047522e-06, + "loss": 0.811, + "num_tokens": 40907241359.0, + "step": 9787 + }, + { + "epoch": 1.1631610219845514, + "grad_norm": 0.39226542366054107, + "learning_rate": 9.07602875292435e-06, + "loss": 0.795, + "num_tokens": 40911432446.0, + "step": 9788 + }, + { + "epoch": 1.1632798573975045, + "grad_norm": 0.33406484768802935, + "learning_rate": 9.074336861052343e-06, + "loss": 0.8297, + "num_tokens": 40915621479.0, + "step": 9789 + }, + { + "epoch": 1.1633986928104576, + "grad_norm": 0.3730178649630084, + "learning_rate": 9.072645040494153e-06, + "loss": 0.7934, + "num_tokens": 40919759117.0, + "step": 9790 + }, + { + "epoch": 1.1635175282234105, + "grad_norm": 0.3525573066939503, + "learning_rate": 9.07095329131243e-06, + "loss": 0.7932, + "num_tokens": 40923922474.0, + "step": 9791 + }, + { + "epoch": 1.1636363636363636, + "grad_norm": 0.3605001002102898, + "learning_rate": 9.069261613569833e-06, + "loss": 0.8343, + "num_tokens": 40928111422.0, + "step": 9792 + }, + { + "epoch": 1.1637551990493167, + "grad_norm": 0.3037867420326157, + "learning_rate": 9.067570007329006e-06, + "loss": 0.8428, + "num_tokens": 40932300861.0, + "step": 9793 + }, + { + "epoch": 1.1638740344622698, + "grad_norm": 0.3451734294395418, + "learning_rate": 9.065878472652591e-06, + "loss": 0.8036, + "num_tokens": 40936490149.0, + "step": 9794 + }, + { + "epoch": 1.1639928698752229, + "grad_norm": 0.35289982525110447, + "learning_rate": 9.064187009603236e-06, + "loss": 0.8169, + "num_tokens": 40940660280.0, + "step": 9795 + }, + { + "epoch": 1.164111705288176, + "grad_norm": 0.4046556153299146, + "learning_rate": 9.062495618243585e-06, + "loss": 0.8551, + "num_tokens": 40944851032.0, + "step": 9796 + }, + { + "epoch": 1.1642305407011289, + "grad_norm": 0.3267674640697363, + "learning_rate": 9.06080429863627e-06, + "loss": 0.8008, + "num_tokens": 40949015561.0, + "step": 9797 + }, + { + "epoch": 1.164349376114082, + "grad_norm": 0.42203867342718543, + "learning_rate": 9.05911305084393e-06, + "loss": 0.8572, + "num_tokens": 40953182567.0, + "step": 9798 + }, + { + "epoch": 1.164468211527035, + "grad_norm": 0.309918481944124, + "learning_rate": 9.057421874929195e-06, + "loss": 0.7947, + "num_tokens": 40957347666.0, + "step": 9799 + }, + { + "epoch": 1.1645870469399882, + "grad_norm": 0.49298700170833964, + "learning_rate": 9.055730770954697e-06, + "loss": 0.851, + "num_tokens": 40961500308.0, + "step": 9800 + }, + { + "epoch": 1.1647058823529413, + "grad_norm": 0.3578335324907177, + "learning_rate": 9.054039738983058e-06, + "loss": 0.8312, + "num_tokens": 40965689105.0, + "step": 9801 + }, + { + "epoch": 1.1648247177658941, + "grad_norm": 0.4313409500939293, + "learning_rate": 9.05234877907691e-06, + "loss": 0.8125, + "num_tokens": 40969877321.0, + "step": 9802 + }, + { + "epoch": 1.1649435531788472, + "grad_norm": 0.37729150272339507, + "learning_rate": 9.050657891298872e-06, + "loss": 0.7894, + "num_tokens": 40974032386.0, + "step": 9803 + }, + { + "epoch": 1.1650623885918003, + "grad_norm": 0.43213773209760126, + "learning_rate": 9.048967075711563e-06, + "loss": 0.8152, + "num_tokens": 40978221785.0, + "step": 9804 + }, + { + "epoch": 1.1651812240047534, + "grad_norm": 0.3556196974760427, + "learning_rate": 9.047276332377602e-06, + "loss": 0.8254, + "num_tokens": 40982411423.0, + "step": 9805 + }, + { + "epoch": 1.1653000594177065, + "grad_norm": 0.36505639714754223, + "learning_rate": 9.045585661359597e-06, + "loss": 0.8193, + "num_tokens": 40986572546.0, + "step": 9806 + }, + { + "epoch": 1.1654188948306596, + "grad_norm": 0.343404007850398, + "learning_rate": 9.043895062720164e-06, + "loss": 0.7944, + "num_tokens": 40990762087.0, + "step": 9807 + }, + { + "epoch": 1.1655377302436125, + "grad_norm": 0.39504216960159716, + "learning_rate": 9.04220453652191e-06, + "loss": 0.8126, + "num_tokens": 40994930951.0, + "step": 9808 + }, + { + "epoch": 1.1656565656565656, + "grad_norm": 0.34853193235557983, + "learning_rate": 9.040514082827443e-06, + "loss": 0.803, + "num_tokens": 40999092757.0, + "step": 9809 + }, + { + "epoch": 1.1657754010695187, + "grad_norm": 0.3870265856941057, + "learning_rate": 9.038823701699365e-06, + "loss": 0.8022, + "num_tokens": 41003277678.0, + "step": 9810 + }, + { + "epoch": 1.1658942364824718, + "grad_norm": 0.3725994150167966, + "learning_rate": 9.037133393200277e-06, + "loss": 0.8233, + "num_tokens": 41007468598.0, + "step": 9811 + }, + { + "epoch": 1.166013071895425, + "grad_norm": 0.3203222711707088, + "learning_rate": 9.035443157392773e-06, + "loss": 0.7756, + "num_tokens": 41011656532.0, + "step": 9812 + }, + { + "epoch": 1.1661319073083778, + "grad_norm": 0.3460699584173271, + "learning_rate": 9.033752994339453e-06, + "loss": 0.8252, + "num_tokens": 41015831771.0, + "step": 9813 + }, + { + "epoch": 1.166250742721331, + "grad_norm": 0.3003342289693502, + "learning_rate": 9.032062904102907e-06, + "loss": 0.791, + "num_tokens": 41020020810.0, + "step": 9814 + }, + { + "epoch": 1.166369578134284, + "grad_norm": 0.32395178495174665, + "learning_rate": 9.030372886745724e-06, + "loss": 0.8179, + "num_tokens": 41024184534.0, + "step": 9815 + }, + { + "epoch": 1.166488413547237, + "grad_norm": 0.30239463900479374, + "learning_rate": 9.028682942330498e-06, + "loss": 0.8478, + "num_tokens": 41028374456.0, + "step": 9816 + }, + { + "epoch": 1.1666072489601902, + "grad_norm": 0.29524677315932357, + "learning_rate": 9.026993070919803e-06, + "loss": 0.819, + "num_tokens": 41032543614.0, + "step": 9817 + }, + { + "epoch": 1.1667260843731433, + "grad_norm": 0.297016189227381, + "learning_rate": 9.025303272576227e-06, + "loss": 0.8651, + "num_tokens": 41036724159.0, + "step": 9818 + }, + { + "epoch": 1.1668449197860962, + "grad_norm": 0.34704149313774907, + "learning_rate": 9.023613547362348e-06, + "loss": 0.8452, + "num_tokens": 41040911030.0, + "step": 9819 + }, + { + "epoch": 1.1669637551990493, + "grad_norm": 0.27513565092856423, + "learning_rate": 9.021923895340736e-06, + "loss": 0.7819, + "num_tokens": 41045098282.0, + "step": 9820 + }, + { + "epoch": 1.1670825906120024, + "grad_norm": 0.3183274583289233, + "learning_rate": 9.020234316573977e-06, + "loss": 0.8333, + "num_tokens": 41049287343.0, + "step": 9821 + }, + { + "epoch": 1.1672014260249555, + "grad_norm": 0.3100831401462159, + "learning_rate": 9.018544811124634e-06, + "loss": 0.8091, + "num_tokens": 41053477065.0, + "step": 9822 + }, + { + "epoch": 1.1673202614379086, + "grad_norm": 0.30561260734958495, + "learning_rate": 9.016855379055274e-06, + "loss": 0.8163, + "num_tokens": 41057649639.0, + "step": 9823 + }, + { + "epoch": 1.1674390968508614, + "grad_norm": 0.33960954262623977, + "learning_rate": 9.015166020428468e-06, + "loss": 0.837, + "num_tokens": 41061837845.0, + "step": 9824 + }, + { + "epoch": 1.1675579322638145, + "grad_norm": 0.28200861950551137, + "learning_rate": 9.013476735306773e-06, + "loss": 0.8078, + "num_tokens": 41066006538.0, + "step": 9825 + }, + { + "epoch": 1.1676767676767676, + "grad_norm": 0.30753244435497434, + "learning_rate": 9.011787523752748e-06, + "loss": 0.8105, + "num_tokens": 41070142724.0, + "step": 9826 + }, + { + "epoch": 1.1677956030897207, + "grad_norm": 0.30710475316171854, + "learning_rate": 9.010098385828955e-06, + "loss": 0.7896, + "num_tokens": 41074311639.0, + "step": 9827 + }, + { + "epoch": 1.1679144385026738, + "grad_norm": 0.27904994269076, + "learning_rate": 9.008409321597949e-06, + "loss": 0.8236, + "num_tokens": 41078501531.0, + "step": 9828 + }, + { + "epoch": 1.168033273915627, + "grad_norm": 0.33952827339959923, + "learning_rate": 9.00672033112228e-06, + "loss": 0.8127, + "num_tokens": 41082663492.0, + "step": 9829 + }, + { + "epoch": 1.1681521093285798, + "grad_norm": 0.3240508137484174, + "learning_rate": 9.005031414464496e-06, + "loss": 0.8046, + "num_tokens": 41086851295.0, + "step": 9830 + }, + { + "epoch": 1.168270944741533, + "grad_norm": 0.32070059645443955, + "learning_rate": 9.003342571687141e-06, + "loss": 0.7969, + "num_tokens": 41091038160.0, + "step": 9831 + }, + { + "epoch": 1.168389780154486, + "grad_norm": 0.32526168122187926, + "learning_rate": 9.001653802852766e-06, + "loss": 0.8378, + "num_tokens": 41095206678.0, + "step": 9832 + }, + { + "epoch": 1.1685086155674391, + "grad_norm": 0.2757660814672968, + "learning_rate": 8.9999651080239e-06, + "loss": 0.8266, + "num_tokens": 41099396143.0, + "step": 9833 + }, + { + "epoch": 1.1686274509803922, + "grad_norm": 0.2974487535444315, + "learning_rate": 8.998276487263093e-06, + "loss": 0.8468, + "num_tokens": 41103584928.0, + "step": 9834 + }, + { + "epoch": 1.168746286393345, + "grad_norm": 0.305891983928791, + "learning_rate": 8.996587940632875e-06, + "loss": 0.812, + "num_tokens": 41107774301.0, + "step": 9835 + }, + { + "epoch": 1.1688651218062982, + "grad_norm": 0.32564949931695436, + "learning_rate": 8.994899468195781e-06, + "loss": 0.8186, + "num_tokens": 41111963961.0, + "step": 9836 + }, + { + "epoch": 1.1689839572192513, + "grad_norm": 0.2995637253944648, + "learning_rate": 8.993211070014338e-06, + "loss": 0.8324, + "num_tokens": 41116153869.0, + "step": 9837 + }, + { + "epoch": 1.1691027926322044, + "grad_norm": 0.33000217976808965, + "learning_rate": 8.991522746151076e-06, + "loss": 0.8242, + "num_tokens": 41120342384.0, + "step": 9838 + }, + { + "epoch": 1.1692216280451575, + "grad_norm": 0.3260274681472151, + "learning_rate": 8.989834496668512e-06, + "loss": 0.7993, + "num_tokens": 41124522789.0, + "step": 9839 + }, + { + "epoch": 1.1693404634581106, + "grad_norm": 0.3558646280869766, + "learning_rate": 8.988146321629176e-06, + "loss": 0.8156, + "num_tokens": 41128683258.0, + "step": 9840 + }, + { + "epoch": 1.1694592988710637, + "grad_norm": 0.2930861018886482, + "learning_rate": 8.986458221095587e-06, + "loss": 0.8444, + "num_tokens": 41132846623.0, + "step": 9841 + }, + { + "epoch": 1.1695781342840166, + "grad_norm": 0.29676731317636107, + "learning_rate": 8.984770195130257e-06, + "loss": 0.8066, + "num_tokens": 41137036929.0, + "step": 9842 + }, + { + "epoch": 1.1696969696969697, + "grad_norm": 0.30922659140902964, + "learning_rate": 8.983082243795697e-06, + "loss": 0.8155, + "num_tokens": 41141195588.0, + "step": 9843 + }, + { + "epoch": 1.1698158051099228, + "grad_norm": 0.2923302450355453, + "learning_rate": 8.981394367154424e-06, + "loss": 0.815, + "num_tokens": 41145355893.0, + "step": 9844 + }, + { + "epoch": 1.1699346405228759, + "grad_norm": 0.3654465048224207, + "learning_rate": 8.97970656526894e-06, + "loss": 0.834, + "num_tokens": 41149543659.0, + "step": 9845 + }, + { + "epoch": 1.1700534759358288, + "grad_norm": 0.28065625405934724, + "learning_rate": 8.978018838201752e-06, + "loss": 0.8027, + "num_tokens": 41153733037.0, + "step": 9846 + }, + { + "epoch": 1.1701723113487819, + "grad_norm": 0.3580433559997853, + "learning_rate": 8.976331186015368e-06, + "loss": 0.804, + "num_tokens": 41157892941.0, + "step": 9847 + }, + { + "epoch": 1.170291146761735, + "grad_norm": 0.29582684656474706, + "learning_rate": 8.974643608772279e-06, + "loss": 0.8443, + "num_tokens": 41162080841.0, + "step": 9848 + }, + { + "epoch": 1.170409982174688, + "grad_norm": 0.33427451807688124, + "learning_rate": 8.972956106534984e-06, + "loss": 0.8025, + "num_tokens": 41166271421.0, + "step": 9849 + }, + { + "epoch": 1.1705288175876412, + "grad_norm": 0.2958452739884194, + "learning_rate": 8.971268679365983e-06, + "loss": 0.7967, + "num_tokens": 41170462280.0, + "step": 9850 + }, + { + "epoch": 1.1706476530005943, + "grad_norm": 0.3289297512554793, + "learning_rate": 8.969581327327759e-06, + "loss": 0.8327, + "num_tokens": 41174641605.0, + "step": 9851 + }, + { + "epoch": 1.1707664884135474, + "grad_norm": 0.31423461330883207, + "learning_rate": 8.967894050482799e-06, + "loss": 0.7922, + "num_tokens": 41178831781.0, + "step": 9852 + }, + { + "epoch": 1.1708853238265002, + "grad_norm": 0.3574365006551595, + "learning_rate": 8.966206848893598e-06, + "loss": 0.8262, + "num_tokens": 41183020487.0, + "step": 9853 + }, + { + "epoch": 1.1710041592394533, + "grad_norm": 0.3341713476288558, + "learning_rate": 8.964519722622634e-06, + "loss": 0.8002, + "num_tokens": 41187183482.0, + "step": 9854 + }, + { + "epoch": 1.1711229946524064, + "grad_norm": 0.3470403997580934, + "learning_rate": 8.962832671732387e-06, + "loss": 0.8222, + "num_tokens": 41191368434.0, + "step": 9855 + }, + { + "epoch": 1.1712418300653595, + "grad_norm": 0.34341233345243016, + "learning_rate": 8.961145696285331e-06, + "loss": 0.7754, + "num_tokens": 41195557596.0, + "step": 9856 + }, + { + "epoch": 1.1713606654783124, + "grad_norm": 0.3078574698113713, + "learning_rate": 8.959458796343946e-06, + "loss": 0.8406, + "num_tokens": 41199747364.0, + "step": 9857 + }, + { + "epoch": 1.1714795008912655, + "grad_norm": 0.35795164885689895, + "learning_rate": 8.957771971970696e-06, + "loss": 0.8508, + "num_tokens": 41203916320.0, + "step": 9858 + }, + { + "epoch": 1.1715983363042186, + "grad_norm": 0.29330328398873595, + "learning_rate": 8.956085223228059e-06, + "loss": 0.8489, + "num_tokens": 41208106644.0, + "step": 9859 + }, + { + "epoch": 1.1717171717171717, + "grad_norm": 0.3505003415150207, + "learning_rate": 8.9543985501785e-06, + "loss": 0.789, + "num_tokens": 41212282735.0, + "step": 9860 + }, + { + "epoch": 1.1718360071301248, + "grad_norm": 0.3344778191111047, + "learning_rate": 8.952711952884474e-06, + "loss": 0.7786, + "num_tokens": 41216471170.0, + "step": 9861 + }, + { + "epoch": 1.171954842543078, + "grad_norm": 0.32110124709607024, + "learning_rate": 8.951025431408449e-06, + "loss": 0.8296, + "num_tokens": 41220632577.0, + "step": 9862 + }, + { + "epoch": 1.172073677956031, + "grad_norm": 0.3460230676195287, + "learning_rate": 8.949338985812878e-06, + "loss": 0.8308, + "num_tokens": 41224820467.0, + "step": 9863 + }, + { + "epoch": 1.172192513368984, + "grad_norm": 0.352656441430803, + "learning_rate": 8.94765261616022e-06, + "loss": 0.8368, + "num_tokens": 41228979544.0, + "step": 9864 + }, + { + "epoch": 1.172311348781937, + "grad_norm": 0.33050703569964734, + "learning_rate": 8.945966322512922e-06, + "loss": 0.8237, + "num_tokens": 41233168565.0, + "step": 9865 + }, + { + "epoch": 1.17243018419489, + "grad_norm": 0.3321386162664655, + "learning_rate": 8.944280104933439e-06, + "loss": 0.7762, + "num_tokens": 41237344514.0, + "step": 9866 + }, + { + "epoch": 1.1725490196078432, + "grad_norm": 0.32248894486023283, + "learning_rate": 8.942593963484211e-06, + "loss": 0.8209, + "num_tokens": 41241503291.0, + "step": 9867 + }, + { + "epoch": 1.1726678550207963, + "grad_norm": 0.3491676356042468, + "learning_rate": 8.940907898227688e-06, + "loss": 0.8242, + "num_tokens": 41245674035.0, + "step": 9868 + }, + { + "epoch": 1.1727866904337492, + "grad_norm": 0.3110928658234267, + "learning_rate": 8.939221909226307e-06, + "loss": 0.8056, + "num_tokens": 41249861881.0, + "step": 9869 + }, + { + "epoch": 1.1729055258467023, + "grad_norm": 0.34050754558763713, + "learning_rate": 8.937535996542505e-06, + "loss": 0.8066, + "num_tokens": 41254051524.0, + "step": 9870 + }, + { + "epoch": 1.1730243612596554, + "grad_norm": 0.3152012738752913, + "learning_rate": 8.935850160238717e-06, + "loss": 0.7718, + "num_tokens": 41258241424.0, + "step": 9871 + }, + { + "epoch": 1.1731431966726085, + "grad_norm": 0.35119273180016447, + "learning_rate": 8.934164400377382e-06, + "loss": 0.809, + "num_tokens": 41262430763.0, + "step": 9872 + }, + { + "epoch": 1.1732620320855616, + "grad_norm": 0.39278117022414516, + "learning_rate": 8.932478717020918e-06, + "loss": 0.8074, + "num_tokens": 41266620056.0, + "step": 9873 + }, + { + "epoch": 1.1733808674985147, + "grad_norm": 0.30056859434219246, + "learning_rate": 8.930793110231763e-06, + "loss": 0.8569, + "num_tokens": 41270807138.0, + "step": 9874 + }, + { + "epoch": 1.1734997029114675, + "grad_norm": 0.2889852995222132, + "learning_rate": 8.929107580072331e-06, + "loss": 0.8372, + "num_tokens": 41274996657.0, + "step": 9875 + }, + { + "epoch": 1.1736185383244206, + "grad_norm": 0.35208304594733264, + "learning_rate": 8.927422126605048e-06, + "loss": 0.8263, + "num_tokens": 41279181569.0, + "step": 9876 + }, + { + "epoch": 1.1737373737373737, + "grad_norm": 0.3116657854488748, + "learning_rate": 8.92573674989233e-06, + "loss": 0.7957, + "num_tokens": 41283372595.0, + "step": 9877 + }, + { + "epoch": 1.1738562091503268, + "grad_norm": 0.30521737718150893, + "learning_rate": 8.924051449996596e-06, + "loss": 0.7832, + "num_tokens": 41287562044.0, + "step": 9878 + }, + { + "epoch": 1.17397504456328, + "grad_norm": 0.3459265823596117, + "learning_rate": 8.922366226980257e-06, + "loss": 0.8292, + "num_tokens": 41291733585.0, + "step": 9879 + }, + { + "epoch": 1.1740938799762328, + "grad_norm": 0.3058596604867799, + "learning_rate": 8.92068108090572e-06, + "loss": 0.8043, + "num_tokens": 41295921016.0, + "step": 9880 + }, + { + "epoch": 1.174212715389186, + "grad_norm": 0.29695028857175665, + "learning_rate": 8.918996011835391e-06, + "loss": 0.8269, + "num_tokens": 41300076160.0, + "step": 9881 + }, + { + "epoch": 1.174331550802139, + "grad_norm": 0.3537932996827625, + "learning_rate": 8.917311019831678e-06, + "loss": 0.8231, + "num_tokens": 41304265246.0, + "step": 9882 + }, + { + "epoch": 1.1744503862150921, + "grad_norm": 0.3151970797692106, + "learning_rate": 8.915626104956978e-06, + "loss": 0.7963, + "num_tokens": 41308454789.0, + "step": 9883 + }, + { + "epoch": 1.1745692216280452, + "grad_norm": 0.3452133287166752, + "learning_rate": 8.91394126727369e-06, + "loss": 0.7953, + "num_tokens": 41312644763.0, + "step": 9884 + }, + { + "epoch": 1.1746880570409983, + "grad_norm": 0.3182491391451872, + "learning_rate": 8.91225650684421e-06, + "loss": 0.8276, + "num_tokens": 41316834217.0, + "step": 9885 + }, + { + "epoch": 1.1748068924539512, + "grad_norm": 0.3419549200274512, + "learning_rate": 8.910571823730932e-06, + "loss": 0.8272, + "num_tokens": 41320992483.0, + "step": 9886 + }, + { + "epoch": 1.1749257278669043, + "grad_norm": 0.3045948015093808, + "learning_rate": 8.908887217996245e-06, + "loss": 0.8289, + "num_tokens": 41325179475.0, + "step": 9887 + }, + { + "epoch": 1.1750445632798574, + "grad_norm": 0.3333191682826972, + "learning_rate": 8.907202689702534e-06, + "loss": 0.8251, + "num_tokens": 41329367328.0, + "step": 9888 + }, + { + "epoch": 1.1751633986928105, + "grad_norm": 0.32567828463217185, + "learning_rate": 8.905518238912182e-06, + "loss": 0.8388, + "num_tokens": 41333528727.0, + "step": 9889 + }, + { + "epoch": 1.1752822341057636, + "grad_norm": 0.34446388340716505, + "learning_rate": 8.90383386568757e-06, + "loss": 0.8402, + "num_tokens": 41337717562.0, + "step": 9890 + }, + { + "epoch": 1.1754010695187165, + "grad_norm": 0.34204963104635355, + "learning_rate": 8.902149570091078e-06, + "loss": 0.8197, + "num_tokens": 41341906446.0, + "step": 9891 + }, + { + "epoch": 1.1755199049316696, + "grad_norm": 0.32205862971410715, + "learning_rate": 8.900465352185083e-06, + "loss": 0.796, + "num_tokens": 41346068140.0, + "step": 9892 + }, + { + "epoch": 1.1756387403446227, + "grad_norm": 0.35605057532167533, + "learning_rate": 8.898781212031952e-06, + "loss": 0.8566, + "num_tokens": 41350228936.0, + "step": 9893 + }, + { + "epoch": 1.1757575757575758, + "grad_norm": 0.32817388068279435, + "learning_rate": 8.897097149694058e-06, + "loss": 0.8194, + "num_tokens": 41354417949.0, + "step": 9894 + }, + { + "epoch": 1.1758764111705289, + "grad_norm": 0.3189467219097131, + "learning_rate": 8.89541316523377e-06, + "loss": 0.824, + "num_tokens": 41358592142.0, + "step": 9895 + }, + { + "epoch": 1.175995246583482, + "grad_norm": 0.30559117470705666, + "learning_rate": 8.893729258713442e-06, + "loss": 0.8162, + "num_tokens": 41362755036.0, + "step": 9896 + }, + { + "epoch": 1.1761140819964349, + "grad_norm": 0.3221698345463138, + "learning_rate": 8.892045430195444e-06, + "loss": 0.7996, + "num_tokens": 41366944178.0, + "step": 9897 + }, + { + "epoch": 1.176232917409388, + "grad_norm": 0.30497347921184326, + "learning_rate": 8.890361679742131e-06, + "loss": 0.8252, + "num_tokens": 41371131106.0, + "step": 9898 + }, + { + "epoch": 1.176351752822341, + "grad_norm": 0.3146031838888548, + "learning_rate": 8.888678007415858e-06, + "loss": 0.8247, + "num_tokens": 41375291973.0, + "step": 9899 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.2737604192386401, + "learning_rate": 8.886994413278978e-06, + "loss": 0.8467, + "num_tokens": 41379481193.0, + "step": 9900 + }, + { + "epoch": 1.1765894236482473, + "grad_norm": 0.3036585635995862, + "learning_rate": 8.885310897393837e-06, + "loss": 0.8163, + "num_tokens": 41383624305.0, + "step": 9901 + }, + { + "epoch": 1.1767082590612001, + "grad_norm": 0.312569476983016, + "learning_rate": 8.883627459822785e-06, + "loss": 0.7981, + "num_tokens": 41387797121.0, + "step": 9902 + }, + { + "epoch": 1.1768270944741532, + "grad_norm": 0.29807501154233923, + "learning_rate": 8.881944100628162e-06, + "loss": 0.7917, + "num_tokens": 41391973338.0, + "step": 9903 + }, + { + "epoch": 1.1769459298871063, + "grad_norm": 0.2961687521255679, + "learning_rate": 8.88026081987231e-06, + "loss": 0.8307, + "num_tokens": 41396160959.0, + "step": 9904 + }, + { + "epoch": 1.1770647653000594, + "grad_norm": 0.3080182532707849, + "learning_rate": 8.87857761761757e-06, + "loss": 0.8279, + "num_tokens": 41400350086.0, + "step": 9905 + }, + { + "epoch": 1.1771836007130125, + "grad_norm": 0.29808163293362244, + "learning_rate": 8.876894493926272e-06, + "loss": 0.8211, + "num_tokens": 41404526329.0, + "step": 9906 + }, + { + "epoch": 1.1773024361259656, + "grad_norm": 0.3350310738783993, + "learning_rate": 8.875211448860749e-06, + "loss": 0.8458, + "num_tokens": 41408697064.0, + "step": 9907 + }, + { + "epoch": 1.1774212715389185, + "grad_norm": 0.3065100616827958, + "learning_rate": 8.873528482483332e-06, + "loss": 0.7862, + "num_tokens": 41412862014.0, + "step": 9908 + }, + { + "epoch": 1.1775401069518716, + "grad_norm": 0.3126663049824943, + "learning_rate": 8.871845594856338e-06, + "loss": 0.7944, + "num_tokens": 41417028359.0, + "step": 9909 + }, + { + "epoch": 1.1776589423648247, + "grad_norm": 0.31206207881408604, + "learning_rate": 8.870162786042104e-06, + "loss": 0.7932, + "num_tokens": 41421216898.0, + "step": 9910 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.28972205524312045, + "learning_rate": 8.868480056102941e-06, + "loss": 0.8567, + "num_tokens": 41425362520.0, + "step": 9911 + }, + { + "epoch": 1.177896613190731, + "grad_norm": 0.3105185444998113, + "learning_rate": 8.86679740510117e-06, + "loss": 0.819, + "num_tokens": 41429542051.0, + "step": 9912 + }, + { + "epoch": 1.1780154486036838, + "grad_norm": 0.266923729735533, + "learning_rate": 8.865114833099104e-06, + "loss": 0.8021, + "num_tokens": 41433731528.0, + "step": 9913 + }, + { + "epoch": 1.178134284016637, + "grad_norm": 0.3141015620441825, + "learning_rate": 8.863432340159053e-06, + "loss": 0.8004, + "num_tokens": 41437921225.0, + "step": 9914 + }, + { + "epoch": 1.17825311942959, + "grad_norm": 0.285245811011188, + "learning_rate": 8.861749926343323e-06, + "loss": 0.8425, + "num_tokens": 41442110120.0, + "step": 9915 + }, + { + "epoch": 1.178371954842543, + "grad_norm": 0.3149216008628084, + "learning_rate": 8.860067591714225e-06, + "loss": 0.8416, + "num_tokens": 41446245514.0, + "step": 9916 + }, + { + "epoch": 1.1784907902554962, + "grad_norm": 0.32466429715881995, + "learning_rate": 8.858385336334061e-06, + "loss": 0.8513, + "num_tokens": 41450406644.0, + "step": 9917 + }, + { + "epoch": 1.1786096256684493, + "grad_norm": 0.3025927848047521, + "learning_rate": 8.856703160265128e-06, + "loss": 0.8017, + "num_tokens": 41454596098.0, + "step": 9918 + }, + { + "epoch": 1.1787284610814022, + "grad_norm": 0.30700142388886015, + "learning_rate": 8.855021063569724e-06, + "loss": 0.7829, + "num_tokens": 41458775036.0, + "step": 9919 + }, + { + "epoch": 1.1788472964943553, + "grad_norm": 0.36024768322380596, + "learning_rate": 8.853339046310141e-06, + "loss": 0.7756, + "num_tokens": 41462964513.0, + "step": 9920 + }, + { + "epoch": 1.1789661319073084, + "grad_norm": 0.32101054759793113, + "learning_rate": 8.851657108548674e-06, + "loss": 0.8151, + "num_tokens": 41467152907.0, + "step": 9921 + }, + { + "epoch": 1.1790849673202615, + "grad_norm": 0.3004898680788448, + "learning_rate": 8.849975250347603e-06, + "loss": 0.8307, + "num_tokens": 41471322264.0, + "step": 9922 + }, + { + "epoch": 1.1792038027332146, + "grad_norm": 0.30522672332986805, + "learning_rate": 8.848293471769222e-06, + "loss": 0.8201, + "num_tokens": 41475512547.0, + "step": 9923 + }, + { + "epoch": 1.1793226381461674, + "grad_norm": 0.34477222277180847, + "learning_rate": 8.846611772875808e-06, + "loss": 0.8187, + "num_tokens": 41479699715.0, + "step": 9924 + }, + { + "epoch": 1.1794414735591205, + "grad_norm": 0.3037540915864265, + "learning_rate": 8.84493015372964e-06, + "loss": 0.837, + "num_tokens": 41483879766.0, + "step": 9925 + }, + { + "epoch": 1.1795603089720736, + "grad_norm": 0.32569548534724585, + "learning_rate": 8.843248614392997e-06, + "loss": 0.8203, + "num_tokens": 41488067142.0, + "step": 9926 + }, + { + "epoch": 1.1796791443850267, + "grad_norm": 0.33516659782362535, + "learning_rate": 8.841567154928148e-06, + "loss": 0.786, + "num_tokens": 41492211674.0, + "step": 9927 + }, + { + "epoch": 1.1797979797979798, + "grad_norm": 0.31385956712031216, + "learning_rate": 8.839885775397364e-06, + "loss": 0.7773, + "num_tokens": 41496400510.0, + "step": 9928 + }, + { + "epoch": 1.179916815210933, + "grad_norm": 0.30138221730548415, + "learning_rate": 8.838204475862915e-06, + "loss": 0.7723, + "num_tokens": 41500545831.0, + "step": 9929 + }, + { + "epoch": 1.1800356506238858, + "grad_norm": 0.313592243737519, + "learning_rate": 8.836523256387062e-06, + "loss": 0.8044, + "num_tokens": 41504735190.0, + "step": 9930 + }, + { + "epoch": 1.180154486036839, + "grad_norm": 0.3450140447028418, + "learning_rate": 8.834842117032073e-06, + "loss": 0.8165, + "num_tokens": 41508902194.0, + "step": 9931 + }, + { + "epoch": 1.180273321449792, + "grad_norm": 0.27530438346927427, + "learning_rate": 8.833161057860196e-06, + "loss": 0.8371, + "num_tokens": 41513090056.0, + "step": 9932 + }, + { + "epoch": 1.1803921568627451, + "grad_norm": 0.312917431193626, + "learning_rate": 8.831480078933691e-06, + "loss": 0.8156, + "num_tokens": 41517279683.0, + "step": 9933 + }, + { + "epoch": 1.1805109922756982, + "grad_norm": 0.35810051642756313, + "learning_rate": 8.829799180314814e-06, + "loss": 0.8513, + "num_tokens": 41521469937.0, + "step": 9934 + }, + { + "epoch": 1.180629827688651, + "grad_norm": 0.2780644522902646, + "learning_rate": 8.828118362065802e-06, + "loss": 0.8106, + "num_tokens": 41525659116.0, + "step": 9935 + }, + { + "epoch": 1.1807486631016042, + "grad_norm": 0.3208795196430173, + "learning_rate": 8.82643762424892e-06, + "loss": 0.8354, + "num_tokens": 41529836942.0, + "step": 9936 + }, + { + "epoch": 1.1808674985145573, + "grad_norm": 0.37218500272945826, + "learning_rate": 8.8247569669264e-06, + "loss": 0.7989, + "num_tokens": 41534003655.0, + "step": 9937 + }, + { + "epoch": 1.1809863339275104, + "grad_norm": 0.296160159274765, + "learning_rate": 8.823076390160483e-06, + "loss": 0.822, + "num_tokens": 41538191934.0, + "step": 9938 + }, + { + "epoch": 1.1811051693404635, + "grad_norm": 0.34639922153598474, + "learning_rate": 8.821395894013409e-06, + "loss": 0.8197, + "num_tokens": 41542359368.0, + "step": 9939 + }, + { + "epoch": 1.1812240047534166, + "grad_norm": 0.30429072452583156, + "learning_rate": 8.81971547854741e-06, + "loss": 0.8308, + "num_tokens": 41546537684.0, + "step": 9940 + }, + { + "epoch": 1.1813428401663697, + "grad_norm": 0.3541220097834834, + "learning_rate": 8.818035143824716e-06, + "loss": 0.8252, + "num_tokens": 41550726624.0, + "step": 9941 + }, + { + "epoch": 1.1814616755793226, + "grad_norm": 0.33389749839885813, + "learning_rate": 8.81635488990756e-06, + "loss": 0.7866, + "num_tokens": 41554908371.0, + "step": 9942 + }, + { + "epoch": 1.1815805109922757, + "grad_norm": 0.35393505238882633, + "learning_rate": 8.814674716858165e-06, + "loss": 0.9101, + "num_tokens": 41559070238.0, + "step": 9943 + }, + { + "epoch": 1.1816993464052288, + "grad_norm": 0.35388616512796217, + "learning_rate": 8.812994624738758e-06, + "loss": 0.7707, + "num_tokens": 41563259403.0, + "step": 9944 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 0.3263251739946966, + "learning_rate": 8.81131461361155e-06, + "loss": 0.832, + "num_tokens": 41567448366.0, + "step": 9945 + }, + { + "epoch": 1.1819370172311348, + "grad_norm": 0.4092734607255134, + "learning_rate": 8.809634683538764e-06, + "loss": 0.7724, + "num_tokens": 41571638245.0, + "step": 9946 + }, + { + "epoch": 1.1820558526440879, + "grad_norm": 0.3205302009388943, + "learning_rate": 8.807954834582608e-06, + "loss": 0.8448, + "num_tokens": 41575800063.0, + "step": 9947 + }, + { + "epoch": 1.182174688057041, + "grad_norm": 0.40818074979387986, + "learning_rate": 8.8062750668053e-06, + "loss": 0.8281, + "num_tokens": 41579988564.0, + "step": 9948 + }, + { + "epoch": 1.182293523469994, + "grad_norm": 0.3269121257480915, + "learning_rate": 8.804595380269043e-06, + "loss": 0.8075, + "num_tokens": 41584117644.0, + "step": 9949 + }, + { + "epoch": 1.1824123588829472, + "grad_norm": 0.32327450139276365, + "learning_rate": 8.80291577503604e-06, + "loss": 0.8255, + "num_tokens": 41588306539.0, + "step": 9950 + }, + { + "epoch": 1.1825311942959003, + "grad_norm": 0.3731141383250833, + "learning_rate": 8.801236251168496e-06, + "loss": 0.8178, + "num_tokens": 41592494283.0, + "step": 9951 + }, + { + "epoch": 1.1826500297088534, + "grad_norm": 0.286719432879587, + "learning_rate": 8.799556808728609e-06, + "loss": 0.8221, + "num_tokens": 41596682142.0, + "step": 9952 + }, + { + "epoch": 1.1827688651218062, + "grad_norm": 0.33990850037466624, + "learning_rate": 8.797877447778572e-06, + "loss": 0.8272, + "num_tokens": 41600870918.0, + "step": 9953 + }, + { + "epoch": 1.1828877005347593, + "grad_norm": 0.33617661179009456, + "learning_rate": 8.796198168380573e-06, + "loss": 0.84, + "num_tokens": 41605058838.0, + "step": 9954 + }, + { + "epoch": 1.1830065359477124, + "grad_norm": 0.2836334422793206, + "learning_rate": 8.794518970596815e-06, + "loss": 0.8383, + "num_tokens": 41609249340.0, + "step": 9955 + }, + { + "epoch": 1.1831253713606655, + "grad_norm": 0.31271734044714866, + "learning_rate": 8.792839854489471e-06, + "loss": 0.8159, + "num_tokens": 41613439130.0, + "step": 9956 + }, + { + "epoch": 1.1832442067736186, + "grad_norm": 0.3291559226020011, + "learning_rate": 8.791160820120737e-06, + "loss": 0.8481, + "num_tokens": 41617629077.0, + "step": 9957 + }, + { + "epoch": 1.1833630421865715, + "grad_norm": 0.29615599974165485, + "learning_rate": 8.78948186755278e-06, + "loss": 0.8187, + "num_tokens": 41621817530.0, + "step": 9958 + }, + { + "epoch": 1.1834818775995246, + "grad_norm": 0.3234580020478942, + "learning_rate": 8.787802996847785e-06, + "loss": 0.8073, + "num_tokens": 41626007650.0, + "step": 9959 + }, + { + "epoch": 1.1836007130124777, + "grad_norm": 0.2984504627008417, + "learning_rate": 8.78612420806792e-06, + "loss": 0.8357, + "num_tokens": 41630180633.0, + "step": 9960 + }, + { + "epoch": 1.1837195484254308, + "grad_norm": 0.28288657965673975, + "learning_rate": 8.784445501275367e-06, + "loss": 0.8049, + "num_tokens": 41634363800.0, + "step": 9961 + }, + { + "epoch": 1.183838383838384, + "grad_norm": 0.3154264555185354, + "learning_rate": 8.78276687653229e-06, + "loss": 0.8195, + "num_tokens": 41638552231.0, + "step": 9962 + }, + { + "epoch": 1.183957219251337, + "grad_norm": 0.31892929950145765, + "learning_rate": 8.781088333900847e-06, + "loss": 0.857, + "num_tokens": 41642716987.0, + "step": 9963 + }, + { + "epoch": 1.18407605466429, + "grad_norm": 0.29209974194288246, + "learning_rate": 8.779409873443208e-06, + "loss": 0.8522, + "num_tokens": 41646902999.0, + "step": 9964 + }, + { + "epoch": 1.184194890077243, + "grad_norm": 0.32238186517125333, + "learning_rate": 8.777731495221528e-06, + "loss": 0.8451, + "num_tokens": 41651076579.0, + "step": 9965 + }, + { + "epoch": 1.184313725490196, + "grad_norm": 0.3414113092254629, + "learning_rate": 8.776053199297962e-06, + "loss": 0.8203, + "num_tokens": 41655265690.0, + "step": 9966 + }, + { + "epoch": 1.1844325609031492, + "grad_norm": 0.28600831575171454, + "learning_rate": 8.774374985734671e-06, + "loss": 0.7884, + "num_tokens": 41659455340.0, + "step": 9967 + }, + { + "epoch": 1.1845513963161023, + "grad_norm": 0.33601861689244955, + "learning_rate": 8.772696854593796e-06, + "loss": 0.8093, + "num_tokens": 41663607622.0, + "step": 9968 + }, + { + "epoch": 1.1846702317290552, + "grad_norm": 0.3011549579634649, + "learning_rate": 8.771018805937486e-06, + "loss": 0.7746, + "num_tokens": 41667795670.0, + "step": 9969 + }, + { + "epoch": 1.1847890671420083, + "grad_norm": 0.3052489696505535, + "learning_rate": 8.769340839827889e-06, + "loss": 0.8128, + "num_tokens": 41671974550.0, + "step": 9970 + }, + { + "epoch": 1.1849079025549614, + "grad_norm": 0.2945036725199957, + "learning_rate": 8.767662956327142e-06, + "loss": 0.8361, + "num_tokens": 41676119197.0, + "step": 9971 + }, + { + "epoch": 1.1850267379679145, + "grad_norm": 0.2935523790473644, + "learning_rate": 8.765985155497383e-06, + "loss": 0.8065, + "num_tokens": 41680307900.0, + "step": 9972 + }, + { + "epoch": 1.1851455733808676, + "grad_norm": 0.3066677789840895, + "learning_rate": 8.764307437400744e-06, + "loss": 0.8443, + "num_tokens": 41684498825.0, + "step": 9973 + }, + { + "epoch": 1.1852644087938207, + "grad_norm": 0.3083320233958982, + "learning_rate": 8.762629802099363e-06, + "loss": 0.7987, + "num_tokens": 41688684506.0, + "step": 9974 + }, + { + "epoch": 1.1853832442067735, + "grad_norm": 0.3434617871903674, + "learning_rate": 8.760952249655365e-06, + "loss": 0.8016, + "num_tokens": 41692851631.0, + "step": 9975 + }, + { + "epoch": 1.1855020796197266, + "grad_norm": 0.31685522619733086, + "learning_rate": 8.759274780130875e-06, + "loss": 0.8361, + "num_tokens": 41697040853.0, + "step": 9976 + }, + { + "epoch": 1.1856209150326797, + "grad_norm": 0.3132199697041444, + "learning_rate": 8.757597393588017e-06, + "loss": 0.7889, + "num_tokens": 41701189821.0, + "step": 9977 + }, + { + "epoch": 1.1857397504456328, + "grad_norm": 0.2993068090236095, + "learning_rate": 8.755920090088908e-06, + "loss": 0.8025, + "num_tokens": 41705379632.0, + "step": 9978 + }, + { + "epoch": 1.185858585858586, + "grad_norm": 0.29800597859089056, + "learning_rate": 8.754242869695664e-06, + "loss": 0.7939, + "num_tokens": 41709569626.0, + "step": 9979 + }, + { + "epoch": 1.1859774212715388, + "grad_norm": 0.298460354568791, + "learning_rate": 8.752565732470404e-06, + "loss": 0.7886, + "num_tokens": 41713737655.0, + "step": 9980 + }, + { + "epoch": 1.186096256684492, + "grad_norm": 0.3161755348578694, + "learning_rate": 8.75088867847523e-06, + "loss": 0.8099, + "num_tokens": 41717927169.0, + "step": 9981 + }, + { + "epoch": 1.186215092097445, + "grad_norm": 0.3505381185323983, + "learning_rate": 8.749211707772255e-06, + "loss": 0.825, + "num_tokens": 41722115024.0, + "step": 9982 + }, + { + "epoch": 1.1863339275103981, + "grad_norm": 0.31847780863047404, + "learning_rate": 8.747534820423579e-06, + "loss": 0.823, + "num_tokens": 41726304537.0, + "step": 9983 + }, + { + "epoch": 1.1864527629233512, + "grad_norm": 0.30663944062232634, + "learning_rate": 8.745858016491307e-06, + "loss": 0.8054, + "num_tokens": 41730492222.0, + "step": 9984 + }, + { + "epoch": 1.1865715983363043, + "grad_norm": 0.3291759584063407, + "learning_rate": 8.744181296037529e-06, + "loss": 0.8577, + "num_tokens": 41734681585.0, + "step": 9985 + }, + { + "epoch": 1.1866904337492572, + "grad_norm": 0.2989032970002092, + "learning_rate": 8.742504659124347e-06, + "loss": 0.831, + "num_tokens": 41738870634.0, + "step": 9986 + }, + { + "epoch": 1.1868092691622103, + "grad_norm": 0.3174581901496095, + "learning_rate": 8.74082810581385e-06, + "loss": 0.7961, + "num_tokens": 41743050423.0, + "step": 9987 + }, + { + "epoch": 1.1869281045751634, + "grad_norm": 0.3148956095533394, + "learning_rate": 8.739151636168129e-06, + "loss": 0.8334, + "num_tokens": 41747240558.0, + "step": 9988 + }, + { + "epoch": 1.1870469399881165, + "grad_norm": 0.331061561734645, + "learning_rate": 8.737475250249266e-06, + "loss": 0.7691, + "num_tokens": 41751430666.0, + "step": 9989 + }, + { + "epoch": 1.1871657754010696, + "grad_norm": 0.32446671920692727, + "learning_rate": 8.735798948119343e-06, + "loss": 0.8424, + "num_tokens": 41755606105.0, + "step": 9990 + }, + { + "epoch": 1.1872846108140225, + "grad_norm": 0.29509884785200036, + "learning_rate": 8.734122729840442e-06, + "loss": 0.7939, + "num_tokens": 41759795422.0, + "step": 9991 + }, + { + "epoch": 1.1874034462269756, + "grad_norm": 0.32935215725623573, + "learning_rate": 8.732446595474635e-06, + "loss": 0.8296, + "num_tokens": 41763920728.0, + "step": 9992 + }, + { + "epoch": 1.1875222816399287, + "grad_norm": 0.28108910776441415, + "learning_rate": 8.730770545084002e-06, + "loss": 0.8033, + "num_tokens": 41768108858.0, + "step": 9993 + }, + { + "epoch": 1.1876411170528818, + "grad_norm": 0.33199051796258555, + "learning_rate": 8.729094578730604e-06, + "loss": 0.8261, + "num_tokens": 41772268117.0, + "step": 9994 + }, + { + "epoch": 1.1877599524658349, + "grad_norm": 0.29582826661368866, + "learning_rate": 8.727418696476513e-06, + "loss": 0.8214, + "num_tokens": 41776441879.0, + "step": 9995 + }, + { + "epoch": 1.187878787878788, + "grad_norm": 0.3033734741261522, + "learning_rate": 8.725742898383791e-06, + "loss": 0.8234, + "num_tokens": 41780632887.0, + "step": 9996 + }, + { + "epoch": 1.1879976232917409, + "grad_norm": 0.31154520472652225, + "learning_rate": 8.7240671845145e-06, + "loss": 0.8462, + "num_tokens": 41784822097.0, + "step": 9997 + }, + { + "epoch": 1.188116458704694, + "grad_norm": 0.28497641720458206, + "learning_rate": 8.722391554930693e-06, + "loss": 0.8234, + "num_tokens": 41789010265.0, + "step": 9998 + }, + { + "epoch": 1.188235294117647, + "grad_norm": 0.31995640625582006, + "learning_rate": 8.720716009694432e-06, + "loss": 0.8083, + "num_tokens": 41793200391.0, + "step": 9999 + }, + { + "epoch": 1.1883541295306002, + "grad_norm": 0.297636474324413, + "learning_rate": 8.719040548867762e-06, + "loss": 0.7916, + "num_tokens": 41797390872.0, + "step": 10000 + }, + { + "epoch": 1.1884729649435533, + "grad_norm": 0.29250656149569476, + "learning_rate": 8.71736517251273e-06, + "loss": 0.8361, + "num_tokens": 41801546229.0, + "step": 10001 + }, + { + "epoch": 1.1885918003565061, + "grad_norm": 0.3127489906564999, + "learning_rate": 8.715689880691389e-06, + "loss": 0.8183, + "num_tokens": 41805707855.0, + "step": 10002 + }, + { + "epoch": 1.1887106357694592, + "grad_norm": 0.3270926460023109, + "learning_rate": 8.71401467346577e-06, + "loss": 0.7899, + "num_tokens": 41809879309.0, + "step": 10003 + }, + { + "epoch": 1.1888294711824123, + "grad_norm": 0.32270947509858416, + "learning_rate": 8.71233955089792e-06, + "loss": 0.8157, + "num_tokens": 41814068842.0, + "step": 10004 + }, + { + "epoch": 1.1889483065953654, + "grad_norm": 0.30536208200232245, + "learning_rate": 8.710664513049866e-06, + "loss": 0.844, + "num_tokens": 41818251132.0, + "step": 10005 + }, + { + "epoch": 1.1890671420083185, + "grad_norm": 0.2705313867366819, + "learning_rate": 8.708989559983652e-06, + "loss": 0.876, + "num_tokens": 41822433046.0, + "step": 10006 + }, + { + "epoch": 1.1891859774212716, + "grad_norm": 0.31643639835182463, + "learning_rate": 8.7073146917613e-06, + "loss": 0.8244, + "num_tokens": 41826596004.0, + "step": 10007 + }, + { + "epoch": 1.1893048128342245, + "grad_norm": 0.2734226021830037, + "learning_rate": 8.705639908444832e-06, + "loss": 0.7917, + "num_tokens": 41830779171.0, + "step": 10008 + }, + { + "epoch": 1.1894236482471776, + "grad_norm": 0.2731104037460517, + "learning_rate": 8.70396521009628e-06, + "loss": 0.8008, + "num_tokens": 41834965897.0, + "step": 10009 + }, + { + "epoch": 1.1895424836601307, + "grad_norm": 0.313186976697799, + "learning_rate": 8.702290596777658e-06, + "loss": 0.8338, + "num_tokens": 41839137639.0, + "step": 10010 + }, + { + "epoch": 1.1896613190730838, + "grad_norm": 0.3060419003328042, + "learning_rate": 8.70061606855098e-06, + "loss": 0.8328, + "num_tokens": 41843301291.0, + "step": 10011 + }, + { + "epoch": 1.189780154486037, + "grad_norm": 0.29551041421297913, + "learning_rate": 8.69894162547827e-06, + "loss": 0.8154, + "num_tokens": 41847489994.0, + "step": 10012 + }, + { + "epoch": 1.1898989898989898, + "grad_norm": 0.2928576815141213, + "learning_rate": 8.697267267621527e-06, + "loss": 0.8182, + "num_tokens": 41851679129.0, + "step": 10013 + }, + { + "epoch": 1.190017825311943, + "grad_norm": 0.29371970429838384, + "learning_rate": 8.695592995042764e-06, + "loss": 0.849, + "num_tokens": 41855824749.0, + "step": 10014 + }, + { + "epoch": 1.190136660724896, + "grad_norm": 0.30249808458977956, + "learning_rate": 8.693918807803988e-06, + "loss": 0.8381, + "num_tokens": 41860012918.0, + "step": 10015 + }, + { + "epoch": 1.190255496137849, + "grad_norm": 0.3133345172056883, + "learning_rate": 8.69224470596719e-06, + "loss": 0.8488, + "num_tokens": 41864200709.0, + "step": 10016 + }, + { + "epoch": 1.1903743315508022, + "grad_norm": 0.30810859951641445, + "learning_rate": 8.690570689594373e-06, + "loss": 0.8105, + "num_tokens": 41868389856.0, + "step": 10017 + }, + { + "epoch": 1.1904931669637553, + "grad_norm": 0.28363969176022386, + "learning_rate": 8.688896758747533e-06, + "loss": 0.8311, + "num_tokens": 41872547868.0, + "step": 10018 + }, + { + "epoch": 1.1906120023767082, + "grad_norm": 0.2991528036545709, + "learning_rate": 8.687222913488663e-06, + "loss": 0.7747, + "num_tokens": 41876728310.0, + "step": 10019 + }, + { + "epoch": 1.1907308377896613, + "grad_norm": 0.3094893382629162, + "learning_rate": 8.685549153879746e-06, + "loss": 0.8277, + "num_tokens": 41880891417.0, + "step": 10020 + }, + { + "epoch": 1.1908496732026144, + "grad_norm": 0.30968724494712296, + "learning_rate": 8.683875479982771e-06, + "loss": 0.82, + "num_tokens": 41885079813.0, + "step": 10021 + }, + { + "epoch": 1.1909685086155675, + "grad_norm": 0.2685092664405595, + "learning_rate": 8.682201891859715e-06, + "loss": 0.8226, + "num_tokens": 41889269248.0, + "step": 10022 + }, + { + "epoch": 1.1910873440285206, + "grad_norm": 0.31795385840579715, + "learning_rate": 8.680528389572563e-06, + "loss": 0.7806, + "num_tokens": 41893458330.0, + "step": 10023 + }, + { + "epoch": 1.1912061794414734, + "grad_norm": 0.32468226349755647, + "learning_rate": 8.678854973183282e-06, + "loss": 0.8525, + "num_tokens": 41897640418.0, + "step": 10024 + }, + { + "epoch": 1.1913250148544265, + "grad_norm": 0.30385188766936416, + "learning_rate": 8.677181642753854e-06, + "loss": 0.7942, + "num_tokens": 41901817825.0, + "step": 10025 + }, + { + "epoch": 1.1914438502673796, + "grad_norm": 0.2833453185717789, + "learning_rate": 8.67550839834624e-06, + "loss": 0.8122, + "num_tokens": 41906006009.0, + "step": 10026 + }, + { + "epoch": 1.1915626856803327, + "grad_norm": 0.35793673750802946, + "learning_rate": 8.673835240022412e-06, + "loss": 0.7952, + "num_tokens": 41910195941.0, + "step": 10027 + }, + { + "epoch": 1.1916815210932858, + "grad_norm": 0.37233449599903445, + "learning_rate": 8.67216216784433e-06, + "loss": 0.805, + "num_tokens": 41914385586.0, + "step": 10028 + }, + { + "epoch": 1.191800356506239, + "grad_norm": 0.32139073980067084, + "learning_rate": 8.670489181873952e-06, + "loss": 0.8088, + "num_tokens": 41918524799.0, + "step": 10029 + }, + { + "epoch": 1.1919191919191918, + "grad_norm": 0.38874928395646735, + "learning_rate": 8.668816282173234e-06, + "loss": 0.7994, + "num_tokens": 41922713250.0, + "step": 10030 + }, + { + "epoch": 1.192038027332145, + "grad_norm": 0.34727698387545125, + "learning_rate": 8.667143468804133e-06, + "loss": 0.8505, + "num_tokens": 41926901346.0, + "step": 10031 + }, + { + "epoch": 1.192156862745098, + "grad_norm": 0.3313169271295366, + "learning_rate": 8.665470741828597e-06, + "loss": 0.7941, + "num_tokens": 41931090899.0, + "step": 10032 + }, + { + "epoch": 1.1922756981580511, + "grad_norm": 0.3414307962762469, + "learning_rate": 8.663798101308576e-06, + "loss": 0.8137, + "num_tokens": 41935248184.0, + "step": 10033 + }, + { + "epoch": 1.1923945335710042, + "grad_norm": 0.3360523599065986, + "learning_rate": 8.662125547306007e-06, + "loss": 0.8374, + "num_tokens": 41939437896.0, + "step": 10034 + }, + { + "epoch": 1.192513368983957, + "grad_norm": 0.3109121588062648, + "learning_rate": 8.660453079882833e-06, + "loss": 0.8156, + "num_tokens": 41943627675.0, + "step": 10035 + }, + { + "epoch": 1.1926322043969102, + "grad_norm": 0.29396885591572214, + "learning_rate": 8.65878069910099e-06, + "loss": 0.8019, + "num_tokens": 41947794949.0, + "step": 10036 + }, + { + "epoch": 1.1927510398098633, + "grad_norm": 0.3144113848347155, + "learning_rate": 8.657108405022416e-06, + "loss": 0.8294, + "num_tokens": 41951939470.0, + "step": 10037 + }, + { + "epoch": 1.1928698752228164, + "grad_norm": 0.2779040764731488, + "learning_rate": 8.65543619770904e-06, + "loss": 0.8157, + "num_tokens": 41956129096.0, + "step": 10038 + }, + { + "epoch": 1.1929887106357695, + "grad_norm": 0.28821930255989925, + "learning_rate": 8.653764077222788e-06, + "loss": 0.8467, + "num_tokens": 41960319718.0, + "step": 10039 + }, + { + "epoch": 1.1931075460487226, + "grad_norm": 0.37190509145623485, + "learning_rate": 8.652092043625584e-06, + "loss": 0.8753, + "num_tokens": 41964508057.0, + "step": 10040 + }, + { + "epoch": 1.1932263814616757, + "grad_norm": 0.28392378070535235, + "learning_rate": 8.650420096979352e-06, + "loss": 0.827, + "num_tokens": 41968697601.0, + "step": 10041 + }, + { + "epoch": 1.1933452168746286, + "grad_norm": 0.3664249128402411, + "learning_rate": 8.648748237346008e-06, + "loss": 0.8103, + "num_tokens": 41972862246.0, + "step": 10042 + }, + { + "epoch": 1.1934640522875817, + "grad_norm": 0.31027603769519946, + "learning_rate": 8.647076464787463e-06, + "loss": 0.8292, + "num_tokens": 41977049857.0, + "step": 10043 + }, + { + "epoch": 1.1935828877005348, + "grad_norm": 0.34765804211212453, + "learning_rate": 8.645404779365634e-06, + "loss": 0.825, + "num_tokens": 41981239808.0, + "step": 10044 + }, + { + "epoch": 1.1937017231134879, + "grad_norm": 0.3296666193123205, + "learning_rate": 8.64373318114243e-06, + "loss": 0.8123, + "num_tokens": 41985403858.0, + "step": 10045 + }, + { + "epoch": 1.1938205585264408, + "grad_norm": 0.3188257742458306, + "learning_rate": 8.642061670179749e-06, + "loss": 0.8283, + "num_tokens": 41989564510.0, + "step": 10046 + }, + { + "epoch": 1.1939393939393939, + "grad_norm": 0.35756161166138734, + "learning_rate": 8.640390246539499e-06, + "loss": 0.8058, + "num_tokens": 41993753351.0, + "step": 10047 + }, + { + "epoch": 1.194058229352347, + "grad_norm": 0.3326377282775826, + "learning_rate": 8.638718910283577e-06, + "loss": 0.8145, + "num_tokens": 41997943837.0, + "step": 10048 + }, + { + "epoch": 1.1941770647653, + "grad_norm": 0.29933655572635237, + "learning_rate": 8.637047661473872e-06, + "loss": 0.7981, + "num_tokens": 42002127677.0, + "step": 10049 + }, + { + "epoch": 1.1942959001782532, + "grad_norm": 0.3777351709925405, + "learning_rate": 8.635376500172286e-06, + "loss": 0.8279, + "num_tokens": 42006293083.0, + "step": 10050 + }, + { + "epoch": 1.1944147355912063, + "grad_norm": 0.29606341943936176, + "learning_rate": 8.633705426440704e-06, + "loss": 0.7958, + "num_tokens": 42010457444.0, + "step": 10051 + }, + { + "epoch": 1.1945335710041594, + "grad_norm": 0.3802819542929375, + "learning_rate": 8.632034440341009e-06, + "loss": 0.8399, + "num_tokens": 42014643265.0, + "step": 10052 + }, + { + "epoch": 1.1946524064171122, + "grad_norm": 0.33227694216915377, + "learning_rate": 8.630363541935085e-06, + "loss": 0.8141, + "num_tokens": 42018831006.0, + "step": 10053 + }, + { + "epoch": 1.1947712418300653, + "grad_norm": 0.32042643597340137, + "learning_rate": 8.628692731284812e-06, + "loss": 0.8025, + "num_tokens": 42023020857.0, + "step": 10054 + }, + { + "epoch": 1.1948900772430184, + "grad_norm": 0.39467563698390173, + "learning_rate": 8.62702200845206e-06, + "loss": 0.8329, + "num_tokens": 42027201498.0, + "step": 10055 + }, + { + "epoch": 1.1950089126559715, + "grad_norm": 0.3118002889349472, + "learning_rate": 8.625351373498712e-06, + "loss": 0.8209, + "num_tokens": 42031390226.0, + "step": 10056 + }, + { + "epoch": 1.1951277480689246, + "grad_norm": 0.3543822886564938, + "learning_rate": 8.623680826486629e-06, + "loss": 0.8468, + "num_tokens": 42035579953.0, + "step": 10057 + }, + { + "epoch": 1.1952465834818775, + "grad_norm": 0.33276359780396836, + "learning_rate": 8.62201036747768e-06, + "loss": 0.8157, + "num_tokens": 42039769262.0, + "step": 10058 + }, + { + "epoch": 1.1953654188948306, + "grad_norm": 0.31759935549935936, + "learning_rate": 8.620339996533728e-06, + "loss": 0.8155, + "num_tokens": 42043946249.0, + "step": 10059 + }, + { + "epoch": 1.1954842543077837, + "grad_norm": 0.3077736798271272, + "learning_rate": 8.618669713716632e-06, + "loss": 0.8084, + "num_tokens": 42048137232.0, + "step": 10060 + }, + { + "epoch": 1.1956030897207368, + "grad_norm": 0.2985640719019002, + "learning_rate": 8.61699951908825e-06, + "loss": 0.8535, + "num_tokens": 42052318345.0, + "step": 10061 + }, + { + "epoch": 1.19572192513369, + "grad_norm": 0.32598658083495846, + "learning_rate": 8.615329412710425e-06, + "loss": 0.8164, + "num_tokens": 42056506429.0, + "step": 10062 + }, + { + "epoch": 1.195840760546643, + "grad_norm": 0.30862584299749496, + "learning_rate": 8.613659394645022e-06, + "loss": 0.8262, + "num_tokens": 42060696083.0, + "step": 10063 + }, + { + "epoch": 1.195959595959596, + "grad_norm": 0.30963931929478483, + "learning_rate": 8.611989464953877e-06, + "loss": 0.8622, + "num_tokens": 42064886059.0, + "step": 10064 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 0.2894315675831492, + "learning_rate": 8.610319623698838e-06, + "loss": 0.8175, + "num_tokens": 42069075007.0, + "step": 10065 + }, + { + "epoch": 1.196197266785502, + "grad_norm": 0.3075590736713107, + "learning_rate": 8.608649870941741e-06, + "loss": 0.8477, + "num_tokens": 42073235949.0, + "step": 10066 + }, + { + "epoch": 1.1963161021984552, + "grad_norm": 0.2903020988351742, + "learning_rate": 8.606980206744426e-06, + "loss": 0.8271, + "num_tokens": 42077414591.0, + "step": 10067 + }, + { + "epoch": 1.1964349376114083, + "grad_norm": 0.2815210702460902, + "learning_rate": 8.605310631168721e-06, + "loss": 0.8431, + "num_tokens": 42081591320.0, + "step": 10068 + }, + { + "epoch": 1.1965537730243612, + "grad_norm": 0.3223215028302719, + "learning_rate": 8.603641144276466e-06, + "loss": 0.8216, + "num_tokens": 42085778862.0, + "step": 10069 + }, + { + "epoch": 1.1966726084373143, + "grad_norm": 0.31110085171912455, + "learning_rate": 8.60197174612948e-06, + "loss": 0.8646, + "num_tokens": 42089940026.0, + "step": 10070 + }, + { + "epoch": 1.1967914438502674, + "grad_norm": 0.319591387156514, + "learning_rate": 8.600302436789585e-06, + "loss": 0.8417, + "num_tokens": 42094129156.0, + "step": 10071 + }, + { + "epoch": 1.1969102792632205, + "grad_norm": 0.3242194624581422, + "learning_rate": 8.598633216318611e-06, + "loss": 0.8318, + "num_tokens": 42098319832.0, + "step": 10072 + }, + { + "epoch": 1.1970291146761736, + "grad_norm": 0.3060537592064801, + "learning_rate": 8.596964084778363e-06, + "loss": 0.8139, + "num_tokens": 42102504773.0, + "step": 10073 + }, + { + "epoch": 1.1971479500891267, + "grad_norm": 0.30521208968382746, + "learning_rate": 8.595295042230657e-06, + "loss": 0.8243, + "num_tokens": 42106662495.0, + "step": 10074 + }, + { + "epoch": 1.1972667855020795, + "grad_norm": 0.3218058059955236, + "learning_rate": 8.59362608873731e-06, + "loss": 0.8258, + "num_tokens": 42110852765.0, + "step": 10075 + }, + { + "epoch": 1.1973856209150326, + "grad_norm": 0.3068629114747636, + "learning_rate": 8.591957224360126e-06, + "loss": 0.812, + "num_tokens": 42115040377.0, + "step": 10076 + }, + { + "epoch": 1.1975044563279857, + "grad_norm": 0.3825823845401117, + "learning_rate": 8.590288449160906e-06, + "loss": 0.8174, + "num_tokens": 42119231336.0, + "step": 10077 + }, + { + "epoch": 1.1976232917409388, + "grad_norm": 0.3091643924210686, + "learning_rate": 8.588619763201454e-06, + "loss": 0.8094, + "num_tokens": 42123420208.0, + "step": 10078 + }, + { + "epoch": 1.197742127153892, + "grad_norm": 0.31790616393201343, + "learning_rate": 8.586951166543564e-06, + "loss": 0.7727, + "num_tokens": 42127593438.0, + "step": 10079 + }, + { + "epoch": 1.1978609625668448, + "grad_norm": 0.3272001868966415, + "learning_rate": 8.585282659249032e-06, + "loss": 0.8375, + "num_tokens": 42131762946.0, + "step": 10080 + }, + { + "epoch": 1.197979797979798, + "grad_norm": 0.3405079105206124, + "learning_rate": 8.583614241379643e-06, + "loss": 0.8271, + "num_tokens": 42135951070.0, + "step": 10081 + }, + { + "epoch": 1.198098633392751, + "grad_norm": 0.29968153856063484, + "learning_rate": 8.581945912997195e-06, + "loss": 0.8046, + "num_tokens": 42140141173.0, + "step": 10082 + }, + { + "epoch": 1.1982174688057041, + "grad_norm": 0.34199090580431135, + "learning_rate": 8.580277674163462e-06, + "loss": 0.8106, + "num_tokens": 42144329330.0, + "step": 10083 + }, + { + "epoch": 1.1983363042186572, + "grad_norm": 0.30713147888531805, + "learning_rate": 8.578609524940227e-06, + "loss": 0.8209, + "num_tokens": 42148516988.0, + "step": 10084 + }, + { + "epoch": 1.1984551396316103, + "grad_norm": 0.32766077284394046, + "learning_rate": 8.576941465389273e-06, + "loss": 0.8133, + "num_tokens": 42152707523.0, + "step": 10085 + }, + { + "epoch": 1.1985739750445632, + "grad_norm": 0.29698937362344535, + "learning_rate": 8.575273495572366e-06, + "loss": 0.7923, + "num_tokens": 42156897469.0, + "step": 10086 + }, + { + "epoch": 1.1986928104575163, + "grad_norm": 0.30959495069870907, + "learning_rate": 8.573605615551276e-06, + "loss": 0.7933, + "num_tokens": 42161087160.0, + "step": 10087 + }, + { + "epoch": 1.1988116458704694, + "grad_norm": 0.3512344316556068, + "learning_rate": 8.571937825387777e-06, + "loss": 0.8267, + "num_tokens": 42165276510.0, + "step": 10088 + }, + { + "epoch": 1.1989304812834225, + "grad_norm": 0.31602053789270407, + "learning_rate": 8.570270125143629e-06, + "loss": 0.8091, + "num_tokens": 42169459383.0, + "step": 10089 + }, + { + "epoch": 1.1990493166963756, + "grad_norm": 0.2947170970392031, + "learning_rate": 8.568602514880597e-06, + "loss": 0.8244, + "num_tokens": 42173648271.0, + "step": 10090 + }, + { + "epoch": 1.1991681521093285, + "grad_norm": 0.35132945261255083, + "learning_rate": 8.566934994660431e-06, + "loss": 0.8056, + "num_tokens": 42177820130.0, + "step": 10091 + }, + { + "epoch": 1.1992869875222816, + "grad_norm": 0.3040482224783026, + "learning_rate": 8.56526756454489e-06, + "loss": 0.7802, + "num_tokens": 42181958773.0, + "step": 10092 + }, + { + "epoch": 1.1994058229352347, + "grad_norm": 0.36583533579178373, + "learning_rate": 8.563600224595725e-06, + "loss": 0.7924, + "num_tokens": 42186128149.0, + "step": 10093 + }, + { + "epoch": 1.1995246583481878, + "grad_norm": 0.3275878620250774, + "learning_rate": 8.561932974874676e-06, + "loss": 0.8262, + "num_tokens": 42190268563.0, + "step": 10094 + }, + { + "epoch": 1.1996434937611409, + "grad_norm": 0.3571641156118762, + "learning_rate": 8.560265815443498e-06, + "loss": 0.8257, + "num_tokens": 42194435094.0, + "step": 10095 + }, + { + "epoch": 1.199762329174094, + "grad_norm": 0.3353570280703875, + "learning_rate": 8.558598746363923e-06, + "loss": 0.8368, + "num_tokens": 42198623358.0, + "step": 10096 + }, + { + "epoch": 1.1998811645870469, + "grad_norm": 0.3652560838812365, + "learning_rate": 8.556931767697689e-06, + "loss": 0.8488, + "num_tokens": 42202789349.0, + "step": 10097 + }, + { + "epoch": 1.2, + "grad_norm": 0.28673831777715997, + "learning_rate": 8.555264879506538e-06, + "loss": 0.79, + "num_tokens": 42206977596.0, + "step": 10098 + }, + { + "epoch": 1.200118835412953, + "grad_norm": 0.346778168252704, + "learning_rate": 8.553598081852189e-06, + "loss": 0.8041, + "num_tokens": 42211166780.0, + "step": 10099 + }, + { + "epoch": 1.2002376708259062, + "grad_norm": 0.31594600537600415, + "learning_rate": 8.551931374796374e-06, + "loss": 0.8429, + "num_tokens": 42215324050.0, + "step": 10100 + }, + { + "epoch": 1.2003565062388593, + "grad_norm": 0.3164259754611839, + "learning_rate": 8.550264758400819e-06, + "loss": 0.8666, + "num_tokens": 42219514344.0, + "step": 10101 + }, + { + "epoch": 1.2004753416518121, + "grad_norm": 0.28703042671040513, + "learning_rate": 8.548598232727243e-06, + "loss": 0.8383, + "num_tokens": 42223703005.0, + "step": 10102 + }, + { + "epoch": 1.2005941770647652, + "grad_norm": 0.3263667660632007, + "learning_rate": 8.546931797837366e-06, + "loss": 0.8231, + "num_tokens": 42227891392.0, + "step": 10103 + }, + { + "epoch": 1.2007130124777183, + "grad_norm": 0.29778413070620596, + "learning_rate": 8.545265453792894e-06, + "loss": 0.8079, + "num_tokens": 42232080364.0, + "step": 10104 + }, + { + "epoch": 1.2008318478906714, + "grad_norm": 0.30816554183520223, + "learning_rate": 8.543599200655544e-06, + "loss": 0.8271, + "num_tokens": 42236268569.0, + "step": 10105 + }, + { + "epoch": 1.2009506833036245, + "grad_norm": 0.2996707292239065, + "learning_rate": 8.541933038487016e-06, + "loss": 0.8123, + "num_tokens": 42240442050.0, + "step": 10106 + }, + { + "epoch": 1.2010695187165776, + "grad_norm": 0.30573170071950245, + "learning_rate": 8.540266967349025e-06, + "loss": 0.8283, + "num_tokens": 42244613987.0, + "step": 10107 + }, + { + "epoch": 1.2011883541295305, + "grad_norm": 0.3093539766865417, + "learning_rate": 8.538600987303263e-06, + "loss": 0.8445, + "num_tokens": 42248775387.0, + "step": 10108 + }, + { + "epoch": 1.2013071895424836, + "grad_norm": 0.34067938125575065, + "learning_rate": 8.536935098411427e-06, + "loss": 0.7936, + "num_tokens": 42252966256.0, + "step": 10109 + }, + { + "epoch": 1.2014260249554367, + "grad_norm": 0.29448796979617436, + "learning_rate": 8.535269300735214e-06, + "loss": 0.8272, + "num_tokens": 42257154527.0, + "step": 10110 + }, + { + "epoch": 1.2015448603683898, + "grad_norm": 0.3323169274423677, + "learning_rate": 8.533603594336311e-06, + "loss": 0.8253, + "num_tokens": 42261328116.0, + "step": 10111 + }, + { + "epoch": 1.201663695781343, + "grad_norm": 0.2835853644958561, + "learning_rate": 8.531937979276408e-06, + "loss": 0.8124, + "num_tokens": 42265516315.0, + "step": 10112 + }, + { + "epoch": 1.2017825311942958, + "grad_norm": 0.3115646781581004, + "learning_rate": 8.530272455617183e-06, + "loss": 0.8144, + "num_tokens": 42269653506.0, + "step": 10113 + }, + { + "epoch": 1.2019013666072489, + "grad_norm": 0.29922409842171394, + "learning_rate": 8.528607023420321e-06, + "loss": 0.8205, + "num_tokens": 42273836600.0, + "step": 10114 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.3128733377784479, + "learning_rate": 8.526941682747495e-06, + "loss": 0.8293, + "num_tokens": 42278024339.0, + "step": 10115 + }, + { + "epoch": 1.202139037433155, + "grad_norm": 0.284536705268873, + "learning_rate": 8.525276433660383e-06, + "loss": 0.7963, + "num_tokens": 42282214348.0, + "step": 10116 + }, + { + "epoch": 1.2022578728461082, + "grad_norm": 0.3009922509788051, + "learning_rate": 8.523611276220654e-06, + "loss": 0.8188, + "num_tokens": 42286382683.0, + "step": 10117 + }, + { + "epoch": 1.2023767082590613, + "grad_norm": 0.32686735836932157, + "learning_rate": 8.521946210489969e-06, + "loss": 0.8114, + "num_tokens": 42290571445.0, + "step": 10118 + }, + { + "epoch": 1.2024955436720142, + "grad_norm": 0.3136596013484344, + "learning_rate": 8.52028123652999e-06, + "loss": 0.8118, + "num_tokens": 42294760838.0, + "step": 10119 + }, + { + "epoch": 1.2026143790849673, + "grad_norm": 0.3311780618679906, + "learning_rate": 8.518616354402387e-06, + "loss": 0.8317, + "num_tokens": 42298950486.0, + "step": 10120 + }, + { + "epoch": 1.2027332144979204, + "grad_norm": 0.28993429527807263, + "learning_rate": 8.51695156416881e-06, + "loss": 0.8415, + "num_tokens": 42303117844.0, + "step": 10121 + }, + { + "epoch": 1.2028520499108735, + "grad_norm": 0.27301629653623044, + "learning_rate": 8.51528686589091e-06, + "loss": 0.8091, + "num_tokens": 42307307731.0, + "step": 10122 + }, + { + "epoch": 1.2029708853238266, + "grad_norm": 0.32047678169748445, + "learning_rate": 8.513622259630341e-06, + "loss": 0.7941, + "num_tokens": 42311497595.0, + "step": 10123 + }, + { + "epoch": 1.2030897207367794, + "grad_norm": 0.3390152783148984, + "learning_rate": 8.511957745448743e-06, + "loss": 0.8866, + "num_tokens": 42315685390.0, + "step": 10124 + }, + { + "epoch": 1.2032085561497325, + "grad_norm": 0.2748314736495869, + "learning_rate": 8.510293323407762e-06, + "loss": 0.8109, + "num_tokens": 42319874694.0, + "step": 10125 + }, + { + "epoch": 1.2033273915626856, + "grad_norm": 0.34210690989733855, + "learning_rate": 8.508628993569037e-06, + "loss": 0.8155, + "num_tokens": 42324034151.0, + "step": 10126 + }, + { + "epoch": 1.2034462269756387, + "grad_norm": 0.296598292395609, + "learning_rate": 8.506964755994207e-06, + "loss": 0.8153, + "num_tokens": 42328204559.0, + "step": 10127 + }, + { + "epoch": 1.2035650623885918, + "grad_norm": 0.2920949236857192, + "learning_rate": 8.5053006107449e-06, + "loss": 0.8154, + "num_tokens": 42332372587.0, + "step": 10128 + }, + { + "epoch": 1.203683897801545, + "grad_norm": 0.33336864084538015, + "learning_rate": 8.503636557882746e-06, + "loss": 0.8024, + "num_tokens": 42336532641.0, + "step": 10129 + }, + { + "epoch": 1.203802733214498, + "grad_norm": 0.30905436272538506, + "learning_rate": 8.501972597469373e-06, + "loss": 0.8169, + "num_tokens": 42340720366.0, + "step": 10130 + }, + { + "epoch": 1.203921568627451, + "grad_norm": 0.28532367917003576, + "learning_rate": 8.500308729566397e-06, + "loss": 0.8126, + "num_tokens": 42344909118.0, + "step": 10131 + }, + { + "epoch": 1.204040404040404, + "grad_norm": 0.3571015894893755, + "learning_rate": 8.49864495423544e-06, + "loss": 0.8285, + "num_tokens": 42349099017.0, + "step": 10132 + }, + { + "epoch": 1.2041592394533571, + "grad_norm": 0.2908946695768447, + "learning_rate": 8.496981271538119e-06, + "loss": 0.8077, + "num_tokens": 42353288410.0, + "step": 10133 + }, + { + "epoch": 1.2042780748663102, + "grad_norm": 0.2736669636779207, + "learning_rate": 8.495317681536045e-06, + "loss": 0.824, + "num_tokens": 42357478660.0, + "step": 10134 + }, + { + "epoch": 1.204396910279263, + "grad_norm": 0.3170043826236264, + "learning_rate": 8.493654184290827e-06, + "loss": 0.8273, + "num_tokens": 42361668434.0, + "step": 10135 + }, + { + "epoch": 1.2045157456922162, + "grad_norm": 0.3080225050295516, + "learning_rate": 8.491990779864068e-06, + "loss": 0.8403, + "num_tokens": 42365856482.0, + "step": 10136 + }, + { + "epoch": 1.2046345811051693, + "grad_norm": 0.3118036607654869, + "learning_rate": 8.490327468317367e-06, + "loss": 0.8386, + "num_tokens": 42370044082.0, + "step": 10137 + }, + { + "epoch": 1.2047534165181224, + "grad_norm": 0.2915811383296929, + "learning_rate": 8.488664249712327e-06, + "loss": 0.8172, + "num_tokens": 42374235168.0, + "step": 10138 + }, + { + "epoch": 1.2048722519310755, + "grad_norm": 0.28697466068638666, + "learning_rate": 8.487001124110542e-06, + "loss": 0.8345, + "num_tokens": 42378415344.0, + "step": 10139 + }, + { + "epoch": 1.2049910873440286, + "grad_norm": 0.3230043444892055, + "learning_rate": 8.4853380915736e-06, + "loss": 0.8026, + "num_tokens": 42382605585.0, + "step": 10140 + }, + { + "epoch": 1.2051099227569817, + "grad_norm": 0.27353599773936643, + "learning_rate": 8.483675152163094e-06, + "loss": 0.7885, + "num_tokens": 42386795191.0, + "step": 10141 + }, + { + "epoch": 1.2052287581699346, + "grad_norm": 0.31451268703721935, + "learning_rate": 8.482012305940603e-06, + "loss": 0.8301, + "num_tokens": 42390965031.0, + "step": 10142 + }, + { + "epoch": 1.2053475935828877, + "grad_norm": 0.3093648599375526, + "learning_rate": 8.480349552967712e-06, + "loss": 0.8151, + "num_tokens": 42395154827.0, + "step": 10143 + }, + { + "epoch": 1.2054664289958408, + "grad_norm": 0.27555044751364, + "learning_rate": 8.47868689330599e-06, + "loss": 0.8284, + "num_tokens": 42399316657.0, + "step": 10144 + }, + { + "epoch": 1.2055852644087939, + "grad_norm": 0.30751792776055115, + "learning_rate": 8.477024327017023e-06, + "loss": 0.8206, + "num_tokens": 42403506105.0, + "step": 10145 + }, + { + "epoch": 1.205704099821747, + "grad_norm": 0.29893072186971037, + "learning_rate": 8.475361854162375e-06, + "loss": 0.8013, + "num_tokens": 42407663938.0, + "step": 10146 + }, + { + "epoch": 1.2058229352346999, + "grad_norm": 0.3263281674481674, + "learning_rate": 8.473699474803611e-06, + "loss": 0.7881, + "num_tokens": 42411854357.0, + "step": 10147 + }, + { + "epoch": 1.205941770647653, + "grad_norm": 0.29491657075360284, + "learning_rate": 8.4720371890023e-06, + "loss": 0.8487, + "num_tokens": 42415973867.0, + "step": 10148 + }, + { + "epoch": 1.206060606060606, + "grad_norm": 0.35960567836000024, + "learning_rate": 8.470374996819997e-06, + "loss": 0.7793, + "num_tokens": 42420132588.0, + "step": 10149 + }, + { + "epoch": 1.2061794414735592, + "grad_norm": 0.30455512851954425, + "learning_rate": 8.468712898318263e-06, + "loss": 0.8584, + "num_tokens": 42424313825.0, + "step": 10150 + }, + { + "epoch": 1.2062982768865123, + "grad_norm": 0.33982495413107416, + "learning_rate": 8.467050893558643e-06, + "loss": 0.7786, + "num_tokens": 42428423601.0, + "step": 10151 + }, + { + "epoch": 1.2064171122994654, + "grad_norm": 0.31935070006130306, + "learning_rate": 8.465388982602697e-06, + "loss": 0.7777, + "num_tokens": 42432591518.0, + "step": 10152 + }, + { + "epoch": 1.2065359477124182, + "grad_norm": 0.3112076012354203, + "learning_rate": 8.463727165511968e-06, + "loss": 0.823, + "num_tokens": 42436780673.0, + "step": 10153 + }, + { + "epoch": 1.2066547831253713, + "grad_norm": 0.3098651839086909, + "learning_rate": 8.462065442347995e-06, + "loss": 0.8136, + "num_tokens": 42440936492.0, + "step": 10154 + }, + { + "epoch": 1.2067736185383244, + "grad_norm": 0.33487491339009695, + "learning_rate": 8.460403813172322e-06, + "loss": 0.8329, + "num_tokens": 42445126588.0, + "step": 10155 + }, + { + "epoch": 1.2068924539512775, + "grad_norm": 0.3378962118918642, + "learning_rate": 8.458742278046484e-06, + "loss": 0.8291, + "num_tokens": 42449293593.0, + "step": 10156 + }, + { + "epoch": 1.2070112893642306, + "grad_norm": 0.2935939919366822, + "learning_rate": 8.457080837032003e-06, + "loss": 0.7997, + "num_tokens": 42453481828.0, + "step": 10157 + }, + { + "epoch": 1.2071301247771835, + "grad_norm": 0.3519420419223025, + "learning_rate": 8.455419490190424e-06, + "loss": 0.8107, + "num_tokens": 42457671808.0, + "step": 10158 + }, + { + "epoch": 1.2072489601901366, + "grad_norm": 0.28604310543714756, + "learning_rate": 8.453758237583262e-06, + "loss": 0.8152, + "num_tokens": 42461853873.0, + "step": 10159 + }, + { + "epoch": 1.2073677956030897, + "grad_norm": 0.3775779782273005, + "learning_rate": 8.452097079272042e-06, + "loss": 0.8097, + "num_tokens": 42466039842.0, + "step": 10160 + }, + { + "epoch": 1.2074866310160428, + "grad_norm": 0.3241821195231881, + "learning_rate": 8.450436015318284e-06, + "loss": 0.7857, + "num_tokens": 42470221380.0, + "step": 10161 + }, + { + "epoch": 1.207605466428996, + "grad_norm": 0.3329421506694339, + "learning_rate": 8.448775045783497e-06, + "loss": 0.8259, + "num_tokens": 42474409176.0, + "step": 10162 + }, + { + "epoch": 1.207724301841949, + "grad_norm": 0.3370640502898402, + "learning_rate": 8.447114170729198e-06, + "loss": 0.8565, + "num_tokens": 42478598848.0, + "step": 10163 + }, + { + "epoch": 1.2078431372549019, + "grad_norm": 0.3611419042855428, + "learning_rate": 8.445453390216892e-06, + "loss": 0.7787, + "num_tokens": 42482763128.0, + "step": 10164 + }, + { + "epoch": 1.207961972667855, + "grad_norm": 0.311669048333569, + "learning_rate": 8.443792704308082e-06, + "loss": 0.8171, + "num_tokens": 42486945188.0, + "step": 10165 + }, + { + "epoch": 1.208080808080808, + "grad_norm": 0.33929811905729446, + "learning_rate": 8.442132113064276e-06, + "loss": 0.8454, + "num_tokens": 42491133273.0, + "step": 10166 + }, + { + "epoch": 1.2081996434937612, + "grad_norm": 0.3286256865735976, + "learning_rate": 8.440471616546963e-06, + "loss": 0.8243, + "num_tokens": 42495321667.0, + "step": 10167 + }, + { + "epoch": 1.2083184789067143, + "grad_norm": 0.3250251035995467, + "learning_rate": 8.438811214817642e-06, + "loss": 0.8174, + "num_tokens": 42499509539.0, + "step": 10168 + }, + { + "epoch": 1.2084373143196672, + "grad_norm": 0.31824132918593834, + "learning_rate": 8.4371509079378e-06, + "loss": 0.811, + "num_tokens": 42503661281.0, + "step": 10169 + }, + { + "epoch": 1.2085561497326203, + "grad_norm": 0.31278105618302443, + "learning_rate": 8.435490695968922e-06, + "loss": 0.8426, + "num_tokens": 42507850891.0, + "step": 10170 + }, + { + "epoch": 1.2086749851455734, + "grad_norm": 0.335966504797199, + "learning_rate": 8.433830578972497e-06, + "loss": 0.818, + "num_tokens": 42512005388.0, + "step": 10171 + }, + { + "epoch": 1.2087938205585265, + "grad_norm": 0.30117205340531644, + "learning_rate": 8.432170557010004e-06, + "loss": 0.8603, + "num_tokens": 42516194461.0, + "step": 10172 + }, + { + "epoch": 1.2089126559714796, + "grad_norm": 0.320472983040954, + "learning_rate": 8.430510630142915e-06, + "loss": 0.8318, + "num_tokens": 42520383061.0, + "step": 10173 + }, + { + "epoch": 1.2090314913844327, + "grad_norm": 0.3109353248346881, + "learning_rate": 8.428850798432709e-06, + "loss": 0.8161, + "num_tokens": 42524569751.0, + "step": 10174 + }, + { + "epoch": 1.2091503267973855, + "grad_norm": 0.3190220011172881, + "learning_rate": 8.427191061940844e-06, + "loss": 0.8322, + "num_tokens": 42528758752.0, + "step": 10175 + }, + { + "epoch": 1.2092691622103386, + "grad_norm": 0.30106259766150845, + "learning_rate": 8.425531420728796e-06, + "loss": 0.8126, + "num_tokens": 42532940370.0, + "step": 10176 + }, + { + "epoch": 1.2093879976232917, + "grad_norm": 0.3031753571892407, + "learning_rate": 8.423871874858025e-06, + "loss": 0.8169, + "num_tokens": 42537129009.0, + "step": 10177 + }, + { + "epoch": 1.2095068330362448, + "grad_norm": 0.3327097027450646, + "learning_rate": 8.422212424389988e-06, + "loss": 0.8199, + "num_tokens": 42541318671.0, + "step": 10178 + }, + { + "epoch": 1.209625668449198, + "grad_norm": 0.2740132952810555, + "learning_rate": 8.420553069386144e-06, + "loss": 0.7984, + "num_tokens": 42545508940.0, + "step": 10179 + }, + { + "epoch": 1.2097445038621508, + "grad_norm": 0.26687074620383033, + "learning_rate": 8.418893809907938e-06, + "loss": 0.7974, + "num_tokens": 42549699010.0, + "step": 10180 + }, + { + "epoch": 1.209863339275104, + "grad_norm": 0.3056662493487483, + "learning_rate": 8.41723464601682e-06, + "loss": 0.8092, + "num_tokens": 42553879079.0, + "step": 10181 + }, + { + "epoch": 1.209982174688057, + "grad_norm": 0.2901895675654441, + "learning_rate": 8.415575577774239e-06, + "loss": 0.7883, + "num_tokens": 42558067611.0, + "step": 10182 + }, + { + "epoch": 1.2101010101010101, + "grad_norm": 0.2916624557643366, + "learning_rate": 8.413916605241625e-06, + "loss": 0.8371, + "num_tokens": 42562256894.0, + "step": 10183 + }, + { + "epoch": 1.2102198455139632, + "grad_norm": 0.2944815488320754, + "learning_rate": 8.41225772848043e-06, + "loss": 0.7725, + "num_tokens": 42566445183.0, + "step": 10184 + }, + { + "epoch": 1.2103386809269163, + "grad_norm": 0.30121858016888925, + "learning_rate": 8.41059894755208e-06, + "loss": 0.7761, + "num_tokens": 42570611338.0, + "step": 10185 + }, + { + "epoch": 1.2104575163398692, + "grad_norm": 0.35413502474890035, + "learning_rate": 8.408940262518004e-06, + "loss": 0.8164, + "num_tokens": 42574800064.0, + "step": 10186 + }, + { + "epoch": 1.2105763517528223, + "grad_norm": 0.2749897605799861, + "learning_rate": 8.407281673439633e-06, + "loss": 0.8188, + "num_tokens": 42578959533.0, + "step": 10187 + }, + { + "epoch": 1.2106951871657754, + "grad_norm": 0.3019797334841863, + "learning_rate": 8.405623180378386e-06, + "loss": 0.794, + "num_tokens": 42583149130.0, + "step": 10188 + }, + { + "epoch": 1.2108140225787285, + "grad_norm": 0.32830745968614783, + "learning_rate": 8.403964783395681e-06, + "loss": 0.809, + "num_tokens": 42587337041.0, + "step": 10189 + }, + { + "epoch": 1.2109328579916816, + "grad_norm": 0.2949677570151188, + "learning_rate": 8.402306482552943e-06, + "loss": 0.7985, + "num_tokens": 42591525030.0, + "step": 10190 + }, + { + "epoch": 1.2110516934046345, + "grad_norm": 0.34399322983741715, + "learning_rate": 8.400648277911576e-06, + "loss": 0.8344, + "num_tokens": 42595707170.0, + "step": 10191 + }, + { + "epoch": 1.2111705288175876, + "grad_norm": 0.30225736045775053, + "learning_rate": 8.398990169532995e-06, + "loss": 0.786, + "num_tokens": 42599895178.0, + "step": 10192 + }, + { + "epoch": 1.2112893642305407, + "grad_norm": 0.3950161170411997, + "learning_rate": 8.397332157478601e-06, + "loss": 0.7771, + "num_tokens": 42604054967.0, + "step": 10193 + }, + { + "epoch": 1.2114081996434938, + "grad_norm": 0.2987034774601315, + "learning_rate": 8.395674241809794e-06, + "loss": 0.8124, + "num_tokens": 42608243653.0, + "step": 10194 + }, + { + "epoch": 1.2115270350564469, + "grad_norm": 0.396343272999819, + "learning_rate": 8.394016422587976e-06, + "loss": 0.8655, + "num_tokens": 42612433527.0, + "step": 10195 + }, + { + "epoch": 1.2116458704694, + "grad_norm": 0.3244618621758327, + "learning_rate": 8.392358699874543e-06, + "loss": 0.8265, + "num_tokens": 42616622838.0, + "step": 10196 + }, + { + "epoch": 1.2117647058823529, + "grad_norm": 0.42090485599941435, + "learning_rate": 8.390701073730884e-06, + "loss": 0.8109, + "num_tokens": 42620812583.0, + "step": 10197 + }, + { + "epoch": 1.211883541295306, + "grad_norm": 0.384526719066099, + "learning_rate": 8.389043544218387e-06, + "loss": 0.8096, + "num_tokens": 42624998779.0, + "step": 10198 + }, + { + "epoch": 1.212002376708259, + "grad_norm": 0.36647847416527685, + "learning_rate": 8.387386111398435e-06, + "loss": 0.8274, + "num_tokens": 42629187192.0, + "step": 10199 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.3434750764328641, + "learning_rate": 8.38572877533241e-06, + "loss": 0.8534, + "num_tokens": 42633328319.0, + "step": 10200 + }, + { + "epoch": 1.2122400475341653, + "grad_norm": 0.3818187243323929, + "learning_rate": 8.384071536081684e-06, + "loss": 0.808, + "num_tokens": 42637518956.0, + "step": 10201 + }, + { + "epoch": 1.2123588829471181, + "grad_norm": 0.31628022813505563, + "learning_rate": 8.382414393707634e-06, + "loss": 0.7822, + "num_tokens": 42641671759.0, + "step": 10202 + }, + { + "epoch": 1.2124777183600712, + "grad_norm": 0.35018479542152153, + "learning_rate": 8.38075734827163e-06, + "loss": 0.7999, + "num_tokens": 42645859330.0, + "step": 10203 + }, + { + "epoch": 1.2125965537730243, + "grad_norm": 0.32573950285795455, + "learning_rate": 8.37910039983504e-06, + "loss": 0.8137, + "num_tokens": 42650047258.0, + "step": 10204 + }, + { + "epoch": 1.2127153891859774, + "grad_norm": 0.35476870442212266, + "learning_rate": 8.37744354845922e-06, + "loss": 0.8249, + "num_tokens": 42654235094.0, + "step": 10205 + }, + { + "epoch": 1.2128342245989305, + "grad_norm": 0.33793467834076274, + "learning_rate": 8.375786794205536e-06, + "loss": 0.7861, + "num_tokens": 42658424711.0, + "step": 10206 + }, + { + "epoch": 1.2129530600118836, + "grad_norm": 0.35063356382837974, + "learning_rate": 8.374130137135336e-06, + "loss": 0.8254, + "num_tokens": 42662614654.0, + "step": 10207 + }, + { + "epoch": 1.2130718954248365, + "grad_norm": 0.31764818150644564, + "learning_rate": 8.372473577309973e-06, + "loss": 0.7949, + "num_tokens": 42666804681.0, + "step": 10208 + }, + { + "epoch": 1.2131907308377896, + "grad_norm": 0.3471403783504381, + "learning_rate": 8.370817114790804e-06, + "loss": 0.8512, + "num_tokens": 42670995156.0, + "step": 10209 + }, + { + "epoch": 1.2133095662507427, + "grad_norm": 0.3144876630589363, + "learning_rate": 8.369160749639164e-06, + "loss": 0.8135, + "num_tokens": 42675183190.0, + "step": 10210 + }, + { + "epoch": 1.2134284016636958, + "grad_norm": 0.3275761619720701, + "learning_rate": 8.367504481916397e-06, + "loss": 0.8461, + "num_tokens": 42679372848.0, + "step": 10211 + }, + { + "epoch": 1.213547237076649, + "grad_norm": 0.31830732613251095, + "learning_rate": 8.36584831168384e-06, + "loss": 0.7968, + "num_tokens": 42683561754.0, + "step": 10212 + }, + { + "epoch": 1.2136660724896018, + "grad_norm": 0.3190481588473677, + "learning_rate": 8.364192239002826e-06, + "loss": 0.8013, + "num_tokens": 42687751324.0, + "step": 10213 + }, + { + "epoch": 1.2137849079025549, + "grad_norm": 0.3351036180857305, + "learning_rate": 8.362536263934682e-06, + "loss": 0.7909, + "num_tokens": 42691941520.0, + "step": 10214 + }, + { + "epoch": 1.213903743315508, + "grad_norm": 0.28912691730478995, + "learning_rate": 8.360880386540741e-06, + "loss": 0.7788, + "num_tokens": 42696129158.0, + "step": 10215 + }, + { + "epoch": 1.214022578728461, + "grad_norm": 0.31751066328387456, + "learning_rate": 8.359224606882324e-06, + "loss": 0.8242, + "num_tokens": 42700291506.0, + "step": 10216 + }, + { + "epoch": 1.2141414141414142, + "grad_norm": 0.28178082577599034, + "learning_rate": 8.357568925020745e-06, + "loss": 0.8115, + "num_tokens": 42704480006.0, + "step": 10217 + }, + { + "epoch": 1.2142602495543673, + "grad_norm": 0.2719786087806548, + "learning_rate": 8.355913341017326e-06, + "loss": 0.7878, + "num_tokens": 42708649406.0, + "step": 10218 + }, + { + "epoch": 1.2143790849673202, + "grad_norm": 0.30124267324195103, + "learning_rate": 8.354257854933372e-06, + "loss": 0.8219, + "num_tokens": 42712838063.0, + "step": 10219 + }, + { + "epoch": 1.2144979203802733, + "grad_norm": 0.25543974440780076, + "learning_rate": 8.352602466830197e-06, + "loss": 0.8013, + "num_tokens": 42717026871.0, + "step": 10220 + }, + { + "epoch": 1.2146167557932264, + "grad_norm": 0.2893691805149128, + "learning_rate": 8.350947176769102e-06, + "loss": 0.7971, + "num_tokens": 42721216874.0, + "step": 10221 + }, + { + "epoch": 1.2147355912061795, + "grad_norm": 0.2924665850422006, + "learning_rate": 8.349291984811391e-06, + "loss": 0.8622, + "num_tokens": 42725395870.0, + "step": 10222 + }, + { + "epoch": 1.2148544266191326, + "grad_norm": 0.3188727081707958, + "learning_rate": 8.347636891018363e-06, + "loss": 0.8343, + "num_tokens": 42729569010.0, + "step": 10223 + }, + { + "epoch": 1.2149732620320854, + "grad_norm": 0.28374563201443304, + "learning_rate": 8.345981895451305e-06, + "loss": 0.7886, + "num_tokens": 42733726870.0, + "step": 10224 + }, + { + "epoch": 1.2150920974450385, + "grad_norm": 0.31620377419825524, + "learning_rate": 8.344326998171513e-06, + "loss": 0.8176, + "num_tokens": 42737862527.0, + "step": 10225 + }, + { + "epoch": 1.2152109328579916, + "grad_norm": 0.32911356688272025, + "learning_rate": 8.342672199240271e-06, + "loss": 0.7812, + "num_tokens": 42742030722.0, + "step": 10226 + }, + { + "epoch": 1.2153297682709447, + "grad_norm": 0.3252700511001936, + "learning_rate": 8.341017498718859e-06, + "loss": 0.8247, + "num_tokens": 42746220816.0, + "step": 10227 + }, + { + "epoch": 1.2154486036838978, + "grad_norm": 0.30658006687546896, + "learning_rate": 8.339362896668564e-06, + "loss": 0.7677, + "num_tokens": 42750375480.0, + "step": 10228 + }, + { + "epoch": 1.215567439096851, + "grad_norm": 0.30212514381608857, + "learning_rate": 8.337708393150654e-06, + "loss": 0.7952, + "num_tokens": 42754533703.0, + "step": 10229 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 0.342354698088006, + "learning_rate": 8.336053988226405e-06, + "loss": 0.8387, + "num_tokens": 42758722179.0, + "step": 10230 + }, + { + "epoch": 1.215805109922757, + "grad_norm": 0.2765166068953731, + "learning_rate": 8.334399681957086e-06, + "loss": 0.8366, + "num_tokens": 42762910052.0, + "step": 10231 + }, + { + "epoch": 1.21592394533571, + "grad_norm": 0.3144376560266639, + "learning_rate": 8.332745474403955e-06, + "loss": 0.8375, + "num_tokens": 42767099957.0, + "step": 10232 + }, + { + "epoch": 1.2160427807486631, + "grad_norm": 0.29316068952984625, + "learning_rate": 8.331091365628276e-06, + "loss": 0.8425, + "num_tokens": 42771284969.0, + "step": 10233 + }, + { + "epoch": 1.2161616161616162, + "grad_norm": 0.30214914105820734, + "learning_rate": 8.329437355691311e-06, + "loss": 0.816, + "num_tokens": 42775474511.0, + "step": 10234 + }, + { + "epoch": 1.216280451574569, + "grad_norm": 0.3025337365434446, + "learning_rate": 8.32778344465431e-06, + "loss": 0.8392, + "num_tokens": 42779663354.0, + "step": 10235 + }, + { + "epoch": 1.2163992869875222, + "grad_norm": 0.2893266012343456, + "learning_rate": 8.326129632578524e-06, + "loss": 0.8428, + "num_tokens": 42783852507.0, + "step": 10236 + }, + { + "epoch": 1.2165181224004753, + "grad_norm": 0.2741556755099477, + "learning_rate": 8.324475919525196e-06, + "loss": 0.8362, + "num_tokens": 42788042622.0, + "step": 10237 + }, + { + "epoch": 1.2166369578134284, + "grad_norm": 0.2835094708681731, + "learning_rate": 8.322822305555572e-06, + "loss": 0.8425, + "num_tokens": 42792220159.0, + "step": 10238 + }, + { + "epoch": 1.2167557932263815, + "grad_norm": 0.29975109689452456, + "learning_rate": 8.32116879073089e-06, + "loss": 0.8396, + "num_tokens": 42796410412.0, + "step": 10239 + }, + { + "epoch": 1.2168746286393346, + "grad_norm": 0.2773839529593816, + "learning_rate": 8.319515375112381e-06, + "loss": 0.8399, + "num_tokens": 42800575307.0, + "step": 10240 + }, + { + "epoch": 1.2169934640522877, + "grad_norm": 0.30016383987010703, + "learning_rate": 8.317862058761286e-06, + "loss": 0.8222, + "num_tokens": 42804731117.0, + "step": 10241 + }, + { + "epoch": 1.2171122994652406, + "grad_norm": 0.2809490731692603, + "learning_rate": 8.316208841738826e-06, + "loss": 0.8115, + "num_tokens": 42808921638.0, + "step": 10242 + }, + { + "epoch": 1.2172311348781937, + "grad_norm": 0.29843633918674595, + "learning_rate": 8.314555724106225e-06, + "loss": 0.7858, + "num_tokens": 42813087449.0, + "step": 10243 + }, + { + "epoch": 1.2173499702911468, + "grad_norm": 0.2796619538574679, + "learning_rate": 8.312902705924705e-06, + "loss": 0.7767, + "num_tokens": 42817246690.0, + "step": 10244 + }, + { + "epoch": 1.2174688057040999, + "grad_norm": 0.272933740065443, + "learning_rate": 8.31124978725549e-06, + "loss": 0.8033, + "num_tokens": 42821416279.0, + "step": 10245 + }, + { + "epoch": 1.217587641117053, + "grad_norm": 0.30651730529474935, + "learning_rate": 8.309596968159777e-06, + "loss": 0.8244, + "num_tokens": 42825605841.0, + "step": 10246 + }, + { + "epoch": 1.2177064765300059, + "grad_norm": 0.2668908700416886, + "learning_rate": 8.307944248698788e-06, + "loss": 0.8137, + "num_tokens": 42829795070.0, + "step": 10247 + }, + { + "epoch": 1.217825311942959, + "grad_norm": 0.27632399042856204, + "learning_rate": 8.306291628933724e-06, + "loss": 0.7961, + "num_tokens": 42833984867.0, + "step": 10248 + }, + { + "epoch": 1.217944147355912, + "grad_norm": 0.3111337655903833, + "learning_rate": 8.304639108925791e-06, + "loss": 0.7909, + "num_tokens": 42838172312.0, + "step": 10249 + }, + { + "epoch": 1.2180629827688652, + "grad_norm": 0.2698726631269631, + "learning_rate": 8.302986688736188e-06, + "loss": 0.8093, + "num_tokens": 42842363808.0, + "step": 10250 + }, + { + "epoch": 1.2181818181818183, + "grad_norm": 0.31338271290304265, + "learning_rate": 8.301334368426102e-06, + "loss": 0.8514, + "num_tokens": 42846551806.0, + "step": 10251 + }, + { + "epoch": 1.2183006535947714, + "grad_norm": 0.28214467134865706, + "learning_rate": 8.299682148056731e-06, + "loss": 0.8341, + "num_tokens": 42850739968.0, + "step": 10252 + }, + { + "epoch": 1.2184194890077242, + "grad_norm": 0.28750886550955124, + "learning_rate": 8.298030027689256e-06, + "loss": 0.8092, + "num_tokens": 42854928059.0, + "step": 10253 + }, + { + "epoch": 1.2185383244206773, + "grad_norm": 0.31618064617754227, + "learning_rate": 8.29637800738487e-06, + "loss": 0.7934, + "num_tokens": 42859118032.0, + "step": 10254 + }, + { + "epoch": 1.2186571598336304, + "grad_norm": 0.329998017003373, + "learning_rate": 8.294726087204745e-06, + "loss": 0.8395, + "num_tokens": 42863309095.0, + "step": 10255 + }, + { + "epoch": 1.2187759952465835, + "grad_norm": 0.32015297218155303, + "learning_rate": 8.29307426721006e-06, + "loss": 0.8085, + "num_tokens": 42867499254.0, + "step": 10256 + }, + { + "epoch": 1.2188948306595366, + "grad_norm": 0.29875042650135475, + "learning_rate": 8.291422547461987e-06, + "loss": 0.8367, + "num_tokens": 42871686897.0, + "step": 10257 + }, + { + "epoch": 1.2190136660724895, + "grad_norm": 0.31830117678633, + "learning_rate": 8.289770928021699e-06, + "loss": 0.7914, + "num_tokens": 42875875132.0, + "step": 10258 + }, + { + "epoch": 1.2191325014854426, + "grad_norm": 0.3095610577784555, + "learning_rate": 8.288119408950348e-06, + "loss": 0.8286, + "num_tokens": 42880065127.0, + "step": 10259 + }, + { + "epoch": 1.2192513368983957, + "grad_norm": 0.28538056278557566, + "learning_rate": 8.286467990309113e-06, + "loss": 0.8272, + "num_tokens": 42884254555.0, + "step": 10260 + }, + { + "epoch": 1.2193701723113488, + "grad_norm": 0.3191484567620241, + "learning_rate": 8.284816672159137e-06, + "loss": 0.8349, + "num_tokens": 42888442888.0, + "step": 10261 + }, + { + "epoch": 1.219489007724302, + "grad_norm": 0.2888332888224509, + "learning_rate": 8.283165454561583e-06, + "loss": 0.7933, + "num_tokens": 42892633341.0, + "step": 10262 + }, + { + "epoch": 1.219607843137255, + "grad_norm": 0.25241391338714425, + "learning_rate": 8.2815143375776e-06, + "loss": 0.7954, + "num_tokens": 42896814766.0, + "step": 10263 + }, + { + "epoch": 1.2197266785502079, + "grad_norm": 0.3460788422710407, + "learning_rate": 8.27986332126833e-06, + "loss": 0.8053, + "num_tokens": 42900977165.0, + "step": 10264 + }, + { + "epoch": 1.219845513963161, + "grad_norm": 0.2938277006328597, + "learning_rate": 8.278212405694917e-06, + "loss": 0.8408, + "num_tokens": 42905140230.0, + "step": 10265 + }, + { + "epoch": 1.219964349376114, + "grad_norm": 0.3024695879782789, + "learning_rate": 8.276561590918502e-06, + "loss": 0.8311, + "num_tokens": 42909329137.0, + "step": 10266 + }, + { + "epoch": 1.2200831847890672, + "grad_norm": 0.3446165997930821, + "learning_rate": 8.274910877000219e-06, + "loss": 0.8291, + "num_tokens": 42913517989.0, + "step": 10267 + }, + { + "epoch": 1.2202020202020203, + "grad_norm": 0.28910802386420514, + "learning_rate": 8.273260264001204e-06, + "loss": 0.8456, + "num_tokens": 42917708679.0, + "step": 10268 + }, + { + "epoch": 1.2203208556149732, + "grad_norm": 0.3442592963151758, + "learning_rate": 8.27160975198258e-06, + "loss": 0.8363, + "num_tokens": 42921898067.0, + "step": 10269 + }, + { + "epoch": 1.2204396910279263, + "grad_norm": 0.30106130267810377, + "learning_rate": 8.269959341005466e-06, + "loss": 0.8068, + "num_tokens": 42926085613.0, + "step": 10270 + }, + { + "epoch": 1.2205585264408794, + "grad_norm": 0.3076559506633455, + "learning_rate": 8.268309031130995e-06, + "loss": 0.8205, + "num_tokens": 42930264635.0, + "step": 10271 + }, + { + "epoch": 1.2206773618538325, + "grad_norm": 0.32140631286288635, + "learning_rate": 8.266658822420269e-06, + "loss": 0.7799, + "num_tokens": 42934454877.0, + "step": 10272 + }, + { + "epoch": 1.2207961972667856, + "grad_norm": 0.34224317101417323, + "learning_rate": 8.265008714934415e-06, + "loss": 0.8325, + "num_tokens": 42938640266.0, + "step": 10273 + }, + { + "epoch": 1.2209150326797387, + "grad_norm": 0.32104037264148516, + "learning_rate": 8.263358708734532e-06, + "loss": 0.8009, + "num_tokens": 42942775682.0, + "step": 10274 + }, + { + "epoch": 1.2210338680926915, + "grad_norm": 0.35565074101350447, + "learning_rate": 8.261708803881729e-06, + "loss": 0.8102, + "num_tokens": 42946942065.0, + "step": 10275 + }, + { + "epoch": 1.2211527035056446, + "grad_norm": 0.3090742971812437, + "learning_rate": 8.26005900043711e-06, + "loss": 0.7805, + "num_tokens": 42951130410.0, + "step": 10276 + }, + { + "epoch": 1.2212715389185977, + "grad_norm": 0.3483476623630139, + "learning_rate": 8.258409298461771e-06, + "loss": 0.8274, + "num_tokens": 42955317443.0, + "step": 10277 + }, + { + "epoch": 1.2213903743315508, + "grad_norm": 0.3410913785620576, + "learning_rate": 8.256759698016799e-06, + "loss": 0.8133, + "num_tokens": 42959505855.0, + "step": 10278 + }, + { + "epoch": 1.221509209744504, + "grad_norm": 0.29453004580697795, + "learning_rate": 8.255110199163295e-06, + "loss": 0.7988, + "num_tokens": 42963687592.0, + "step": 10279 + }, + { + "epoch": 1.2216280451574568, + "grad_norm": 0.34911342201421536, + "learning_rate": 8.253460801962341e-06, + "loss": 0.8137, + "num_tokens": 42967850422.0, + "step": 10280 + }, + { + "epoch": 1.22174688057041, + "grad_norm": 0.29471813522565765, + "learning_rate": 8.25181150647502e-06, + "loss": 0.8096, + "num_tokens": 42972026975.0, + "step": 10281 + }, + { + "epoch": 1.221865715983363, + "grad_norm": 0.3662677908354229, + "learning_rate": 8.250162312762408e-06, + "loss": 0.8185, + "num_tokens": 42976198545.0, + "step": 10282 + }, + { + "epoch": 1.2219845513963161, + "grad_norm": 0.3098527168824913, + "learning_rate": 8.248513220885586e-06, + "loss": 0.8243, + "num_tokens": 42980386249.0, + "step": 10283 + }, + { + "epoch": 1.2221033868092692, + "grad_norm": 0.3620270286313196, + "learning_rate": 8.24686423090562e-06, + "loss": 0.7849, + "num_tokens": 42984574780.0, + "step": 10284 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.32377969222373126, + "learning_rate": 8.245215342883582e-06, + "loss": 0.8372, + "num_tokens": 42988758731.0, + "step": 10285 + }, + { + "epoch": 1.2223410576351752, + "grad_norm": 0.31769531787947997, + "learning_rate": 8.243566556880533e-06, + "loss": 0.8363, + "num_tokens": 42992947138.0, + "step": 10286 + }, + { + "epoch": 1.2224598930481283, + "grad_norm": 0.36479566993657975, + "learning_rate": 8.241917872957537e-06, + "loss": 0.8081, + "num_tokens": 42997135464.0, + "step": 10287 + }, + { + "epoch": 1.2225787284610814, + "grad_norm": 0.27624946074483864, + "learning_rate": 8.240269291175643e-06, + "loss": 0.8074, + "num_tokens": 43001323935.0, + "step": 10288 + }, + { + "epoch": 1.2226975638740345, + "grad_norm": 0.35319574256692604, + "learning_rate": 8.238620811595912e-06, + "loss": 0.7962, + "num_tokens": 43005514113.0, + "step": 10289 + }, + { + "epoch": 1.2228163992869876, + "grad_norm": 0.30812645768765096, + "learning_rate": 8.236972434279387e-06, + "loss": 0.7894, + "num_tokens": 43009703654.0, + "step": 10290 + }, + { + "epoch": 1.2229352346999405, + "grad_norm": 0.31272539402835603, + "learning_rate": 8.23532415928711e-06, + "loss": 0.8003, + "num_tokens": 43013894001.0, + "step": 10291 + }, + { + "epoch": 1.2230540701128936, + "grad_norm": 0.36123932743378445, + "learning_rate": 8.233675986680134e-06, + "loss": 0.8012, + "num_tokens": 43018083259.0, + "step": 10292 + }, + { + "epoch": 1.2231729055258467, + "grad_norm": 0.3058139670924824, + "learning_rate": 8.232027916519485e-06, + "loss": 0.8048, + "num_tokens": 43022251209.0, + "step": 10293 + }, + { + "epoch": 1.2232917409387998, + "grad_norm": 0.40134279154552394, + "learning_rate": 8.230379948866205e-06, + "loss": 0.8419, + "num_tokens": 43026440730.0, + "step": 10294 + }, + { + "epoch": 1.2234105763517529, + "grad_norm": 0.3168530158333281, + "learning_rate": 8.228732083781319e-06, + "loss": 0.8257, + "num_tokens": 43030623112.0, + "step": 10295 + }, + { + "epoch": 1.223529411764706, + "grad_norm": 0.4308092137077416, + "learning_rate": 8.22708432132585e-06, + "loss": 0.816, + "num_tokens": 43034812155.0, + "step": 10296 + }, + { + "epoch": 1.2236482471776589, + "grad_norm": 0.3358499350779152, + "learning_rate": 8.225436661560826e-06, + "loss": 0.8386, + "num_tokens": 43039000634.0, + "step": 10297 + }, + { + "epoch": 1.223767082590612, + "grad_norm": 0.3646312268206809, + "learning_rate": 8.223789104547265e-06, + "loss": 0.7844, + "num_tokens": 43043190965.0, + "step": 10298 + }, + { + "epoch": 1.223885918003565, + "grad_norm": 0.3686903950683145, + "learning_rate": 8.22214165034618e-06, + "loss": 0.8198, + "num_tokens": 43047380844.0, + "step": 10299 + }, + { + "epoch": 1.2240047534165182, + "grad_norm": 0.2988415289851796, + "learning_rate": 8.22049429901858e-06, + "loss": 0.8033, + "num_tokens": 43051570402.0, + "step": 10300 + }, + { + "epoch": 1.2241235888294713, + "grad_norm": 0.4536755937830593, + "learning_rate": 8.218847050625476e-06, + "loss": 0.8197, + "num_tokens": 43055733837.0, + "step": 10301 + }, + { + "epoch": 1.2242424242424241, + "grad_norm": 0.33553967145607233, + "learning_rate": 8.217199905227869e-06, + "loss": 0.8011, + "num_tokens": 43059921824.0, + "step": 10302 + }, + { + "epoch": 1.2243612596553772, + "grad_norm": 0.48151721953915844, + "learning_rate": 8.215552862886753e-06, + "loss": 0.8268, + "num_tokens": 43064111251.0, + "step": 10303 + }, + { + "epoch": 1.2244800950683303, + "grad_norm": 0.35699218729946114, + "learning_rate": 8.213905923663135e-06, + "loss": 0.8503, + "num_tokens": 43068268360.0, + "step": 10304 + }, + { + "epoch": 1.2245989304812834, + "grad_norm": 0.4993922494799061, + "learning_rate": 8.212259087617998e-06, + "loss": 0.838, + "num_tokens": 43072455626.0, + "step": 10305 + }, + { + "epoch": 1.2247177658942365, + "grad_norm": 0.3966445656809172, + "learning_rate": 8.210612354812332e-06, + "loss": 0.8348, + "num_tokens": 43076645029.0, + "step": 10306 + }, + { + "epoch": 1.2248366013071896, + "grad_norm": 0.4827585803549273, + "learning_rate": 8.208965725307125e-06, + "loss": 0.8653, + "num_tokens": 43080832884.0, + "step": 10307 + }, + { + "epoch": 1.2249554367201425, + "grad_norm": 0.38272203641078906, + "learning_rate": 8.207319199163352e-06, + "loss": 0.8141, + "num_tokens": 43085002931.0, + "step": 10308 + }, + { + "epoch": 1.2250742721330956, + "grad_norm": 0.5200745884451105, + "learning_rate": 8.20567277644199e-06, + "loss": 0.8158, + "num_tokens": 43089168031.0, + "step": 10309 + }, + { + "epoch": 1.2251931075460487, + "grad_norm": 0.3993673990213236, + "learning_rate": 8.204026457204013e-06, + "loss": 0.8369, + "num_tokens": 43093356795.0, + "step": 10310 + }, + { + "epoch": 1.2253119429590018, + "grad_norm": 0.5146863931177191, + "learning_rate": 8.202380241510391e-06, + "loss": 0.8201, + "num_tokens": 43097544585.0, + "step": 10311 + }, + { + "epoch": 1.225430778371955, + "grad_norm": 0.4460479161265958, + "learning_rate": 8.20073412942209e-06, + "loss": 0.8091, + "num_tokens": 43101733656.0, + "step": 10312 + }, + { + "epoch": 1.2255496137849078, + "grad_norm": 0.4509652793038166, + "learning_rate": 8.199088121000066e-06, + "loss": 0.8344, + "num_tokens": 43105880386.0, + "step": 10313 + }, + { + "epoch": 1.2256684491978609, + "grad_norm": 0.39852627431858234, + "learning_rate": 8.197442216305278e-06, + "loss": 0.798, + "num_tokens": 43110069188.0, + "step": 10314 + }, + { + "epoch": 1.225787284610814, + "grad_norm": 0.4171760414290292, + "learning_rate": 8.195796415398685e-06, + "loss": 0.8089, + "num_tokens": 43114258623.0, + "step": 10315 + }, + { + "epoch": 1.225906120023767, + "grad_norm": 0.32942276549415944, + "learning_rate": 8.194150718341223e-06, + "loss": 0.8307, + "num_tokens": 43118429989.0, + "step": 10316 + }, + { + "epoch": 1.2260249554367202, + "grad_norm": 0.4371914193485585, + "learning_rate": 8.192505125193852e-06, + "loss": 0.8386, + "num_tokens": 43122619265.0, + "step": 10317 + }, + { + "epoch": 1.2261437908496733, + "grad_norm": 0.3628707359692105, + "learning_rate": 8.190859636017512e-06, + "loss": 0.8335, + "num_tokens": 43126790960.0, + "step": 10318 + }, + { + "epoch": 1.2262626262626264, + "grad_norm": 0.45665570989154763, + "learning_rate": 8.189214250873132e-06, + "loss": 0.8086, + "num_tokens": 43130952953.0, + "step": 10319 + }, + { + "epoch": 1.2263814616755793, + "grad_norm": 0.38591140501583593, + "learning_rate": 8.187568969821657e-06, + "loss": 0.7914, + "num_tokens": 43135141509.0, + "step": 10320 + }, + { + "epoch": 1.2265002970885324, + "grad_norm": 0.4617955896675704, + "learning_rate": 8.185923792924009e-06, + "loss": 0.8102, + "num_tokens": 43139319313.0, + "step": 10321 + }, + { + "epoch": 1.2266191325014855, + "grad_norm": 0.3836804641483692, + "learning_rate": 8.184278720241114e-06, + "loss": 0.8129, + "num_tokens": 43143477367.0, + "step": 10322 + }, + { + "epoch": 1.2267379679144386, + "grad_norm": 0.46930136516537607, + "learning_rate": 8.182633751833902e-06, + "loss": 0.8438, + "num_tokens": 43147616379.0, + "step": 10323 + }, + { + "epoch": 1.2268568033273914, + "grad_norm": 0.4105999914112453, + "learning_rate": 8.180988887763291e-06, + "loss": 0.7712, + "num_tokens": 43151805114.0, + "step": 10324 + }, + { + "epoch": 1.2269756387403445, + "grad_norm": 0.5063733093495462, + "learning_rate": 8.17934412809019e-06, + "loss": 0.831, + "num_tokens": 43155992962.0, + "step": 10325 + }, + { + "epoch": 1.2270944741532976, + "grad_norm": 0.4340923733967125, + "learning_rate": 8.177699472875511e-06, + "loss": 0.8165, + "num_tokens": 43160177879.0, + "step": 10326 + }, + { + "epoch": 1.2272133095662507, + "grad_norm": 0.4354940750516937, + "learning_rate": 8.176054922180166e-06, + "loss": 0.8407, + "num_tokens": 43164348138.0, + "step": 10327 + }, + { + "epoch": 1.2273321449792038, + "grad_norm": 0.4147024639812562, + "learning_rate": 8.174410476065056e-06, + "loss": 0.822, + "num_tokens": 43168538343.0, + "step": 10328 + }, + { + "epoch": 1.227450980392157, + "grad_norm": 0.3890307535217957, + "learning_rate": 8.172766134591073e-06, + "loss": 0.7963, + "num_tokens": 43172726091.0, + "step": 10329 + }, + { + "epoch": 1.22756981580511, + "grad_norm": 0.39828975783593373, + "learning_rate": 8.171121897819123e-06, + "loss": 0.796, + "num_tokens": 43176915372.0, + "step": 10330 + }, + { + "epoch": 1.227688651218063, + "grad_norm": 0.3550390372800195, + "learning_rate": 8.169477765810095e-06, + "loss": 0.8085, + "num_tokens": 43181103946.0, + "step": 10331 + }, + { + "epoch": 1.227807486631016, + "grad_norm": 0.37216492437946946, + "learning_rate": 8.167833738624875e-06, + "loss": 0.7963, + "num_tokens": 43185276903.0, + "step": 10332 + }, + { + "epoch": 1.2279263220439691, + "grad_norm": 0.36704737263301895, + "learning_rate": 8.166189816324349e-06, + "loss": 0.8538, + "num_tokens": 43189437366.0, + "step": 10333 + }, + { + "epoch": 1.2280451574569222, + "grad_norm": 0.27821089913098596, + "learning_rate": 8.164545998969393e-06, + "loss": 0.8258, + "num_tokens": 43193594961.0, + "step": 10334 + }, + { + "epoch": 1.228163992869875, + "grad_norm": 0.39135913555988855, + "learning_rate": 8.162902286620884e-06, + "loss": 0.8219, + "num_tokens": 43197784685.0, + "step": 10335 + }, + { + "epoch": 1.2282828282828282, + "grad_norm": 0.27846050894463464, + "learning_rate": 8.161258679339698e-06, + "loss": 0.795, + "num_tokens": 43201973321.0, + "step": 10336 + }, + { + "epoch": 1.2284016636957813, + "grad_norm": 0.463020609768817, + "learning_rate": 8.159615177186702e-06, + "loss": 0.812, + "num_tokens": 43206135908.0, + "step": 10337 + }, + { + "epoch": 1.2285204991087344, + "grad_norm": 0.3406240097191504, + "learning_rate": 8.157971780222757e-06, + "loss": 0.8435, + "num_tokens": 43210299000.0, + "step": 10338 + }, + { + "epoch": 1.2286393345216875, + "grad_norm": 0.4294868039605155, + "learning_rate": 8.15632848850873e-06, + "loss": 0.8111, + "num_tokens": 43214456850.0, + "step": 10339 + }, + { + "epoch": 1.2287581699346406, + "grad_norm": 0.37908672313963987, + "learning_rate": 8.15468530210547e-06, + "loss": 0.8357, + "num_tokens": 43218644703.0, + "step": 10340 + }, + { + "epoch": 1.2288770053475937, + "grad_norm": 0.4311118317844603, + "learning_rate": 8.153042221073837e-06, + "loss": 0.8299, + "num_tokens": 43222833040.0, + "step": 10341 + }, + { + "epoch": 1.2289958407605466, + "grad_norm": 0.39391701902697834, + "learning_rate": 8.151399245474671e-06, + "loss": 0.8789, + "num_tokens": 43226991955.0, + "step": 10342 + }, + { + "epoch": 1.2291146761734997, + "grad_norm": 0.4140803009034074, + "learning_rate": 8.14975637536883e-06, + "loss": 0.8108, + "num_tokens": 43231170384.0, + "step": 10343 + }, + { + "epoch": 1.2292335115864528, + "grad_norm": 0.38237950210630595, + "learning_rate": 8.14811361081714e-06, + "loss": 0.8024, + "num_tokens": 43235359358.0, + "step": 10344 + }, + { + "epoch": 1.2293523469994059, + "grad_norm": 0.42028902512227834, + "learning_rate": 8.146470951880448e-06, + "loss": 0.8385, + "num_tokens": 43239518728.0, + "step": 10345 + }, + { + "epoch": 1.229471182412359, + "grad_norm": 0.35444638233605946, + "learning_rate": 8.144828398619587e-06, + "loss": 0.8105, + "num_tokens": 43243685643.0, + "step": 10346 + }, + { + "epoch": 1.2295900178253119, + "grad_norm": 0.40727918658261264, + "learning_rate": 8.143185951095382e-06, + "loss": 0.8436, + "num_tokens": 43247871381.0, + "step": 10347 + }, + { + "epoch": 1.229708853238265, + "grad_norm": 0.3316424568768008, + "learning_rate": 8.14154360936866e-06, + "loss": 0.8209, + "num_tokens": 43252030401.0, + "step": 10348 + }, + { + "epoch": 1.229827688651218, + "grad_norm": 0.42122953700843774, + "learning_rate": 8.139901373500243e-06, + "loss": 0.8188, + "num_tokens": 43256194807.0, + "step": 10349 + }, + { + "epoch": 1.2299465240641712, + "grad_norm": 0.35445106420895267, + "learning_rate": 8.138259243550948e-06, + "loss": 0.8282, + "num_tokens": 43260356169.0, + "step": 10350 + }, + { + "epoch": 1.2300653594771243, + "grad_norm": 0.4388402047464217, + "learning_rate": 8.136617219581592e-06, + "loss": 0.8202, + "num_tokens": 43264545044.0, + "step": 10351 + }, + { + "epoch": 1.2301841948900774, + "grad_norm": 0.36061689563071786, + "learning_rate": 8.134975301652977e-06, + "loss": 0.8286, + "num_tokens": 43268715634.0, + "step": 10352 + }, + { + "epoch": 1.2303030303030302, + "grad_norm": 0.4268475618052991, + "learning_rate": 8.133333489825917e-06, + "loss": 0.8125, + "num_tokens": 43272905062.0, + "step": 10353 + }, + { + "epoch": 1.2304218657159833, + "grad_norm": 0.4164430036701456, + "learning_rate": 8.131691784161206e-06, + "loss": 0.8015, + "num_tokens": 43277094366.0, + "step": 10354 + }, + { + "epoch": 1.2305407011289364, + "grad_norm": 0.4029170032458475, + "learning_rate": 8.13005018471965e-06, + "loss": 0.8123, + "num_tokens": 43281252540.0, + "step": 10355 + }, + { + "epoch": 1.2306595365418895, + "grad_norm": 0.4177614120710691, + "learning_rate": 8.12840869156204e-06, + "loss": 0.8077, + "num_tokens": 43285442479.0, + "step": 10356 + }, + { + "epoch": 1.2307783719548426, + "grad_norm": 0.3545696767743071, + "learning_rate": 8.12676730474916e-06, + "loss": 0.8348, + "num_tokens": 43289632054.0, + "step": 10357 + }, + { + "epoch": 1.2308972073677955, + "grad_norm": 0.3505128235333942, + "learning_rate": 8.125126024341806e-06, + "loss": 0.8059, + "num_tokens": 43293822111.0, + "step": 10358 + }, + { + "epoch": 1.2310160427807486, + "grad_norm": 0.3957389117505189, + "learning_rate": 8.123484850400756e-06, + "loss": 0.826, + "num_tokens": 43298010197.0, + "step": 10359 + }, + { + "epoch": 1.2311348781937017, + "grad_norm": 0.3485794757670952, + "learning_rate": 8.121843782986789e-06, + "loss": 0.8487, + "num_tokens": 43302198250.0, + "step": 10360 + }, + { + "epoch": 1.2312537136066548, + "grad_norm": 0.39587035341450233, + "learning_rate": 8.120202822160673e-06, + "loss": 0.8105, + "num_tokens": 43306376544.0, + "step": 10361 + }, + { + "epoch": 1.231372549019608, + "grad_norm": 0.3845378821348033, + "learning_rate": 8.118561967983184e-06, + "loss": 0.8148, + "num_tokens": 43310566123.0, + "step": 10362 + }, + { + "epoch": 1.231491384432561, + "grad_norm": 0.42584315059975225, + "learning_rate": 8.116921220515088e-06, + "loss": 0.8217, + "num_tokens": 43314754239.0, + "step": 10363 + }, + { + "epoch": 1.2316102198455139, + "grad_norm": 0.37728180471645806, + "learning_rate": 8.115280579817151e-06, + "loss": 0.8232, + "num_tokens": 43318943514.0, + "step": 10364 + }, + { + "epoch": 1.231729055258467, + "grad_norm": 0.3915722412501351, + "learning_rate": 8.113640045950124e-06, + "loss": 0.8107, + "num_tokens": 43323113136.0, + "step": 10365 + }, + { + "epoch": 1.23184789067142, + "grad_norm": 0.3653480339147406, + "learning_rate": 8.111999618974766e-06, + "loss": 0.8232, + "num_tokens": 43327302792.0, + "step": 10366 + }, + { + "epoch": 1.2319667260843732, + "grad_norm": 0.37032704968600866, + "learning_rate": 8.110359298951826e-06, + "loss": 0.83, + "num_tokens": 43331464610.0, + "step": 10367 + }, + { + "epoch": 1.2320855614973263, + "grad_norm": 0.30165675797090646, + "learning_rate": 8.108719085942051e-06, + "loss": 0.8347, + "num_tokens": 43335614454.0, + "step": 10368 + }, + { + "epoch": 1.2322043969102792, + "grad_norm": 0.37919289255787436, + "learning_rate": 8.107078980006184e-06, + "loss": 0.7921, + "num_tokens": 43339802156.0, + "step": 10369 + }, + { + "epoch": 1.2323232323232323, + "grad_norm": 0.3125428234280377, + "learning_rate": 8.105438981204968e-06, + "loss": 0.8526, + "num_tokens": 43343991064.0, + "step": 10370 + }, + { + "epoch": 1.2324420677361854, + "grad_norm": 0.4012827282232379, + "learning_rate": 8.103799089599129e-06, + "loss": 0.8031, + "num_tokens": 43348179251.0, + "step": 10371 + }, + { + "epoch": 1.2325609031491385, + "grad_norm": 0.3525363054286098, + "learning_rate": 8.102159305249401e-06, + "loss": 0.8507, + "num_tokens": 43352368638.0, + "step": 10372 + }, + { + "epoch": 1.2326797385620916, + "grad_norm": 0.43868227660346537, + "learning_rate": 8.100519628216512e-06, + "loss": 0.8436, + "num_tokens": 43356558766.0, + "step": 10373 + }, + { + "epoch": 1.2327985739750447, + "grad_norm": 0.3600929789560612, + "learning_rate": 8.09888005856119e-06, + "loss": 0.8018, + "num_tokens": 43360725013.0, + "step": 10374 + }, + { + "epoch": 1.2329174093879975, + "grad_norm": 0.3884653315989409, + "learning_rate": 8.097240596344143e-06, + "loss": 0.8183, + "num_tokens": 43364910165.0, + "step": 10375 + }, + { + "epoch": 1.2330362448009506, + "grad_norm": 0.3727879018695196, + "learning_rate": 8.095601241626093e-06, + "loss": 0.8558, + "num_tokens": 43369098530.0, + "step": 10376 + }, + { + "epoch": 1.2331550802139037, + "grad_norm": 0.38090529526675043, + "learning_rate": 8.093961994467747e-06, + "loss": 0.8123, + "num_tokens": 43373272065.0, + "step": 10377 + }, + { + "epoch": 1.2332739156268568, + "grad_norm": 0.33626479990946057, + "learning_rate": 8.092322854929817e-06, + "loss": 0.7698, + "num_tokens": 43377462504.0, + "step": 10378 + }, + { + "epoch": 1.23339275103981, + "grad_norm": 0.42641817991990566, + "learning_rate": 8.090683823073003e-06, + "loss": 0.8455, + "num_tokens": 43381652444.0, + "step": 10379 + }, + { + "epoch": 1.2335115864527628, + "grad_norm": 0.3194763708156576, + "learning_rate": 8.089044898957998e-06, + "loss": 0.8083, + "num_tokens": 43385813240.0, + "step": 10380 + }, + { + "epoch": 1.233630421865716, + "grad_norm": 0.4010652333477987, + "learning_rate": 8.087406082645506e-06, + "loss": 0.7975, + "num_tokens": 43390002939.0, + "step": 10381 + }, + { + "epoch": 1.233749257278669, + "grad_norm": 0.3404120740592631, + "learning_rate": 8.085767374196216e-06, + "loss": 0.8182, + "num_tokens": 43394144584.0, + "step": 10382 + }, + { + "epoch": 1.2338680926916221, + "grad_norm": 0.4564802438848958, + "learning_rate": 8.084128773670815e-06, + "loss": 0.8024, + "num_tokens": 43398334592.0, + "step": 10383 + }, + { + "epoch": 1.2339869281045752, + "grad_norm": 0.38997792010405513, + "learning_rate": 8.08249028112998e-06, + "loss": 0.801, + "num_tokens": 43402520154.0, + "step": 10384 + }, + { + "epoch": 1.2341057635175283, + "grad_norm": 0.4373378246746871, + "learning_rate": 8.080851896634395e-06, + "loss": 0.8311, + "num_tokens": 43406708878.0, + "step": 10385 + }, + { + "epoch": 1.2342245989304812, + "grad_norm": 0.42343939845359735, + "learning_rate": 8.079213620244735e-06, + "loss": 0.8264, + "num_tokens": 43410896945.0, + "step": 10386 + }, + { + "epoch": 1.2343434343434343, + "grad_norm": 0.3708054312040225, + "learning_rate": 8.077575452021669e-06, + "loss": 0.8264, + "num_tokens": 43415068664.0, + "step": 10387 + }, + { + "epoch": 1.2344622697563874, + "grad_norm": 0.3894493128010184, + "learning_rate": 8.075937392025867e-06, + "loss": 0.8382, + "num_tokens": 43419196612.0, + "step": 10388 + }, + { + "epoch": 1.2345811051693405, + "grad_norm": 0.36609679088658953, + "learning_rate": 8.07429944031799e-06, + "loss": 0.8037, + "num_tokens": 43423386561.0, + "step": 10389 + }, + { + "epoch": 1.2346999405822936, + "grad_norm": 0.3490555544281615, + "learning_rate": 8.072661596958695e-06, + "loss": 0.8055, + "num_tokens": 43427576108.0, + "step": 10390 + }, + { + "epoch": 1.2348187759952465, + "grad_norm": 0.4069949828426185, + "learning_rate": 8.07102386200864e-06, + "loss": 0.8165, + "num_tokens": 43431765119.0, + "step": 10391 + }, + { + "epoch": 1.2349376114081996, + "grad_norm": 0.3486054898362972, + "learning_rate": 8.06938623552847e-06, + "loss": 0.8325, + "num_tokens": 43435953838.0, + "step": 10392 + }, + { + "epoch": 1.2350564468211527, + "grad_norm": 0.3958494456074698, + "learning_rate": 8.067748717578842e-06, + "loss": 0.8214, + "num_tokens": 43440111082.0, + "step": 10393 + }, + { + "epoch": 1.2351752822341058, + "grad_norm": 0.34145323156593377, + "learning_rate": 8.066111308220391e-06, + "loss": 0.7977, + "num_tokens": 43444284478.0, + "step": 10394 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.4230657445601158, + "learning_rate": 8.06447400751376e-06, + "loss": 0.8173, + "num_tokens": 43448474093.0, + "step": 10395 + }, + { + "epoch": 1.235412953060012, + "grad_norm": 0.3441459560653732, + "learning_rate": 8.062836815519581e-06, + "loss": 0.8021, + "num_tokens": 43452660128.0, + "step": 10396 + }, + { + "epoch": 1.2355317884729649, + "grad_norm": 0.3694266519390041, + "learning_rate": 8.061199732298483e-06, + "loss": 0.7821, + "num_tokens": 43456847967.0, + "step": 10397 + }, + { + "epoch": 1.235650623885918, + "grad_norm": 0.3635377877293925, + "learning_rate": 8.059562757911098e-06, + "loss": 0.7952, + "num_tokens": 43461036855.0, + "step": 10398 + }, + { + "epoch": 1.235769459298871, + "grad_norm": 0.36029087612089106, + "learning_rate": 8.057925892418041e-06, + "loss": 0.8261, + "num_tokens": 43465208630.0, + "step": 10399 + }, + { + "epoch": 1.2358882947118242, + "grad_norm": 0.3229895687668679, + "learning_rate": 8.05628913587994e-06, + "loss": 0.8161, + "num_tokens": 43469397716.0, + "step": 10400 + }, + { + "epoch": 1.2360071301247773, + "grad_norm": 0.4076592110719867, + "learning_rate": 8.054652488357406e-06, + "loss": 0.799, + "num_tokens": 43473582471.0, + "step": 10401 + }, + { + "epoch": 1.2361259655377301, + "grad_norm": 0.31185107281099034, + "learning_rate": 8.053015949911047e-06, + "loss": 0.8136, + "num_tokens": 43477753240.0, + "step": 10402 + }, + { + "epoch": 1.2362448009506832, + "grad_norm": 0.38040539848225946, + "learning_rate": 8.051379520601472e-06, + "loss": 0.8151, + "num_tokens": 43481942126.0, + "step": 10403 + }, + { + "epoch": 1.2363636363636363, + "grad_norm": 0.29234160496927236, + "learning_rate": 8.049743200489285e-06, + "loss": 0.8267, + "num_tokens": 43486103551.0, + "step": 10404 + }, + { + "epoch": 1.2364824717765894, + "grad_norm": 0.3959214671523348, + "learning_rate": 8.048106989635074e-06, + "loss": 0.8115, + "num_tokens": 43490230294.0, + "step": 10405 + }, + { + "epoch": 1.2366013071895425, + "grad_norm": 0.31125119067106427, + "learning_rate": 8.046470888099452e-06, + "loss": 0.7823, + "num_tokens": 43494418754.0, + "step": 10406 + }, + { + "epoch": 1.2367201426024956, + "grad_norm": 0.4036973339429635, + "learning_rate": 8.044834895942992e-06, + "loss": 0.7972, + "num_tokens": 43498596580.0, + "step": 10407 + }, + { + "epoch": 1.2368389780154485, + "grad_norm": 0.3746194030710493, + "learning_rate": 8.04319901322629e-06, + "loss": 0.7923, + "num_tokens": 43502785879.0, + "step": 10408 + }, + { + "epoch": 1.2369578134284016, + "grad_norm": 0.4045519996528383, + "learning_rate": 8.041563240009928e-06, + "loss": 0.8468, + "num_tokens": 43506945172.0, + "step": 10409 + }, + { + "epoch": 1.2370766488413547, + "grad_norm": 0.3338037131419673, + "learning_rate": 8.039927576354478e-06, + "loss": 0.7894, + "num_tokens": 43511133995.0, + "step": 10410 + }, + { + "epoch": 1.2371954842543078, + "grad_norm": 0.39762570222382027, + "learning_rate": 8.038292022320521e-06, + "loss": 0.8242, + "num_tokens": 43515322205.0, + "step": 10411 + }, + { + "epoch": 1.237314319667261, + "grad_norm": 0.35381420659193313, + "learning_rate": 8.036656577968622e-06, + "loss": 0.7755, + "num_tokens": 43519512005.0, + "step": 10412 + }, + { + "epoch": 1.2374331550802138, + "grad_norm": 0.372903003468217, + "learning_rate": 8.035021243359348e-06, + "loss": 0.8394, + "num_tokens": 43523698844.0, + "step": 10413 + }, + { + "epoch": 1.2375519904931669, + "grad_norm": 0.3686879694761771, + "learning_rate": 8.033386018553269e-06, + "loss": 0.8333, + "num_tokens": 43527887965.0, + "step": 10414 + }, + { + "epoch": 1.23767082590612, + "grad_norm": 0.3678579560603165, + "learning_rate": 8.031750903610931e-06, + "loss": 0.8209, + "num_tokens": 43532076557.0, + "step": 10415 + }, + { + "epoch": 1.237789661319073, + "grad_norm": 0.32880149059769365, + "learning_rate": 8.030115898592894e-06, + "loss": 0.8121, + "num_tokens": 43536267154.0, + "step": 10416 + }, + { + "epoch": 1.2379084967320262, + "grad_norm": 0.3358113326599186, + "learning_rate": 8.028481003559708e-06, + "loss": 0.8341, + "num_tokens": 43540438493.0, + "step": 10417 + }, + { + "epoch": 1.2380273321449793, + "grad_norm": 0.2962816263409768, + "learning_rate": 8.02684621857191e-06, + "loss": 0.7823, + "num_tokens": 43544626723.0, + "step": 10418 + }, + { + "epoch": 1.2381461675579324, + "grad_norm": 0.39677465325763633, + "learning_rate": 8.025211543690056e-06, + "loss": 0.7983, + "num_tokens": 43548812024.0, + "step": 10419 + }, + { + "epoch": 1.2382650029708853, + "grad_norm": 0.3105280132081769, + "learning_rate": 8.023576978974676e-06, + "loss": 0.8282, + "num_tokens": 43552994510.0, + "step": 10420 + }, + { + "epoch": 1.2383838383838384, + "grad_norm": 0.3690857786888871, + "learning_rate": 8.021942524486301e-06, + "loss": 0.8265, + "num_tokens": 43557153900.0, + "step": 10421 + }, + { + "epoch": 1.2385026737967915, + "grad_norm": 0.30515391829681565, + "learning_rate": 8.020308180285468e-06, + "loss": 0.8595, + "num_tokens": 43561344998.0, + "step": 10422 + }, + { + "epoch": 1.2386215092097446, + "grad_norm": 0.38243830528307043, + "learning_rate": 8.018673946432695e-06, + "loss": 0.8301, + "num_tokens": 43565511924.0, + "step": 10423 + }, + { + "epoch": 1.2387403446226974, + "grad_norm": 0.3194624722372509, + "learning_rate": 8.017039822988504e-06, + "loss": 0.8489, + "num_tokens": 43569669282.0, + "step": 10424 + }, + { + "epoch": 1.2388591800356505, + "grad_norm": 0.40414602686102824, + "learning_rate": 8.015405810013416e-06, + "loss": 0.7868, + "num_tokens": 43573849292.0, + "step": 10425 + }, + { + "epoch": 1.2389780154486036, + "grad_norm": 0.3657791284558929, + "learning_rate": 8.013771907567942e-06, + "loss": 0.7955, + "num_tokens": 43578039380.0, + "step": 10426 + }, + { + "epoch": 1.2390968508615567, + "grad_norm": 0.37217396139779685, + "learning_rate": 8.012138115712593e-06, + "loss": 0.8601, + "num_tokens": 43582228454.0, + "step": 10427 + }, + { + "epoch": 1.2392156862745098, + "grad_norm": 0.3561163406933032, + "learning_rate": 8.010504434507868e-06, + "loss": 0.8215, + "num_tokens": 43586416928.0, + "step": 10428 + }, + { + "epoch": 1.239334521687463, + "grad_norm": 0.3753971776197951, + "learning_rate": 8.008870864014276e-06, + "loss": 0.8227, + "num_tokens": 43590591883.0, + "step": 10429 + }, + { + "epoch": 1.239453357100416, + "grad_norm": 0.3581560008470733, + "learning_rate": 8.007237404292307e-06, + "loss": 0.8022, + "num_tokens": 43594716655.0, + "step": 10430 + }, + { + "epoch": 1.239572192513369, + "grad_norm": 0.33250060460644615, + "learning_rate": 8.005604055402453e-06, + "loss": 0.7748, + "num_tokens": 43598907209.0, + "step": 10431 + }, + { + "epoch": 1.239691027926322, + "grad_norm": 0.32523315363366606, + "learning_rate": 8.003970817405206e-06, + "loss": 0.7823, + "num_tokens": 43603094931.0, + "step": 10432 + }, + { + "epoch": 1.2398098633392751, + "grad_norm": 0.34647128291619333, + "learning_rate": 8.002337690361053e-06, + "loss": 0.8251, + "num_tokens": 43607263983.0, + "step": 10433 + }, + { + "epoch": 1.2399286987522282, + "grad_norm": 0.32064632846296376, + "learning_rate": 8.000704674330467e-06, + "loss": 0.8279, + "num_tokens": 43611423247.0, + "step": 10434 + }, + { + "epoch": 1.2400475341651813, + "grad_norm": 0.3026319426087547, + "learning_rate": 7.999071769373931e-06, + "loss": 0.8232, + "num_tokens": 43615610435.0, + "step": 10435 + }, + { + "epoch": 1.2401663695781342, + "grad_norm": 0.3413751070131464, + "learning_rate": 7.997438975551913e-06, + "loss": 0.7907, + "num_tokens": 43619801501.0, + "step": 10436 + }, + { + "epoch": 1.2402852049910873, + "grad_norm": 0.2934917210533046, + "learning_rate": 7.995806292924876e-06, + "loss": 0.8004, + "num_tokens": 43623989401.0, + "step": 10437 + }, + { + "epoch": 1.2404040404040404, + "grad_norm": 0.3510195517592556, + "learning_rate": 7.994173721553295e-06, + "loss": 0.8726, + "num_tokens": 43628178234.0, + "step": 10438 + }, + { + "epoch": 1.2405228758169935, + "grad_norm": 0.278953948474758, + "learning_rate": 7.992541261497623e-06, + "loss": 0.806, + "num_tokens": 43632366562.0, + "step": 10439 + }, + { + "epoch": 1.2406417112299466, + "grad_norm": 0.30158298196826444, + "learning_rate": 7.990908912818319e-06, + "loss": 0.8082, + "num_tokens": 43636555176.0, + "step": 10440 + }, + { + "epoch": 1.2407605466428997, + "grad_norm": 0.32835793405575214, + "learning_rate": 7.989276675575827e-06, + "loss": 0.7873, + "num_tokens": 43640743351.0, + "step": 10441 + }, + { + "epoch": 1.2408793820558526, + "grad_norm": 0.31886415461882595, + "learning_rate": 7.987644549830602e-06, + "loss": 0.8143, + "num_tokens": 43644932027.0, + "step": 10442 + }, + { + "epoch": 1.2409982174688057, + "grad_norm": 0.274035412203881, + "learning_rate": 7.986012535643082e-06, + "loss": 0.8118, + "num_tokens": 43649101253.0, + "step": 10443 + }, + { + "epoch": 1.2411170528817588, + "grad_norm": 0.36766948288088536, + "learning_rate": 7.984380633073712e-06, + "loss": 0.7803, + "num_tokens": 43653289170.0, + "step": 10444 + }, + { + "epoch": 1.2412358882947119, + "grad_norm": 0.29843019745885185, + "learning_rate": 7.982748842182922e-06, + "loss": 0.8384, + "num_tokens": 43657477033.0, + "step": 10445 + }, + { + "epoch": 1.241354723707665, + "grad_norm": 0.3411466820536914, + "learning_rate": 7.981117163031142e-06, + "loss": 0.7984, + "num_tokens": 43661666685.0, + "step": 10446 + }, + { + "epoch": 1.2414735591206179, + "grad_norm": 0.2907770524756731, + "learning_rate": 7.979485595678803e-06, + "loss": 0.8607, + "num_tokens": 43665832137.0, + "step": 10447 + }, + { + "epoch": 1.241592394533571, + "grad_norm": 0.2805707289032708, + "learning_rate": 7.977854140186326e-06, + "loss": 0.8411, + "num_tokens": 43669991339.0, + "step": 10448 + }, + { + "epoch": 1.241711229946524, + "grad_norm": 0.29679109653024943, + "learning_rate": 7.976222796614126e-06, + "loss": 0.8055, + "num_tokens": 43674168063.0, + "step": 10449 + }, + { + "epoch": 1.2418300653594772, + "grad_norm": 0.26272035422142603, + "learning_rate": 7.974591565022616e-06, + "loss": 0.7769, + "num_tokens": 43678356872.0, + "step": 10450 + }, + { + "epoch": 1.2419489007724303, + "grad_norm": 0.30931656388759265, + "learning_rate": 7.972960445472213e-06, + "loss": 0.8015, + "num_tokens": 43682521942.0, + "step": 10451 + }, + { + "epoch": 1.2420677361853834, + "grad_norm": 0.2768735800891425, + "learning_rate": 7.971329438023321e-06, + "loss": 0.8235, + "num_tokens": 43686684590.0, + "step": 10452 + }, + { + "epoch": 1.2421865715983362, + "grad_norm": 0.32564743850283706, + "learning_rate": 7.96969854273634e-06, + "loss": 0.8561, + "num_tokens": 43690845824.0, + "step": 10453 + }, + { + "epoch": 1.2423054070112893, + "grad_norm": 0.29898613112420147, + "learning_rate": 7.968067759671665e-06, + "loss": 0.8547, + "num_tokens": 43695034590.0, + "step": 10454 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.27057586501824116, + "learning_rate": 7.966437088889696e-06, + "loss": 0.8359, + "num_tokens": 43699221878.0, + "step": 10455 + }, + { + "epoch": 1.2425430778371955, + "grad_norm": 0.29909941695183173, + "learning_rate": 7.964806530450812e-06, + "loss": 0.8185, + "num_tokens": 43703411016.0, + "step": 10456 + }, + { + "epoch": 1.2426619132501486, + "grad_norm": 0.2806037934491409, + "learning_rate": 7.963176084415406e-06, + "loss": 0.8272, + "num_tokens": 43707597987.0, + "step": 10457 + }, + { + "epoch": 1.2427807486631015, + "grad_norm": 0.2939175682281331, + "learning_rate": 7.961545750843862e-06, + "loss": 0.8339, + "num_tokens": 43711787246.0, + "step": 10458 + }, + { + "epoch": 1.2428995840760546, + "grad_norm": 0.31418775652008774, + "learning_rate": 7.959915529796551e-06, + "loss": 0.8425, + "num_tokens": 43715976829.0, + "step": 10459 + }, + { + "epoch": 1.2430184194890077, + "grad_norm": 0.3055098090735738, + "learning_rate": 7.958285421333845e-06, + "loss": 0.7988, + "num_tokens": 43720154938.0, + "step": 10460 + }, + { + "epoch": 1.2431372549019608, + "grad_norm": 0.3106329899609702, + "learning_rate": 7.956655425516115e-06, + "loss": 0.8025, + "num_tokens": 43724342954.0, + "step": 10461 + }, + { + "epoch": 1.243256090314914, + "grad_norm": 0.28888409598668324, + "learning_rate": 7.95502554240372e-06, + "loss": 0.8245, + "num_tokens": 43728530863.0, + "step": 10462 + }, + { + "epoch": 1.243374925727867, + "grad_norm": 0.29334079122667445, + "learning_rate": 7.95339577205703e-06, + "loss": 0.8894, + "num_tokens": 43732698515.0, + "step": 10463 + }, + { + "epoch": 1.2434937611408199, + "grad_norm": 0.29922927522903714, + "learning_rate": 7.951766114536394e-06, + "loss": 0.8198, + "num_tokens": 43736882337.0, + "step": 10464 + }, + { + "epoch": 1.243612596553773, + "grad_norm": 0.3353198536142001, + "learning_rate": 7.950136569902165e-06, + "loss": 0.8132, + "num_tokens": 43741070685.0, + "step": 10465 + }, + { + "epoch": 1.243731431966726, + "grad_norm": 0.303892019739182, + "learning_rate": 7.948507138214692e-06, + "loss": 0.7417, + "num_tokens": 43745230493.0, + "step": 10466 + }, + { + "epoch": 1.2438502673796792, + "grad_norm": 0.267805220353319, + "learning_rate": 7.946877819534316e-06, + "loss": 0.8321, + "num_tokens": 43749403162.0, + "step": 10467 + }, + { + "epoch": 1.2439691027926323, + "grad_norm": 0.3126069143717168, + "learning_rate": 7.945248613921375e-06, + "loss": 0.8509, + "num_tokens": 43753555749.0, + "step": 10468 + }, + { + "epoch": 1.2440879382055852, + "grad_norm": 0.3211733198583013, + "learning_rate": 7.943619521436206e-06, + "loss": 0.8038, + "num_tokens": 43757745128.0, + "step": 10469 + }, + { + "epoch": 1.2442067736185383, + "grad_norm": 0.30322001896228434, + "learning_rate": 7.941990542139142e-06, + "loss": 0.8104, + "num_tokens": 43761934518.0, + "step": 10470 + }, + { + "epoch": 1.2443256090314914, + "grad_norm": 0.30034363520909285, + "learning_rate": 7.940361676090509e-06, + "loss": 0.8197, + "num_tokens": 43766100305.0, + "step": 10471 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 0.3129962888817891, + "learning_rate": 7.938732923350624e-06, + "loss": 0.828, + "num_tokens": 43770262185.0, + "step": 10472 + }, + { + "epoch": 1.2445632798573976, + "grad_norm": 0.31142996523006744, + "learning_rate": 7.937104283979815e-06, + "loss": 0.809, + "num_tokens": 43774448778.0, + "step": 10473 + }, + { + "epoch": 1.2446821152703507, + "grad_norm": 0.27686891225676274, + "learning_rate": 7.935475758038388e-06, + "loss": 0.8234, + "num_tokens": 43778637534.0, + "step": 10474 + }, + { + "epoch": 1.2448009506833035, + "grad_norm": 0.3117667000507203, + "learning_rate": 7.93384734558665e-06, + "loss": 0.7886, + "num_tokens": 43782827371.0, + "step": 10475 + }, + { + "epoch": 1.2449197860962566, + "grad_norm": 0.27656770771802475, + "learning_rate": 7.932219046684918e-06, + "loss": 0.8085, + "num_tokens": 43786966991.0, + "step": 10476 + }, + { + "epoch": 1.2450386215092097, + "grad_norm": 0.27433831892987637, + "learning_rate": 7.930590861393484e-06, + "loss": 0.7792, + "num_tokens": 43791132221.0, + "step": 10477 + }, + { + "epoch": 1.2451574569221628, + "grad_norm": 0.28379895193141136, + "learning_rate": 7.92896278977265e-06, + "loss": 0.8316, + "num_tokens": 43795281535.0, + "step": 10478 + }, + { + "epoch": 1.245276292335116, + "grad_norm": 0.27274651681742956, + "learning_rate": 7.927334831882707e-06, + "loss": 0.8066, + "num_tokens": 43799471701.0, + "step": 10479 + }, + { + "epoch": 1.2453951277480688, + "grad_norm": 0.30473703453514467, + "learning_rate": 7.92570698778394e-06, + "loss": 0.8242, + "num_tokens": 43803661319.0, + "step": 10480 + }, + { + "epoch": 1.245513963161022, + "grad_norm": 0.2923895736687378, + "learning_rate": 7.92407925753664e-06, + "loss": 0.8389, + "num_tokens": 43807835813.0, + "step": 10481 + }, + { + "epoch": 1.245632798573975, + "grad_norm": 0.2983378952515087, + "learning_rate": 7.922451641201088e-06, + "loss": 0.8147, + "num_tokens": 43812025397.0, + "step": 10482 + }, + { + "epoch": 1.2457516339869281, + "grad_norm": 0.3192189042457383, + "learning_rate": 7.920824138837554e-06, + "loss": 0.8111, + "num_tokens": 43816207083.0, + "step": 10483 + }, + { + "epoch": 1.2458704693998812, + "grad_norm": 0.3003680038824159, + "learning_rate": 7.919196750506317e-06, + "loss": 0.8176, + "num_tokens": 43820395658.0, + "step": 10484 + }, + { + "epoch": 1.2459893048128343, + "grad_norm": 0.32387354222609727, + "learning_rate": 7.917569476267636e-06, + "loss": 0.7898, + "num_tokens": 43824584962.0, + "step": 10485 + }, + { + "epoch": 1.2461081402257872, + "grad_norm": 0.30574818308873325, + "learning_rate": 7.915942316181783e-06, + "loss": 0.8198, + "num_tokens": 43828729360.0, + "step": 10486 + }, + { + "epoch": 1.2462269756387403, + "grad_norm": 0.2996459470998289, + "learning_rate": 7.914315270309012e-06, + "loss": 0.8346, + "num_tokens": 43832919178.0, + "step": 10487 + }, + { + "epoch": 1.2463458110516934, + "grad_norm": 0.33653339321899683, + "learning_rate": 7.912688338709578e-06, + "loss": 0.7937, + "num_tokens": 43837108427.0, + "step": 10488 + }, + { + "epoch": 1.2464646464646465, + "grad_norm": 0.3067781874361933, + "learning_rate": 7.911061521443738e-06, + "loss": 0.8234, + "num_tokens": 43841281737.0, + "step": 10489 + }, + { + "epoch": 1.2465834818775996, + "grad_norm": 0.3519923738419069, + "learning_rate": 7.909434818571728e-06, + "loss": 0.843, + "num_tokens": 43845471274.0, + "step": 10490 + }, + { + "epoch": 1.2467023172905525, + "grad_norm": 0.28942876855969635, + "learning_rate": 7.9078082301538e-06, + "loss": 0.7964, + "num_tokens": 43849660144.0, + "step": 10491 + }, + { + "epoch": 1.2468211527035056, + "grad_norm": 0.34848315234280663, + "learning_rate": 7.906181756250186e-06, + "loss": 0.8141, + "num_tokens": 43853821329.0, + "step": 10492 + }, + { + "epoch": 1.2469399881164587, + "grad_norm": 0.3111615536867955, + "learning_rate": 7.904555396921124e-06, + "loss": 0.8249, + "num_tokens": 43857967965.0, + "step": 10493 + }, + { + "epoch": 1.2470588235294118, + "grad_norm": 0.3448277344647315, + "learning_rate": 7.90292915222684e-06, + "loss": 0.7706, + "num_tokens": 43862156469.0, + "step": 10494 + }, + { + "epoch": 1.2471776589423649, + "grad_norm": 0.27900853128341147, + "learning_rate": 7.90130302222756e-06, + "loss": 0.8196, + "num_tokens": 43866332666.0, + "step": 10495 + }, + { + "epoch": 1.247296494355318, + "grad_norm": 0.34047939228957025, + "learning_rate": 7.899677006983507e-06, + "loss": 0.8248, + "num_tokens": 43870522377.0, + "step": 10496 + }, + { + "epoch": 1.2474153297682709, + "grad_norm": 0.26681722784019085, + "learning_rate": 7.898051106554896e-06, + "loss": 0.8066, + "num_tokens": 43874710841.0, + "step": 10497 + }, + { + "epoch": 1.247534165181224, + "grad_norm": 0.2918522360050032, + "learning_rate": 7.896425321001944e-06, + "loss": 0.8437, + "num_tokens": 43878899974.0, + "step": 10498 + }, + { + "epoch": 1.247653000594177, + "grad_norm": 0.2726907634494045, + "learning_rate": 7.894799650384855e-06, + "loss": 0.8126, + "num_tokens": 43883088440.0, + "step": 10499 + }, + { + "epoch": 1.2477718360071302, + "grad_norm": 0.28397772184328757, + "learning_rate": 7.893174094763832e-06, + "loss": 0.786, + "num_tokens": 43887274693.0, + "step": 10500 + }, + { + "epoch": 1.2478906714200833, + "grad_norm": 0.28349684880562936, + "learning_rate": 7.891548654199075e-06, + "loss": 0.8211, + "num_tokens": 43891462105.0, + "step": 10501 + }, + { + "epoch": 1.2480095068330361, + "grad_norm": 0.27491876571853924, + "learning_rate": 7.889923328750783e-06, + "loss": 0.7965, + "num_tokens": 43895637634.0, + "step": 10502 + }, + { + "epoch": 1.2481283422459892, + "grad_norm": 0.28538439068511223, + "learning_rate": 7.888298118479147e-06, + "loss": 0.8203, + "num_tokens": 43899827082.0, + "step": 10503 + }, + { + "epoch": 1.2482471776589423, + "grad_norm": 0.27785553240612393, + "learning_rate": 7.88667302344435e-06, + "loss": 0.8381, + "num_tokens": 43904017270.0, + "step": 10504 + }, + { + "epoch": 1.2483660130718954, + "grad_norm": 0.308102114802526, + "learning_rate": 7.88504804370658e-06, + "loss": 0.8246, + "num_tokens": 43908200387.0, + "step": 10505 + }, + { + "epoch": 1.2484848484848485, + "grad_norm": 0.3255880021728508, + "learning_rate": 7.883423179326014e-06, + "loss": 0.8109, + "num_tokens": 43912378475.0, + "step": 10506 + }, + { + "epoch": 1.2486036838978016, + "grad_norm": 0.2741405859634821, + "learning_rate": 7.88179843036282e-06, + "loss": 0.8341, + "num_tokens": 43916565970.0, + "step": 10507 + }, + { + "epoch": 1.2487225193107545, + "grad_norm": 0.2958104565200392, + "learning_rate": 7.880173796877178e-06, + "loss": 0.8078, + "num_tokens": 43920735487.0, + "step": 10508 + }, + { + "epoch": 1.2488413547237076, + "grad_norm": 0.29657326409319434, + "learning_rate": 7.878549278929244e-06, + "loss": 0.822, + "num_tokens": 43924924218.0, + "step": 10509 + }, + { + "epoch": 1.2489601901366607, + "grad_norm": 0.28746213767379913, + "learning_rate": 7.876924876579186e-06, + "loss": 0.8161, + "num_tokens": 43929112063.0, + "step": 10510 + }, + { + "epoch": 1.2490790255496138, + "grad_norm": 0.2761963687715314, + "learning_rate": 7.875300589887162e-06, + "loss": 0.8354, + "num_tokens": 43933271264.0, + "step": 10511 + }, + { + "epoch": 1.249197860962567, + "grad_norm": 0.2852471175221323, + "learning_rate": 7.87367641891332e-06, + "loss": 0.8196, + "num_tokens": 43937460037.0, + "step": 10512 + }, + { + "epoch": 1.2493166963755198, + "grad_norm": 0.2939891485656742, + "learning_rate": 7.872052363717808e-06, + "loss": 0.7653, + "num_tokens": 43941648225.0, + "step": 10513 + }, + { + "epoch": 1.2494355317884729, + "grad_norm": 0.31916804605863525, + "learning_rate": 7.870428424360774e-06, + "loss": 0.8372, + "num_tokens": 43945812163.0, + "step": 10514 + }, + { + "epoch": 1.249554367201426, + "grad_norm": 0.3113766451388345, + "learning_rate": 7.868804600902358e-06, + "loss": 0.8231, + "num_tokens": 43949999493.0, + "step": 10515 + }, + { + "epoch": 1.249673202614379, + "grad_norm": 0.2748115890723046, + "learning_rate": 7.867180893402697e-06, + "loss": 0.7972, + "num_tokens": 43954189064.0, + "step": 10516 + }, + { + "epoch": 1.2497920380273322, + "grad_norm": 0.3030248154727197, + "learning_rate": 7.865557301921918e-06, + "loss": 0.8098, + "num_tokens": 43958378473.0, + "step": 10517 + }, + { + "epoch": 1.2499108734402853, + "grad_norm": 0.29362979218825774, + "learning_rate": 7.863933826520148e-06, + "loss": 0.8132, + "num_tokens": 43962546531.0, + "step": 10518 + }, + { + "epoch": 1.2500297088532384, + "grad_norm": 0.2937195367356074, + "learning_rate": 7.862310467257514e-06, + "loss": 0.7938, + "num_tokens": 43966735118.0, + "step": 10519 + }, + { + "epoch": 1.2501485442661913, + "grad_norm": 0.29906325340910084, + "learning_rate": 7.860687224194127e-06, + "loss": 0.7945, + "num_tokens": 43970923447.0, + "step": 10520 + }, + { + "epoch": 1.2502673796791444, + "grad_norm": 0.34609420897629695, + "learning_rate": 7.859064097390112e-06, + "loss": 0.7825, + "num_tokens": 43975112801.0, + "step": 10521 + }, + { + "epoch": 1.2503862150920975, + "grad_norm": 0.29003994337224503, + "learning_rate": 7.857441086905571e-06, + "loss": 0.8407, + "num_tokens": 43979303362.0, + "step": 10522 + }, + { + "epoch": 1.2505050505050506, + "grad_norm": 0.2656418684907065, + "learning_rate": 7.855818192800612e-06, + "loss": 0.8303, + "num_tokens": 43983493019.0, + "step": 10523 + }, + { + "epoch": 1.2506238859180034, + "grad_norm": 0.2894302271545649, + "learning_rate": 7.85419541513534e-06, + "loss": 0.7775, + "num_tokens": 43987665430.0, + "step": 10524 + }, + { + "epoch": 1.2507427213309565, + "grad_norm": 0.2626202740977446, + "learning_rate": 7.852572753969842e-06, + "loss": 0.7647, + "num_tokens": 43991854192.0, + "step": 10525 + }, + { + "epoch": 1.2508615567439096, + "grad_norm": 0.3022065763621353, + "learning_rate": 7.850950209364218e-06, + "loss": 0.8042, + "num_tokens": 43996043806.0, + "step": 10526 + }, + { + "epoch": 1.2509803921568627, + "grad_norm": 0.2798239740909154, + "learning_rate": 7.849327781378555e-06, + "loss": 0.7923, + "num_tokens": 44000227998.0, + "step": 10527 + }, + { + "epoch": 1.2510992275698158, + "grad_norm": 0.30966703490994285, + "learning_rate": 7.84770547007294e-06, + "loss": 0.82, + "num_tokens": 44004387347.0, + "step": 10528 + }, + { + "epoch": 1.251218062982769, + "grad_norm": 0.32640964656960614, + "learning_rate": 7.846083275507449e-06, + "loss": 0.8594, + "num_tokens": 44008553131.0, + "step": 10529 + }, + { + "epoch": 1.251336898395722, + "grad_norm": 0.3193251716718781, + "learning_rate": 7.844461197742155e-06, + "loss": 0.8094, + "num_tokens": 44012717881.0, + "step": 10530 + }, + { + "epoch": 1.251455733808675, + "grad_norm": 0.29136942630807966, + "learning_rate": 7.842839236837135e-06, + "loss": 0.8069, + "num_tokens": 44016905281.0, + "step": 10531 + }, + { + "epoch": 1.251574569221628, + "grad_norm": 0.3325232725695626, + "learning_rate": 7.841217392852446e-06, + "loss": 0.8342, + "num_tokens": 44021078723.0, + "step": 10532 + }, + { + "epoch": 1.2516934046345811, + "grad_norm": 0.30767107066719723, + "learning_rate": 7.839595665848163e-06, + "loss": 0.8449, + "num_tokens": 44025266220.0, + "step": 10533 + }, + { + "epoch": 1.2518122400475342, + "grad_norm": 0.28802647716571056, + "learning_rate": 7.83797405588434e-06, + "loss": 0.8263, + "num_tokens": 44029455978.0, + "step": 10534 + }, + { + "epoch": 1.251931075460487, + "grad_norm": 0.29909442428519467, + "learning_rate": 7.836352563021025e-06, + "loss": 0.8015, + "num_tokens": 44033645198.0, + "step": 10535 + }, + { + "epoch": 1.2520499108734402, + "grad_norm": 0.32556728944369095, + "learning_rate": 7.834731187318271e-06, + "loss": 0.7765, + "num_tokens": 44037828211.0, + "step": 10536 + }, + { + "epoch": 1.2521687462863933, + "grad_norm": 0.2850790234006232, + "learning_rate": 7.833109928836127e-06, + "loss": 0.7777, + "num_tokens": 44042007133.0, + "step": 10537 + }, + { + "epoch": 1.2522875816993464, + "grad_norm": 0.29735270925654794, + "learning_rate": 7.831488787634626e-06, + "loss": 0.7844, + "num_tokens": 44046171208.0, + "step": 10538 + }, + { + "epoch": 1.2524064171122995, + "grad_norm": 0.3598576520203253, + "learning_rate": 7.829867763773809e-06, + "loss": 0.7962, + "num_tokens": 44050358975.0, + "step": 10539 + }, + { + "epoch": 1.2525252525252526, + "grad_norm": 0.2952116097309802, + "learning_rate": 7.828246857313706e-06, + "loss": 0.8081, + "num_tokens": 44054546885.0, + "step": 10540 + }, + { + "epoch": 1.2526440879382057, + "grad_norm": 0.3681480646419562, + "learning_rate": 7.826626068314347e-06, + "loss": 0.8592, + "num_tokens": 44058726561.0, + "step": 10541 + }, + { + "epoch": 1.2527629233511586, + "grad_norm": 0.29428301750020125, + "learning_rate": 7.825005396835757e-06, + "loss": 0.8564, + "num_tokens": 44062912951.0, + "step": 10542 + }, + { + "epoch": 1.2528817587641117, + "grad_norm": 0.3653296232155964, + "learning_rate": 7.82338484293795e-06, + "loss": 0.7743, + "num_tokens": 44067101553.0, + "step": 10543 + }, + { + "epoch": 1.2530005941770648, + "grad_norm": 0.302469930991102, + "learning_rate": 7.82176440668094e-06, + "loss": 0.7932, + "num_tokens": 44071278760.0, + "step": 10544 + }, + { + "epoch": 1.2531194295900179, + "grad_norm": 0.35893848071471346, + "learning_rate": 7.820144088124742e-06, + "loss": 0.8253, + "num_tokens": 44075467310.0, + "step": 10545 + }, + { + "epoch": 1.2532382650029708, + "grad_norm": 0.33986637443967305, + "learning_rate": 7.818523887329358e-06, + "loss": 0.7785, + "num_tokens": 44079657661.0, + "step": 10546 + }, + { + "epoch": 1.2533571004159239, + "grad_norm": 0.3348129797866619, + "learning_rate": 7.816903804354797e-06, + "loss": 0.8194, + "num_tokens": 44083846944.0, + "step": 10547 + }, + { + "epoch": 1.253475935828877, + "grad_norm": 0.3596008601616083, + "learning_rate": 7.815283839261045e-06, + "loss": 0.8403, + "num_tokens": 44088008866.0, + "step": 10548 + }, + { + "epoch": 1.25359477124183, + "grad_norm": 0.3175067486077064, + "learning_rate": 7.813663992108102e-06, + "loss": 0.7949, + "num_tokens": 44092198047.0, + "step": 10549 + }, + { + "epoch": 1.2537136066547832, + "grad_norm": 0.38484252145945136, + "learning_rate": 7.812044262955955e-06, + "loss": 0.8014, + "num_tokens": 44096388661.0, + "step": 10550 + }, + { + "epoch": 1.2538324420677363, + "grad_norm": 0.2870517519337562, + "learning_rate": 7.810424651864584e-06, + "loss": 0.7842, + "num_tokens": 44100578543.0, + "step": 10551 + }, + { + "epoch": 1.2539512774806894, + "grad_norm": 0.33375643663841054, + "learning_rate": 7.808805158893975e-06, + "loss": 0.7859, + "num_tokens": 44104768697.0, + "step": 10552 + }, + { + "epoch": 1.2540701128936422, + "grad_norm": 0.2843664285189306, + "learning_rate": 7.807185784104103e-06, + "loss": 0.8205, + "num_tokens": 44108956786.0, + "step": 10553 + }, + { + "epoch": 1.2541889483065953, + "grad_norm": 0.3238607225279026, + "learning_rate": 7.805566527554932e-06, + "loss": 0.8255, + "num_tokens": 44113146431.0, + "step": 10554 + }, + { + "epoch": 1.2543077837195484, + "grad_norm": 0.3331921352219341, + "learning_rate": 7.80394738930644e-06, + "loss": 0.8254, + "num_tokens": 44117336108.0, + "step": 10555 + }, + { + "epoch": 1.2544266191325015, + "grad_norm": 0.28802780135557327, + "learning_rate": 7.802328369418576e-06, + "loss": 0.7981, + "num_tokens": 44121523115.0, + "step": 10556 + }, + { + "epoch": 1.2545454545454544, + "grad_norm": 0.3089088293734021, + "learning_rate": 7.800709467951306e-06, + "loss": 0.7896, + "num_tokens": 44125713973.0, + "step": 10557 + }, + { + "epoch": 1.2546642899584075, + "grad_norm": 0.2766807032573823, + "learning_rate": 7.799090684964581e-06, + "loss": 0.8046, + "num_tokens": 44129904293.0, + "step": 10558 + }, + { + "epoch": 1.2547831253713606, + "grad_norm": 0.2963566373301252, + "learning_rate": 7.797472020518351e-06, + "loss": 0.774, + "num_tokens": 44134088572.0, + "step": 10559 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 0.3193231606561135, + "learning_rate": 7.795853474672562e-06, + "loss": 0.7979, + "num_tokens": 44138260500.0, + "step": 10560 + }, + { + "epoch": 1.2550207961972668, + "grad_norm": 0.29701246493146843, + "learning_rate": 7.794235047487149e-06, + "loss": 0.8131, + "num_tokens": 44142449705.0, + "step": 10561 + }, + { + "epoch": 1.25513963161022, + "grad_norm": 0.3162807823162568, + "learning_rate": 7.792616739022051e-06, + "loss": 0.7823, + "num_tokens": 44146618478.0, + "step": 10562 + }, + { + "epoch": 1.255258467023173, + "grad_norm": 0.2711435935431615, + "learning_rate": 7.790998549337206e-06, + "loss": 0.7951, + "num_tokens": 44150807516.0, + "step": 10563 + }, + { + "epoch": 1.2553773024361259, + "grad_norm": 0.2786085752376623, + "learning_rate": 7.789380478492526e-06, + "loss": 0.8143, + "num_tokens": 44154941891.0, + "step": 10564 + }, + { + "epoch": 1.255496137849079, + "grad_norm": 0.33023187695193734, + "learning_rate": 7.787762526547946e-06, + "loss": 0.8004, + "num_tokens": 44159131355.0, + "step": 10565 + }, + { + "epoch": 1.255614973262032, + "grad_norm": 0.2814353101977846, + "learning_rate": 7.78614469356338e-06, + "loss": 0.7806, + "num_tokens": 44163299257.0, + "step": 10566 + }, + { + "epoch": 1.2557338086749852, + "grad_norm": 0.3063030771275933, + "learning_rate": 7.784526979598742e-06, + "loss": 0.7933, + "num_tokens": 44167458676.0, + "step": 10567 + }, + { + "epoch": 1.2558526440879383, + "grad_norm": 0.27704634359794406, + "learning_rate": 7.782909384713946e-06, + "loss": 0.7706, + "num_tokens": 44171648664.0, + "step": 10568 + }, + { + "epoch": 1.2559714795008912, + "grad_norm": 0.2949692742182114, + "learning_rate": 7.781291908968889e-06, + "loss": 0.807, + "num_tokens": 44175837140.0, + "step": 10569 + }, + { + "epoch": 1.2560903149138443, + "grad_norm": 0.2779947000885379, + "learning_rate": 7.779674552423475e-06, + "loss": 0.8025, + "num_tokens": 44179991062.0, + "step": 10570 + }, + { + "epoch": 1.2562091503267974, + "grad_norm": 0.3137950964381848, + "learning_rate": 7.778057315137597e-06, + "loss": 0.7828, + "num_tokens": 44184181004.0, + "step": 10571 + }, + { + "epoch": 1.2563279857397505, + "grad_norm": 0.3122282701660975, + "learning_rate": 7.776440197171156e-06, + "loss": 0.821, + "num_tokens": 44188337601.0, + "step": 10572 + }, + { + "epoch": 1.2564468211527036, + "grad_norm": 0.27796861537048095, + "learning_rate": 7.774823198584034e-06, + "loss": 0.8368, + "num_tokens": 44192525760.0, + "step": 10573 + }, + { + "epoch": 1.2565656565656567, + "grad_norm": 0.279808622276882, + "learning_rate": 7.773206319436111e-06, + "loss": 0.845, + "num_tokens": 44196686794.0, + "step": 10574 + }, + { + "epoch": 1.2566844919786098, + "grad_norm": 0.2899549410420011, + "learning_rate": 7.771589559787266e-06, + "loss": 0.7951, + "num_tokens": 44200876467.0, + "step": 10575 + }, + { + "epoch": 1.2568033273915626, + "grad_norm": 0.30009890286433244, + "learning_rate": 7.769972919697382e-06, + "loss": 0.8366, + "num_tokens": 44205064102.0, + "step": 10576 + }, + { + "epoch": 1.2569221628045157, + "grad_norm": 0.3078527522822879, + "learning_rate": 7.768356399226313e-06, + "loss": 0.7802, + "num_tokens": 44209254155.0, + "step": 10577 + }, + { + "epoch": 1.2570409982174688, + "grad_norm": 0.2710084881574663, + "learning_rate": 7.766739998433939e-06, + "loss": 0.8284, + "num_tokens": 44213443332.0, + "step": 10578 + }, + { + "epoch": 1.257159833630422, + "grad_norm": 0.3018810976880809, + "learning_rate": 7.765123717380112e-06, + "loss": 0.8368, + "num_tokens": 44217603199.0, + "step": 10579 + }, + { + "epoch": 1.2572786690433748, + "grad_norm": 0.3037048836394533, + "learning_rate": 7.763507556124693e-06, + "loss": 0.7802, + "num_tokens": 44221773270.0, + "step": 10580 + }, + { + "epoch": 1.257397504456328, + "grad_norm": 0.3007457312008543, + "learning_rate": 7.76189151472753e-06, + "loss": 0.8156, + "num_tokens": 44225962460.0, + "step": 10581 + }, + { + "epoch": 1.257516339869281, + "grad_norm": 0.31134775032161316, + "learning_rate": 7.760275593248477e-06, + "loss": 0.8128, + "num_tokens": 44230150875.0, + "step": 10582 + }, + { + "epoch": 1.2576351752822341, + "grad_norm": 0.32515768121975935, + "learning_rate": 7.758659791747368e-06, + "loss": 0.7957, + "num_tokens": 44234340541.0, + "step": 10583 + }, + { + "epoch": 1.2577540106951872, + "grad_norm": 0.27494318638817306, + "learning_rate": 7.757044110284046e-06, + "loss": 0.8184, + "num_tokens": 44238528865.0, + "step": 10584 + }, + { + "epoch": 1.2578728461081403, + "grad_norm": 0.3132075994229651, + "learning_rate": 7.755428548918346e-06, + "loss": 0.7955, + "num_tokens": 44242717801.0, + "step": 10585 + }, + { + "epoch": 1.2579916815210934, + "grad_norm": 0.28337499407488365, + "learning_rate": 7.7538131077101e-06, + "loss": 0.7989, + "num_tokens": 44246907207.0, + "step": 10586 + }, + { + "epoch": 1.2581105169340463, + "grad_norm": 0.3043289038826393, + "learning_rate": 7.752197786719126e-06, + "loss": 0.8236, + "num_tokens": 44251079685.0, + "step": 10587 + }, + { + "epoch": 1.2582293523469994, + "grad_norm": 0.30570422284862064, + "learning_rate": 7.750582586005253e-06, + "loss": 0.8202, + "num_tokens": 44255248748.0, + "step": 10588 + }, + { + "epoch": 1.2583481877599525, + "grad_norm": 0.27642064717697495, + "learning_rate": 7.748967505628293e-06, + "loss": 0.8202, + "num_tokens": 44259437814.0, + "step": 10589 + }, + { + "epoch": 1.2584670231729056, + "grad_norm": 0.32158476057325613, + "learning_rate": 7.747352545648052e-06, + "loss": 0.8319, + "num_tokens": 44263614675.0, + "step": 10590 + }, + { + "epoch": 1.2585858585858585, + "grad_norm": 0.277102804289704, + "learning_rate": 7.74573770612435e-06, + "loss": 0.8422, + "num_tokens": 44267803003.0, + "step": 10591 + }, + { + "epoch": 1.2587046939988116, + "grad_norm": 0.3192672323350113, + "learning_rate": 7.744122987116982e-06, + "loss": 0.8197, + "num_tokens": 44271992219.0, + "step": 10592 + }, + { + "epoch": 1.2588235294117647, + "grad_norm": 0.29951028837400856, + "learning_rate": 7.742508388685748e-06, + "loss": 0.7943, + "num_tokens": 44276144359.0, + "step": 10593 + }, + { + "epoch": 1.2589423648247178, + "grad_norm": 0.31041829424453077, + "learning_rate": 7.740893910890444e-06, + "loss": 0.81, + "num_tokens": 44280334097.0, + "step": 10594 + }, + { + "epoch": 1.2590612002376709, + "grad_norm": 0.3097371587893595, + "learning_rate": 7.739279553790857e-06, + "loss": 0.8344, + "num_tokens": 44284473791.0, + "step": 10595 + }, + { + "epoch": 1.259180035650624, + "grad_norm": 0.2943053268611412, + "learning_rate": 7.737665317446767e-06, + "loss": 0.8176, + "num_tokens": 44288630008.0, + "step": 10596 + }, + { + "epoch": 1.259298871063577, + "grad_norm": 0.35730935906357786, + "learning_rate": 7.736051201917968e-06, + "loss": 0.8245, + "num_tokens": 44292799585.0, + "step": 10597 + }, + { + "epoch": 1.25941770647653, + "grad_norm": 0.31339354054554375, + "learning_rate": 7.734437207264225e-06, + "loss": 0.8051, + "num_tokens": 44296988624.0, + "step": 10598 + }, + { + "epoch": 1.259536541889483, + "grad_norm": 0.3082305536124056, + "learning_rate": 7.732823333545316e-06, + "loss": 0.8624, + "num_tokens": 44301176159.0, + "step": 10599 + }, + { + "epoch": 1.2596553773024362, + "grad_norm": 0.2824647278159116, + "learning_rate": 7.731209580821005e-06, + "loss": 0.774, + "num_tokens": 44305366038.0, + "step": 10600 + }, + { + "epoch": 1.2597742127153893, + "grad_norm": 0.3438991616360608, + "learning_rate": 7.729595949151054e-06, + "loss": 0.7867, + "num_tokens": 44309528061.0, + "step": 10601 + }, + { + "epoch": 1.2598930481283421, + "grad_norm": 0.29423817432593863, + "learning_rate": 7.72798243859522e-06, + "loss": 0.7811, + "num_tokens": 44313715702.0, + "step": 10602 + }, + { + "epoch": 1.2600118835412952, + "grad_norm": 0.3181861336804332, + "learning_rate": 7.726369049213263e-06, + "loss": 0.8321, + "num_tokens": 44317905296.0, + "step": 10603 + }, + { + "epoch": 1.2601307189542483, + "grad_norm": 0.31859843890672884, + "learning_rate": 7.72475578106493e-06, + "loss": 0.8181, + "num_tokens": 44322090278.0, + "step": 10604 + }, + { + "epoch": 1.2602495543672014, + "grad_norm": 0.3313500855151143, + "learning_rate": 7.723142634209962e-06, + "loss": 0.8111, + "num_tokens": 44326281699.0, + "step": 10605 + }, + { + "epoch": 1.2603683897801545, + "grad_norm": 0.299446794621455, + "learning_rate": 7.721529608708101e-06, + "loss": 0.7976, + "num_tokens": 44330445939.0, + "step": 10606 + }, + { + "epoch": 1.2604872251931076, + "grad_norm": 0.37978538775305787, + "learning_rate": 7.719916704619085e-06, + "loss": 0.8244, + "num_tokens": 44334635917.0, + "step": 10607 + }, + { + "epoch": 1.2606060606060607, + "grad_norm": 0.3031543720180507, + "learning_rate": 7.718303922002641e-06, + "loss": 0.8195, + "num_tokens": 44338824037.0, + "step": 10608 + }, + { + "epoch": 1.2607248960190136, + "grad_norm": 0.35904737962950906, + "learning_rate": 7.716691260918502e-06, + "loss": 0.8304, + "num_tokens": 44342935653.0, + "step": 10609 + }, + { + "epoch": 1.2608437314319667, + "grad_norm": 0.35562307641904617, + "learning_rate": 7.715078721426384e-06, + "loss": 0.7959, + "num_tokens": 44347117140.0, + "step": 10610 + }, + { + "epoch": 1.2609625668449198, + "grad_norm": 0.28458765021777355, + "learning_rate": 7.713466303586006e-06, + "loss": 0.8054, + "num_tokens": 44351305906.0, + "step": 10611 + }, + { + "epoch": 1.261081402257873, + "grad_norm": 0.39021269641882883, + "learning_rate": 7.711854007457084e-06, + "loss": 0.8146, + "num_tokens": 44355494727.0, + "step": 10612 + }, + { + "epoch": 1.2612002376708258, + "grad_norm": 0.29349534371130326, + "learning_rate": 7.71024183309933e-06, + "loss": 0.7738, + "num_tokens": 44359683352.0, + "step": 10613 + }, + { + "epoch": 1.2613190730837789, + "grad_norm": 0.32969934182321886, + "learning_rate": 7.708629780572439e-06, + "loss": 0.8273, + "num_tokens": 44363845580.0, + "step": 10614 + }, + { + "epoch": 1.261437908496732, + "grad_norm": 0.3275489920286257, + "learning_rate": 7.707017849936116e-06, + "loss": 0.7735, + "num_tokens": 44368035987.0, + "step": 10615 + }, + { + "epoch": 1.261556743909685, + "grad_norm": 0.3282315320164309, + "learning_rate": 7.705406041250056e-06, + "loss": 0.8271, + "num_tokens": 44372224074.0, + "step": 10616 + }, + { + "epoch": 1.2616755793226382, + "grad_norm": 0.36738825835250866, + "learning_rate": 7.703794354573952e-06, + "loss": 0.827, + "num_tokens": 44376412935.0, + "step": 10617 + }, + { + "epoch": 1.2617944147355913, + "grad_norm": 0.3313513354589322, + "learning_rate": 7.702182789967487e-06, + "loss": 0.8298, + "num_tokens": 44380581639.0, + "step": 10618 + }, + { + "epoch": 1.2619132501485444, + "grad_norm": 0.313318237094592, + "learning_rate": 7.700571347490341e-06, + "loss": 0.8118, + "num_tokens": 44384768382.0, + "step": 10619 + }, + { + "epoch": 1.2620320855614973, + "grad_norm": 0.3902361305397495, + "learning_rate": 7.698960027202196e-06, + "loss": 0.7927, + "num_tokens": 44388933544.0, + "step": 10620 + }, + { + "epoch": 1.2621509209744504, + "grad_norm": 0.30344070427608516, + "learning_rate": 7.69734882916272e-06, + "loss": 0.7949, + "num_tokens": 44393101592.0, + "step": 10621 + }, + { + "epoch": 1.2622697563874035, + "grad_norm": 0.37383212900614327, + "learning_rate": 7.695737753431588e-06, + "loss": 0.8033, + "num_tokens": 44397281501.0, + "step": 10622 + }, + { + "epoch": 1.2623885918003566, + "grad_norm": 0.3263899843582367, + "learning_rate": 7.694126800068456e-06, + "loss": 0.8009, + "num_tokens": 44401461961.0, + "step": 10623 + }, + { + "epoch": 1.2625074272133094, + "grad_norm": 0.3579044020408087, + "learning_rate": 7.692515969132986e-06, + "loss": 0.798, + "num_tokens": 44405651820.0, + "step": 10624 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 0.3947545792550072, + "learning_rate": 7.690905260684832e-06, + "loss": 0.7812, + "num_tokens": 44409814125.0, + "step": 10625 + }, + { + "epoch": 1.2627450980392156, + "grad_norm": 0.33345356236351525, + "learning_rate": 7.689294674783647e-06, + "loss": 0.8467, + "num_tokens": 44414003294.0, + "step": 10626 + }, + { + "epoch": 1.2628639334521687, + "grad_norm": 0.44797881377431265, + "learning_rate": 7.68768421148907e-06, + "loss": 0.7812, + "num_tokens": 44418191270.0, + "step": 10627 + }, + { + "epoch": 1.2629827688651218, + "grad_norm": 0.3105146693284369, + "learning_rate": 7.686073870860747e-06, + "loss": 0.8295, + "num_tokens": 44422331847.0, + "step": 10628 + }, + { + "epoch": 1.263101604278075, + "grad_norm": 0.478172918440864, + "learning_rate": 7.684463652958313e-06, + "loss": 0.825, + "num_tokens": 44426517684.0, + "step": 10629 + }, + { + "epoch": 1.263220439691028, + "grad_norm": 0.3360102311753309, + "learning_rate": 7.682853557841402e-06, + "loss": 0.8268, + "num_tokens": 44430705774.0, + "step": 10630 + }, + { + "epoch": 1.263339275103981, + "grad_norm": 0.5334963986379561, + "learning_rate": 7.68124358556964e-06, + "loss": 0.8289, + "num_tokens": 44434867037.0, + "step": 10631 + }, + { + "epoch": 1.263458110516934, + "grad_norm": 0.38235625838187365, + "learning_rate": 7.679633736202647e-06, + "loss": 0.8218, + "num_tokens": 44439045391.0, + "step": 10632 + }, + { + "epoch": 1.2635769459298871, + "grad_norm": 0.5573268267607614, + "learning_rate": 7.678024009800045e-06, + "loss": 0.7888, + "num_tokens": 44443233839.0, + "step": 10633 + }, + { + "epoch": 1.2636957813428402, + "grad_norm": 0.5069204402633994, + "learning_rate": 7.676414406421441e-06, + "loss": 0.8423, + "num_tokens": 44447421569.0, + "step": 10634 + }, + { + "epoch": 1.263814616755793, + "grad_norm": 0.4434444556996922, + "learning_rate": 7.674804926126452e-06, + "loss": 0.842, + "num_tokens": 44451609014.0, + "step": 10635 + }, + { + "epoch": 1.2639334521687462, + "grad_norm": 0.482669595981708, + "learning_rate": 7.673195568974683e-06, + "loss": 0.7981, + "num_tokens": 44455797895.0, + "step": 10636 + }, + { + "epoch": 1.2640522875816993, + "grad_norm": 0.3675269232020716, + "learning_rate": 7.671586335025725e-06, + "loss": 0.8113, + "num_tokens": 44459954077.0, + "step": 10637 + }, + { + "epoch": 1.2641711229946524, + "grad_norm": 0.3930964397373162, + "learning_rate": 7.66997722433918e-06, + "loss": 0.787, + "num_tokens": 44464112787.0, + "step": 10638 + }, + { + "epoch": 1.2642899584076055, + "grad_norm": 0.4240672638966924, + "learning_rate": 7.668368236974642e-06, + "loss": 0.7948, + "num_tokens": 44468301673.0, + "step": 10639 + }, + { + "epoch": 1.2644087938205586, + "grad_norm": 0.33537698256573995, + "learning_rate": 7.666759372991684e-06, + "loss": 0.8174, + "num_tokens": 44472491265.0, + "step": 10640 + }, + { + "epoch": 1.2645276292335117, + "grad_norm": 0.5368839213238518, + "learning_rate": 7.665150632449902e-06, + "loss": 0.8321, + "num_tokens": 44476678675.0, + "step": 10641 + }, + { + "epoch": 1.2646464646464646, + "grad_norm": 0.4146294924728063, + "learning_rate": 7.663542015408866e-06, + "loss": 0.8492, + "num_tokens": 44480867458.0, + "step": 10642 + }, + { + "epoch": 1.2647653000594177, + "grad_norm": 0.5089227392186919, + "learning_rate": 7.661933521928152e-06, + "loss": 0.8426, + "num_tokens": 44485047452.0, + "step": 10643 + }, + { + "epoch": 1.2648841354723708, + "grad_norm": 0.47217440943616557, + "learning_rate": 7.660325152067325e-06, + "loss": 0.8416, + "num_tokens": 44489236363.0, + "step": 10644 + }, + { + "epoch": 1.2650029708853239, + "grad_norm": 0.4134998116693169, + "learning_rate": 7.65871690588595e-06, + "loss": 0.7729, + "num_tokens": 44493422515.0, + "step": 10645 + }, + { + "epoch": 1.2651218062982768, + "grad_norm": 0.39776120374674917, + "learning_rate": 7.65710878344358e-06, + "loss": 0.7793, + "num_tokens": 44497553538.0, + "step": 10646 + }, + { + "epoch": 1.2652406417112299, + "grad_norm": 0.4638187037779051, + "learning_rate": 7.655500784799778e-06, + "loss": 0.8162, + "num_tokens": 44501743612.0, + "step": 10647 + }, + { + "epoch": 1.265359477124183, + "grad_norm": 0.3830022610783014, + "learning_rate": 7.653892910014088e-06, + "loss": 0.8131, + "num_tokens": 44505932638.0, + "step": 10648 + }, + { + "epoch": 1.265478312537136, + "grad_norm": 0.4984486033414187, + "learning_rate": 7.652285159146065e-06, + "loss": 0.806, + "num_tokens": 44510123591.0, + "step": 10649 + }, + { + "epoch": 1.2655971479500892, + "grad_norm": 0.4683853227421515, + "learning_rate": 7.650677532255233e-06, + "loss": 0.8143, + "num_tokens": 44514287221.0, + "step": 10650 + }, + { + "epoch": 1.2657159833630423, + "grad_norm": 0.4473677341368995, + "learning_rate": 7.64907002940114e-06, + "loss": 0.8102, + "num_tokens": 44518475170.0, + "step": 10651 + }, + { + "epoch": 1.2658348187759954, + "grad_norm": 0.4210845498108774, + "learning_rate": 7.647462650643314e-06, + "loss": 0.8107, + "num_tokens": 44522640691.0, + "step": 10652 + }, + { + "epoch": 1.2659536541889482, + "grad_norm": 0.4455434938243489, + "learning_rate": 7.645855396041275e-06, + "loss": 0.7834, + "num_tokens": 44526831018.0, + "step": 10653 + }, + { + "epoch": 1.2660724896019013, + "grad_norm": 0.3519569482898244, + "learning_rate": 7.644248265654559e-06, + "loss": 0.8067, + "num_tokens": 44531021582.0, + "step": 10654 + }, + { + "epoch": 1.2661913250148544, + "grad_norm": 0.5053840183210779, + "learning_rate": 7.642641259542673e-06, + "loss": 0.7934, + "num_tokens": 44535181375.0, + "step": 10655 + }, + { + "epoch": 1.2663101604278075, + "grad_norm": 0.37758952043171584, + "learning_rate": 7.641034377765132e-06, + "loss": 0.8059, + "num_tokens": 44539342216.0, + "step": 10656 + }, + { + "epoch": 1.2664289958407606, + "grad_norm": 0.52415322886966, + "learning_rate": 7.639427620381448e-06, + "loss": 0.812, + "num_tokens": 44543525105.0, + "step": 10657 + }, + { + "epoch": 1.2665478312537135, + "grad_norm": 0.4374531112405275, + "learning_rate": 7.63782098745112e-06, + "loss": 0.826, + "num_tokens": 44547714513.0, + "step": 10658 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.47050948727788494, + "learning_rate": 7.636214479033645e-06, + "loss": 0.8102, + "num_tokens": 44551903668.0, + "step": 10659 + }, + { + "epoch": 1.2667855020796197, + "grad_norm": 0.4747192875394144, + "learning_rate": 7.63460809518852e-06, + "loss": 0.8491, + "num_tokens": 44556093073.0, + "step": 10660 + }, + { + "epoch": 1.2669043374925728, + "grad_norm": 0.4645885980940766, + "learning_rate": 7.633001835975243e-06, + "loss": 0.831, + "num_tokens": 44560234909.0, + "step": 10661 + }, + { + "epoch": 1.267023172905526, + "grad_norm": 0.43506807565631683, + "learning_rate": 7.631395701453289e-06, + "loss": 0.8256, + "num_tokens": 44564403225.0, + "step": 10662 + }, + { + "epoch": 1.267142008318479, + "grad_norm": 0.4397059592932844, + "learning_rate": 7.629789691682142e-06, + "loss": 0.8179, + "num_tokens": 44568571929.0, + "step": 10663 + }, + { + "epoch": 1.2672608437314319, + "grad_norm": 0.40949030869654224, + "learning_rate": 7.6281838067212765e-06, + "loss": 0.8172, + "num_tokens": 44572759246.0, + "step": 10664 + }, + { + "epoch": 1.267379679144385, + "grad_norm": 0.4347707453387343, + "learning_rate": 7.626578046630166e-06, + "loss": 0.8441, + "num_tokens": 44576948519.0, + "step": 10665 + }, + { + "epoch": 1.267498514557338, + "grad_norm": 0.3941976090512332, + "learning_rate": 7.624972411468274e-06, + "loss": 0.8128, + "num_tokens": 44581119061.0, + "step": 10666 + }, + { + "epoch": 1.2676173499702912, + "grad_norm": 0.46699508469677564, + "learning_rate": 7.623366901295068e-06, + "loss": 0.8261, + "num_tokens": 44585305814.0, + "step": 10667 + }, + { + "epoch": 1.2677361853832443, + "grad_norm": 0.423765387946219, + "learning_rate": 7.6217615161700005e-06, + "loss": 0.8056, + "num_tokens": 44589494276.0, + "step": 10668 + }, + { + "epoch": 1.2678550207961972, + "grad_norm": 0.5160505442106353, + "learning_rate": 7.620156256152527e-06, + "loss": 0.8264, + "num_tokens": 44593682729.0, + "step": 10669 + }, + { + "epoch": 1.2679738562091503, + "grad_norm": 0.4454096500880105, + "learning_rate": 7.618551121302095e-06, + "loss": 0.8248, + "num_tokens": 44597831870.0, + "step": 10670 + }, + { + "epoch": 1.2680926916221034, + "grad_norm": 0.3893675355470441, + "learning_rate": 7.616946111678147e-06, + "loss": 0.8486, + "num_tokens": 44602019261.0, + "step": 10671 + }, + { + "epoch": 1.2682115270350565, + "grad_norm": 0.3811025855141326, + "learning_rate": 7.615341227340123e-06, + "loss": 0.8387, + "num_tokens": 44606189558.0, + "step": 10672 + }, + { + "epoch": 1.2683303624480096, + "grad_norm": 0.4162242738509803, + "learning_rate": 7.6137364683474565e-06, + "loss": 0.8446, + "num_tokens": 44610355752.0, + "step": 10673 + }, + { + "epoch": 1.2684491978609627, + "grad_norm": 0.32925307216038585, + "learning_rate": 7.61213183475958e-06, + "loss": 0.8253, + "num_tokens": 44614520806.0, + "step": 10674 + }, + { + "epoch": 1.2685680332739158, + "grad_norm": 0.49078742589156965, + "learning_rate": 7.610527326635917e-06, + "loss": 0.8215, + "num_tokens": 44618665449.0, + "step": 10675 + }, + { + "epoch": 1.2686868686868686, + "grad_norm": 0.4026378407276537, + "learning_rate": 7.608922944035883e-06, + "loss": 0.8006, + "num_tokens": 44622856236.0, + "step": 10676 + }, + { + "epoch": 1.2688057040998217, + "grad_norm": 0.4810488284268289, + "learning_rate": 7.607318687018901e-06, + "loss": 0.8212, + "num_tokens": 44627044771.0, + "step": 10677 + }, + { + "epoch": 1.2689245395127748, + "grad_norm": 0.4373357464327509, + "learning_rate": 7.605714555644381e-06, + "loss": 0.7923, + "num_tokens": 44631219744.0, + "step": 10678 + }, + { + "epoch": 1.269043374925728, + "grad_norm": 0.3898184015859062, + "learning_rate": 7.604110549971719e-06, + "loss": 0.8248, + "num_tokens": 44635409539.0, + "step": 10679 + }, + { + "epoch": 1.2691622103386808, + "grad_norm": 0.3829783138066893, + "learning_rate": 7.602506670060332e-06, + "loss": 0.7953, + "num_tokens": 44639567718.0, + "step": 10680 + }, + { + "epoch": 1.269281045751634, + "grad_norm": 0.3909522868143982, + "learning_rate": 7.600902915969608e-06, + "loss": 0.8411, + "num_tokens": 44643706952.0, + "step": 10681 + }, + { + "epoch": 1.269399881164587, + "grad_norm": 0.32949230991774253, + "learning_rate": 7.599299287758943e-06, + "loss": 0.8611, + "num_tokens": 44647894906.0, + "step": 10682 + }, + { + "epoch": 1.2695187165775401, + "grad_norm": 0.5098959639945851, + "learning_rate": 7.5976957854877245e-06, + "loss": 0.8337, + "num_tokens": 44652063525.0, + "step": 10683 + }, + { + "epoch": 1.2696375519904932, + "grad_norm": 0.40280452002673905, + "learning_rate": 7.596092409215332e-06, + "loss": 0.8121, + "num_tokens": 44656249498.0, + "step": 10684 + }, + { + "epoch": 1.2697563874034463, + "grad_norm": 0.52292260269806, + "learning_rate": 7.594489159001146e-06, + "loss": 0.795, + "num_tokens": 44660439336.0, + "step": 10685 + }, + { + "epoch": 1.2698752228163994, + "grad_norm": 0.47791677630176976, + "learning_rate": 7.592886034904543e-06, + "loss": 0.8193, + "num_tokens": 44664630265.0, + "step": 10686 + }, + { + "epoch": 1.2699940582293523, + "grad_norm": 0.4035112717258869, + "learning_rate": 7.591283036984887e-06, + "loss": 0.8359, + "num_tokens": 44668818555.0, + "step": 10687 + }, + { + "epoch": 1.2701128936423054, + "grad_norm": 0.381350734200528, + "learning_rate": 7.589680165301549e-06, + "loss": 0.8575, + "num_tokens": 44672990883.0, + "step": 10688 + }, + { + "epoch": 1.2702317290552585, + "grad_norm": 0.4107670253859739, + "learning_rate": 7.588077419913884e-06, + "loss": 0.8472, + "num_tokens": 44677157907.0, + "step": 10689 + }, + { + "epoch": 1.2703505644682116, + "grad_norm": 0.3282750402431279, + "learning_rate": 7.586474800881247e-06, + "loss": 0.8046, + "num_tokens": 44681325066.0, + "step": 10690 + }, + { + "epoch": 1.2704693998811645, + "grad_norm": 0.4650234031423702, + "learning_rate": 7.584872308262989e-06, + "loss": 0.8372, + "num_tokens": 44685514189.0, + "step": 10691 + }, + { + "epoch": 1.2705882352941176, + "grad_norm": 0.3931608399612275, + "learning_rate": 7.583269942118456e-06, + "loss": 0.8211, + "num_tokens": 44689701864.0, + "step": 10692 + }, + { + "epoch": 1.2707070707070707, + "grad_norm": 0.479695563165167, + "learning_rate": 7.581667702506992e-06, + "loss": 0.8039, + "num_tokens": 44693878686.0, + "step": 10693 + }, + { + "epoch": 1.2708259061200238, + "grad_norm": 0.3892648055837514, + "learning_rate": 7.580065589487928e-06, + "loss": 0.847, + "num_tokens": 44698065644.0, + "step": 10694 + }, + { + "epoch": 1.2709447415329769, + "grad_norm": 0.454395542156406, + "learning_rate": 7.5784636031206e-06, + "loss": 0.8301, + "num_tokens": 44702233851.0, + "step": 10695 + }, + { + "epoch": 1.27106357694593, + "grad_norm": 0.40660057864291654, + "learning_rate": 7.576861743464334e-06, + "loss": 0.8069, + "num_tokens": 44706420878.0, + "step": 10696 + }, + { + "epoch": 1.271182412358883, + "grad_norm": 0.4162524393151236, + "learning_rate": 7.575260010578451e-06, + "loss": 0.8161, + "num_tokens": 44710579337.0, + "step": 10697 + }, + { + "epoch": 1.271301247771836, + "grad_norm": 0.3863749818063787, + "learning_rate": 7.5736584045222664e-06, + "loss": 0.7782, + "num_tokens": 44714746082.0, + "step": 10698 + }, + { + "epoch": 1.271420083184789, + "grad_norm": 0.4606369472903162, + "learning_rate": 7.572056925355098e-06, + "loss": 0.7987, + "num_tokens": 44718901046.0, + "step": 10699 + }, + { + "epoch": 1.2715389185977422, + "grad_norm": 0.38905574344931493, + "learning_rate": 7.570455573136251e-06, + "loss": 0.8151, + "num_tokens": 44723062863.0, + "step": 10700 + }, + { + "epoch": 1.2716577540106953, + "grad_norm": 0.4454623916077319, + "learning_rate": 7.56885434792503e-06, + "loss": 0.8214, + "num_tokens": 44727250478.0, + "step": 10701 + }, + { + "epoch": 1.2717765894236481, + "grad_norm": 0.3814855048597336, + "learning_rate": 7.567253249780735e-06, + "loss": 0.8411, + "num_tokens": 44731438785.0, + "step": 10702 + }, + { + "epoch": 1.2718954248366012, + "grad_norm": 0.4464944290540391, + "learning_rate": 7.565652278762656e-06, + "loss": 0.8321, + "num_tokens": 44735611854.0, + "step": 10703 + }, + { + "epoch": 1.2720142602495543, + "grad_norm": 0.41662456310463963, + "learning_rate": 7.5640514349300845e-06, + "loss": 0.8428, + "num_tokens": 44739800273.0, + "step": 10704 + }, + { + "epoch": 1.2721330956625074, + "grad_norm": 0.44157022803563956, + "learning_rate": 7.562450718342309e-06, + "loss": 0.7636, + "num_tokens": 44743988172.0, + "step": 10705 + }, + { + "epoch": 1.2722519310754605, + "grad_norm": 0.4158049081555871, + "learning_rate": 7.560850129058603e-06, + "loss": 0.8102, + "num_tokens": 44748177682.0, + "step": 10706 + }, + { + "epoch": 1.2723707664884136, + "grad_norm": 0.43581626807341656, + "learning_rate": 7.559249667138249e-06, + "loss": 0.8048, + "num_tokens": 44752344528.0, + "step": 10707 + }, + { + "epoch": 1.2724896019013667, + "grad_norm": 0.3443634823738071, + "learning_rate": 7.557649332640509e-06, + "loss": 0.8088, + "num_tokens": 44756534559.0, + "step": 10708 + }, + { + "epoch": 1.2726084373143196, + "grad_norm": 0.45269021056669384, + "learning_rate": 7.556049125624656e-06, + "loss": 0.8448, + "num_tokens": 44760724649.0, + "step": 10709 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 0.39147125967429136, + "learning_rate": 7.554449046149943e-06, + "loss": 0.7959, + "num_tokens": 44764901879.0, + "step": 10710 + }, + { + "epoch": 1.2728461081402258, + "grad_norm": 0.45810508243384646, + "learning_rate": 7.552849094275636e-06, + "loss": 0.7937, + "num_tokens": 44769059786.0, + "step": 10711 + }, + { + "epoch": 1.272964943553179, + "grad_norm": 0.414833010501557, + "learning_rate": 7.5512492700609805e-06, + "loss": 0.8121, + "num_tokens": 44773247418.0, + "step": 10712 + }, + { + "epoch": 1.2730837789661318, + "grad_norm": 0.43677668282128074, + "learning_rate": 7.549649573565225e-06, + "loss": 0.857, + "num_tokens": 44777415234.0, + "step": 10713 + }, + { + "epoch": 1.2732026143790849, + "grad_norm": 0.40310342722285214, + "learning_rate": 7.548050004847616e-06, + "loss": 0.7905, + "num_tokens": 44781584428.0, + "step": 10714 + }, + { + "epoch": 1.273321449792038, + "grad_norm": 0.35201383891178617, + "learning_rate": 7.546450563967383e-06, + "loss": 0.8479, + "num_tokens": 44785772477.0, + "step": 10715 + }, + { + "epoch": 1.273440285204991, + "grad_norm": 0.3086063623411752, + "learning_rate": 7.544851250983762e-06, + "loss": 0.7999, + "num_tokens": 44789947636.0, + "step": 10716 + }, + { + "epoch": 1.2735591206179442, + "grad_norm": 0.42301913617851805, + "learning_rate": 7.5432520659559795e-06, + "loss": 0.833, + "num_tokens": 44794136150.0, + "step": 10717 + }, + { + "epoch": 1.2736779560308973, + "grad_norm": 0.30587276108479566, + "learning_rate": 7.5416530089432635e-06, + "loss": 0.8623, + "num_tokens": 44798325345.0, + "step": 10718 + }, + { + "epoch": 1.2737967914438504, + "grad_norm": 0.46781345169454636, + "learning_rate": 7.540054080004831e-06, + "loss": 0.8217, + "num_tokens": 44802467941.0, + "step": 10719 + }, + { + "epoch": 1.2739156268568033, + "grad_norm": 0.3707701206430133, + "learning_rate": 7.53845527919989e-06, + "loss": 0.8176, + "num_tokens": 44806656891.0, + "step": 10720 + }, + { + "epoch": 1.2740344622697564, + "grad_norm": 0.4334204339553149, + "learning_rate": 7.536856606587657e-06, + "loss": 0.8018, + "num_tokens": 44810845060.0, + "step": 10721 + }, + { + "epoch": 1.2741532976827095, + "grad_norm": 0.4432127674203393, + "learning_rate": 7.535258062227333e-06, + "loss": 0.7875, + "num_tokens": 44815024082.0, + "step": 10722 + }, + { + "epoch": 1.2742721330956626, + "grad_norm": 0.41103679820397476, + "learning_rate": 7.533659646178113e-06, + "loss": 0.8195, + "num_tokens": 44819213009.0, + "step": 10723 + }, + { + "epoch": 1.2743909685086154, + "grad_norm": 0.44017442031756315, + "learning_rate": 7.532061358499201e-06, + "loss": 0.8139, + "num_tokens": 44823373971.0, + "step": 10724 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 0.38944788458113555, + "learning_rate": 7.53046319924978e-06, + "loss": 0.7764, + "num_tokens": 44827533426.0, + "step": 10725 + }, + { + "epoch": 1.2746286393345216, + "grad_norm": 0.36324753339696014, + "learning_rate": 7.5288651684890394e-06, + "loss": 0.8278, + "num_tokens": 44831723825.0, + "step": 10726 + }, + { + "epoch": 1.2747474747474747, + "grad_norm": 0.37517428498559297, + "learning_rate": 7.527267266276157e-06, + "loss": 0.8428, + "num_tokens": 44835914438.0, + "step": 10727 + }, + { + "epoch": 1.2748663101604278, + "grad_norm": 0.3341215059596548, + "learning_rate": 7.5256694926703085e-06, + "loss": 0.792, + "num_tokens": 44840074203.0, + "step": 10728 + }, + { + "epoch": 1.274985145573381, + "grad_norm": 0.43705778130802064, + "learning_rate": 7.524071847730665e-06, + "loss": 0.8175, + "num_tokens": 44844264774.0, + "step": 10729 + }, + { + "epoch": 1.275103980986334, + "grad_norm": 0.36378098513717144, + "learning_rate": 7.522474331516394e-06, + "loss": 0.8131, + "num_tokens": 44848453667.0, + "step": 10730 + }, + { + "epoch": 1.275222816399287, + "grad_norm": 0.46071393761509166, + "learning_rate": 7.520876944086657e-06, + "loss": 0.829, + "num_tokens": 44852632129.0, + "step": 10731 + }, + { + "epoch": 1.27534165181224, + "grad_norm": 0.4252419508712771, + "learning_rate": 7.519279685500611e-06, + "loss": 0.8141, + "num_tokens": 44856820451.0, + "step": 10732 + }, + { + "epoch": 1.2754604872251931, + "grad_norm": 0.4394407387089145, + "learning_rate": 7.5176825558174075e-06, + "loss": 0.8327, + "num_tokens": 44861005334.0, + "step": 10733 + }, + { + "epoch": 1.2755793226381462, + "grad_norm": 0.41335054339933136, + "learning_rate": 7.516085555096191e-06, + "loss": 0.8275, + "num_tokens": 44865171200.0, + "step": 10734 + }, + { + "epoch": 1.275698158051099, + "grad_norm": 0.3767991521615096, + "learning_rate": 7.514488683396106e-06, + "loss": 0.8141, + "num_tokens": 44869315716.0, + "step": 10735 + }, + { + "epoch": 1.2758169934640522, + "grad_norm": 0.36854423702819955, + "learning_rate": 7.5128919407762876e-06, + "loss": 0.7885, + "num_tokens": 44873505038.0, + "step": 10736 + }, + { + "epoch": 1.2759358288770053, + "grad_norm": 0.3979686009065928, + "learning_rate": 7.511295327295873e-06, + "loss": 0.811, + "num_tokens": 44877682562.0, + "step": 10737 + }, + { + "epoch": 1.2760546642899584, + "grad_norm": 0.339230818180369, + "learning_rate": 7.509698843013991e-06, + "loss": 0.8392, + "num_tokens": 44881872050.0, + "step": 10738 + }, + { + "epoch": 1.2761734997029115, + "grad_norm": 0.4337321488397912, + "learning_rate": 7.508102487989757e-06, + "loss": 0.8029, + "num_tokens": 44886061733.0, + "step": 10739 + }, + { + "epoch": 1.2762923351158646, + "grad_norm": 0.365876175769694, + "learning_rate": 7.506506262282296e-06, + "loss": 0.8264, + "num_tokens": 44890251043.0, + "step": 10740 + }, + { + "epoch": 1.2764111705288177, + "grad_norm": 0.44334668803414296, + "learning_rate": 7.504910165950721e-06, + "loss": 0.7848, + "num_tokens": 44894414912.0, + "step": 10741 + }, + { + "epoch": 1.2765300059417706, + "grad_norm": 0.3835120537656486, + "learning_rate": 7.503314199054137e-06, + "loss": 0.8492, + "num_tokens": 44898601999.0, + "step": 10742 + }, + { + "epoch": 1.2766488413547237, + "grad_norm": 0.4510747307228632, + "learning_rate": 7.5017183616516535e-06, + "loss": 0.8008, + "num_tokens": 44902779889.0, + "step": 10743 + }, + { + "epoch": 1.2767676767676768, + "grad_norm": 0.394606889434525, + "learning_rate": 7.500122653802367e-06, + "loss": 0.8274, + "num_tokens": 44906942092.0, + "step": 10744 + }, + { + "epoch": 1.2768865121806299, + "grad_norm": 0.41224063559139074, + "learning_rate": 7.4985270755653694e-06, + "loss": 0.7835, + "num_tokens": 44911131302.0, + "step": 10745 + }, + { + "epoch": 1.2770053475935828, + "grad_norm": 0.39423130911036774, + "learning_rate": 7.4969316269997576e-06, + "loss": 0.8491, + "num_tokens": 44915320286.0, + "step": 10746 + }, + { + "epoch": 1.2771241830065359, + "grad_norm": 0.3538426292541365, + "learning_rate": 7.4953363081646065e-06, + "loss": 0.7956, + "num_tokens": 44919509652.0, + "step": 10747 + }, + { + "epoch": 1.277243018419489, + "grad_norm": 0.32447657044584427, + "learning_rate": 7.4937411191190044e-06, + "loss": 0.7897, + "num_tokens": 44923654509.0, + "step": 10748 + }, + { + "epoch": 1.277361853832442, + "grad_norm": 0.3684738555179963, + "learning_rate": 7.4921460599220234e-06, + "loss": 0.8586, + "num_tokens": 44927841737.0, + "step": 10749 + }, + { + "epoch": 1.2774806892453952, + "grad_norm": 0.29691013711236164, + "learning_rate": 7.490551130632733e-06, + "loss": 0.8282, + "num_tokens": 44932012103.0, + "step": 10750 + }, + { + "epoch": 1.2775995246583483, + "grad_norm": 0.39728872686869265, + "learning_rate": 7.488956331310202e-06, + "loss": 0.814, + "num_tokens": 44936199847.0, + "step": 10751 + }, + { + "epoch": 1.2777183600713014, + "grad_norm": 0.29808506893812076, + "learning_rate": 7.487361662013488e-06, + "loss": 0.8108, + "num_tokens": 44940387365.0, + "step": 10752 + }, + { + "epoch": 1.2778371954842542, + "grad_norm": 0.4304939889158196, + "learning_rate": 7.485767122801646e-06, + "loss": 0.8514, + "num_tokens": 44944576135.0, + "step": 10753 + }, + { + "epoch": 1.2779560308972073, + "grad_norm": 0.3041390421045805, + "learning_rate": 7.484172713733735e-06, + "loss": 0.8136, + "num_tokens": 44948765828.0, + "step": 10754 + }, + { + "epoch": 1.2780748663101604, + "grad_norm": 0.4453197849197265, + "learning_rate": 7.48257843486879e-06, + "loss": 0.8189, + "num_tokens": 44952948527.0, + "step": 10755 + }, + { + "epoch": 1.2781937017231135, + "grad_norm": 0.3490028712146658, + "learning_rate": 7.48098428626586e-06, + "loss": 0.8172, + "num_tokens": 44957125309.0, + "step": 10756 + }, + { + "epoch": 1.2783125371360666, + "grad_norm": 0.4253766105221932, + "learning_rate": 7.479390267983979e-06, + "loss": 0.8083, + "num_tokens": 44961314143.0, + "step": 10757 + }, + { + "epoch": 1.2784313725490195, + "grad_norm": 0.3937054074775364, + "learning_rate": 7.477796380082183e-06, + "loss": 0.7893, + "num_tokens": 44965503596.0, + "step": 10758 + }, + { + "epoch": 1.2785502079619726, + "grad_norm": 0.4013810408754803, + "learning_rate": 7.476202622619495e-06, + "loss": 0.8219, + "num_tokens": 44969692502.0, + "step": 10759 + }, + { + "epoch": 1.2786690433749257, + "grad_norm": 0.38163858666194833, + "learning_rate": 7.474608995654937e-06, + "loss": 0.768, + "num_tokens": 44973870408.0, + "step": 10760 + }, + { + "epoch": 1.2787878787878788, + "grad_norm": 0.38344229724005247, + "learning_rate": 7.473015499247527e-06, + "loss": 0.832, + "num_tokens": 44978031249.0, + "step": 10761 + }, + { + "epoch": 1.278906714200832, + "grad_norm": 0.39446447008030705, + "learning_rate": 7.471422133456279e-06, + "loss": 0.8254, + "num_tokens": 44982202295.0, + "step": 10762 + }, + { + "epoch": 1.279025549613785, + "grad_norm": 0.3767321221494065, + "learning_rate": 7.469828898340199e-06, + "loss": 0.7904, + "num_tokens": 44986392149.0, + "step": 10763 + }, + { + "epoch": 1.279144385026738, + "grad_norm": 0.40219593790234526, + "learning_rate": 7.468235793958292e-06, + "loss": 0.8501, + "num_tokens": 44990570707.0, + "step": 10764 + }, + { + "epoch": 1.279263220439691, + "grad_norm": 0.33608496200942534, + "learning_rate": 7.466642820369554e-06, + "loss": 0.7971, + "num_tokens": 44994759888.0, + "step": 10765 + }, + { + "epoch": 1.279382055852644, + "grad_norm": 0.3351520965889249, + "learning_rate": 7.4650499776329784e-06, + "loss": 0.7981, + "num_tokens": 44998941892.0, + "step": 10766 + }, + { + "epoch": 1.2795008912655972, + "grad_norm": 0.35145187772374265, + "learning_rate": 7.4634572658075545e-06, + "loss": 0.8153, + "num_tokens": 45003131470.0, + "step": 10767 + }, + { + "epoch": 1.2796197266785503, + "grad_norm": 0.3376220936069989, + "learning_rate": 7.461864684952262e-06, + "loss": 0.7907, + "num_tokens": 45007321551.0, + "step": 10768 + }, + { + "epoch": 1.2797385620915032, + "grad_norm": 0.343893401166834, + "learning_rate": 7.460272235126089e-06, + "loss": 0.832, + "num_tokens": 45011511810.0, + "step": 10769 + }, + { + "epoch": 1.2798573975044563, + "grad_norm": 0.2873725868081904, + "learning_rate": 7.458679916388001e-06, + "loss": 0.7833, + "num_tokens": 45015691819.0, + "step": 10770 + }, + { + "epoch": 1.2799762329174094, + "grad_norm": 0.3667163352741474, + "learning_rate": 7.457087728796968e-06, + "loss": 0.8301, + "num_tokens": 45019865772.0, + "step": 10771 + }, + { + "epoch": 1.2800950683303625, + "grad_norm": 0.30784743846359053, + "learning_rate": 7.455495672411959e-06, + "loss": 0.7785, + "num_tokens": 45024024300.0, + "step": 10772 + }, + { + "epoch": 1.2802139037433156, + "grad_norm": 0.36556905475199103, + "learning_rate": 7.4539037472919266e-06, + "loss": 0.7813, + "num_tokens": 45028177344.0, + "step": 10773 + }, + { + "epoch": 1.2803327391562687, + "grad_norm": 0.32906092657950753, + "learning_rate": 7.452311953495825e-06, + "loss": 0.7799, + "num_tokens": 45032366752.0, + "step": 10774 + }, + { + "epoch": 1.2804515745692218, + "grad_norm": 0.3679832432642906, + "learning_rate": 7.450720291082609e-06, + "loss": 0.8188, + "num_tokens": 45036556221.0, + "step": 10775 + }, + { + "epoch": 1.2805704099821746, + "grad_norm": 0.34515147431458226, + "learning_rate": 7.449128760111224e-06, + "loss": 0.8364, + "num_tokens": 45040745668.0, + "step": 10776 + }, + { + "epoch": 1.2806892453951277, + "grad_norm": 0.3518977665723587, + "learning_rate": 7.4475373606406045e-06, + "loss": 0.8266, + "num_tokens": 45044903711.0, + "step": 10777 + }, + { + "epoch": 1.2808080808080808, + "grad_norm": 0.32554442199423406, + "learning_rate": 7.445946092729688e-06, + "loss": 0.8219, + "num_tokens": 45049064216.0, + "step": 10778 + }, + { + "epoch": 1.280926916221034, + "grad_norm": 0.37767849634711614, + "learning_rate": 7.444354956437405e-06, + "loss": 0.8307, + "num_tokens": 45053217790.0, + "step": 10779 + }, + { + "epoch": 1.2810457516339868, + "grad_norm": 0.31892837504550925, + "learning_rate": 7.442763951822678e-06, + "loss": 0.8846, + "num_tokens": 45057406637.0, + "step": 10780 + }, + { + "epoch": 1.28116458704694, + "grad_norm": 0.4007422010524896, + "learning_rate": 7.441173078944428e-06, + "loss": 0.8576, + "num_tokens": 45061579094.0, + "step": 10781 + }, + { + "epoch": 1.281283422459893, + "grad_norm": 0.33513755492985603, + "learning_rate": 7.439582337861575e-06, + "loss": 0.8355, + "num_tokens": 45065767213.0, + "step": 10782 + }, + { + "epoch": 1.2814022578728461, + "grad_norm": 0.3804028257705722, + "learning_rate": 7.437991728633025e-06, + "loss": 0.7831, + "num_tokens": 45069956951.0, + "step": 10783 + }, + { + "epoch": 1.2815210932857992, + "grad_norm": 0.3040391760676366, + "learning_rate": 7.4364012513176865e-06, + "loss": 0.8514, + "num_tokens": 45074145845.0, + "step": 10784 + }, + { + "epoch": 1.2816399286987523, + "grad_norm": 0.335482450599788, + "learning_rate": 7.434810905974457e-06, + "loss": 0.7697, + "num_tokens": 45078335440.0, + "step": 10785 + }, + { + "epoch": 1.2817587641117054, + "grad_norm": 0.2862203199136419, + "learning_rate": 7.433220692662233e-06, + "loss": 0.8321, + "num_tokens": 45082477482.0, + "step": 10786 + }, + { + "epoch": 1.2818775995246583, + "grad_norm": 0.36293184780849963, + "learning_rate": 7.431630611439904e-06, + "loss": 0.8374, + "num_tokens": 45086668191.0, + "step": 10787 + }, + { + "epoch": 1.2819964349376114, + "grad_norm": 0.3020600393567486, + "learning_rate": 7.430040662366362e-06, + "loss": 0.8058, + "num_tokens": 45090816253.0, + "step": 10788 + }, + { + "epoch": 1.2821152703505645, + "grad_norm": 0.36929115180096544, + "learning_rate": 7.428450845500483e-06, + "loss": 0.8238, + "num_tokens": 45095004704.0, + "step": 10789 + }, + { + "epoch": 1.2822341057635176, + "grad_norm": 0.29625329494124636, + "learning_rate": 7.426861160901146e-06, + "loss": 0.8375, + "num_tokens": 45099179519.0, + "step": 10790 + }, + { + "epoch": 1.2823529411764705, + "grad_norm": 0.378859998091657, + "learning_rate": 7.425271608627222e-06, + "loss": 0.7968, + "num_tokens": 45103360612.0, + "step": 10791 + }, + { + "epoch": 1.2824717765894236, + "grad_norm": 0.3089807010376173, + "learning_rate": 7.423682188737575e-06, + "loss": 0.8204, + "num_tokens": 45107534255.0, + "step": 10792 + }, + { + "epoch": 1.2825906120023767, + "grad_norm": 0.3245221669310278, + "learning_rate": 7.422092901291067e-06, + "loss": 0.8349, + "num_tokens": 45111723761.0, + "step": 10793 + }, + { + "epoch": 1.2827094474153298, + "grad_norm": 0.33485116221795896, + "learning_rate": 7.420503746346559e-06, + "loss": 0.8262, + "num_tokens": 45115912536.0, + "step": 10794 + }, + { + "epoch": 1.2828282828282829, + "grad_norm": 0.3039830711637742, + "learning_rate": 7.4189147239629e-06, + "loss": 0.8431, + "num_tokens": 45120102683.0, + "step": 10795 + }, + { + "epoch": 1.282947118241236, + "grad_norm": 0.3078318883113195, + "learning_rate": 7.417325834198936e-06, + "loss": 0.8225, + "num_tokens": 45124291817.0, + "step": 10796 + }, + { + "epoch": 1.283065953654189, + "grad_norm": 0.365932809448787, + "learning_rate": 7.415737077113507e-06, + "loss": 0.8171, + "num_tokens": 45128452727.0, + "step": 10797 + }, + { + "epoch": 1.283184789067142, + "grad_norm": 0.3073564081479955, + "learning_rate": 7.414148452765458e-06, + "loss": 0.8078, + "num_tokens": 45132642230.0, + "step": 10798 + }, + { + "epoch": 1.283303624480095, + "grad_norm": 0.3680468427636603, + "learning_rate": 7.41255996121361e-06, + "loss": 0.8035, + "num_tokens": 45136800525.0, + "step": 10799 + }, + { + "epoch": 1.2834224598930482, + "grad_norm": 0.31784853263899165, + "learning_rate": 7.4109716025167985e-06, + "loss": 0.8425, + "num_tokens": 45140959742.0, + "step": 10800 + }, + { + "epoch": 1.2835412953060013, + "grad_norm": 0.3379498345445911, + "learning_rate": 7.409383376733845e-06, + "loss": 0.7632, + "num_tokens": 45145149848.0, + "step": 10801 + }, + { + "epoch": 1.2836601307189541, + "grad_norm": 0.33217454247824013, + "learning_rate": 7.4077952839235645e-06, + "loss": 0.8155, + "num_tokens": 45149339931.0, + "step": 10802 + }, + { + "epoch": 1.2837789661319072, + "grad_norm": 0.33022194516013303, + "learning_rate": 7.406207324144772e-06, + "loss": 0.8045, + "num_tokens": 45153530375.0, + "step": 10803 + }, + { + "epoch": 1.2838978015448603, + "grad_norm": 0.3457908765850951, + "learning_rate": 7.404619497456269e-06, + "loss": 0.7977, + "num_tokens": 45157713598.0, + "step": 10804 + }, + { + "epoch": 1.2840166369578134, + "grad_norm": 0.31405600313783527, + "learning_rate": 7.403031803916863e-06, + "loss": 0.8373, + "num_tokens": 45161900719.0, + "step": 10805 + }, + { + "epoch": 1.2841354723707665, + "grad_norm": 0.32942248067238217, + "learning_rate": 7.401444243585352e-06, + "loss": 0.8121, + "num_tokens": 45166090945.0, + "step": 10806 + }, + { + "epoch": 1.2842543077837196, + "grad_norm": 0.30275624209091495, + "learning_rate": 7.399856816520527e-06, + "loss": 0.8235, + "num_tokens": 45170280171.0, + "step": 10807 + }, + { + "epoch": 1.2843731431966727, + "grad_norm": 0.3196926385056769, + "learning_rate": 7.398269522781179e-06, + "loss": 0.8033, + "num_tokens": 45174429144.0, + "step": 10808 + }, + { + "epoch": 1.2844919786096256, + "grad_norm": 0.33918291183915605, + "learning_rate": 7.396682362426086e-06, + "loss": 0.7901, + "num_tokens": 45178619645.0, + "step": 10809 + }, + { + "epoch": 1.2846108140225787, + "grad_norm": 0.29256511554224757, + "learning_rate": 7.395095335514029e-06, + "loss": 0.8184, + "num_tokens": 45182808522.0, + "step": 10810 + }, + { + "epoch": 1.2847296494355318, + "grad_norm": 0.35085232552879086, + "learning_rate": 7.3935084421037805e-06, + "loss": 0.8096, + "num_tokens": 45186997940.0, + "step": 10811 + }, + { + "epoch": 1.284848484848485, + "grad_norm": 0.310589290468892, + "learning_rate": 7.391921682254104e-06, + "loss": 0.8417, + "num_tokens": 45191187322.0, + "step": 10812 + }, + { + "epoch": 1.2849673202614378, + "grad_norm": 0.30163270914833384, + "learning_rate": 7.390335056023772e-06, + "loss": 0.8165, + "num_tokens": 45195376887.0, + "step": 10813 + }, + { + "epoch": 1.2850861556743909, + "grad_norm": 0.3198928228278265, + "learning_rate": 7.388748563471535e-06, + "loss": 0.8446, + "num_tokens": 45199557278.0, + "step": 10814 + }, + { + "epoch": 1.285204991087344, + "grad_norm": 0.2786959163794564, + "learning_rate": 7.3871622046561506e-06, + "loss": 0.7945, + "num_tokens": 45203745112.0, + "step": 10815 + }, + { + "epoch": 1.285323826500297, + "grad_norm": 0.3386535945737579, + "learning_rate": 7.385575979636366e-06, + "loss": 0.8268, + "num_tokens": 45207922386.0, + "step": 10816 + }, + { + "epoch": 1.2854426619132502, + "grad_norm": 0.27575045467984105, + "learning_rate": 7.383989888470921e-06, + "loss": 0.802, + "num_tokens": 45212111102.0, + "step": 10817 + }, + { + "epoch": 1.2855614973262033, + "grad_norm": 0.27330074688358996, + "learning_rate": 7.382403931218558e-06, + "loss": 0.7869, + "num_tokens": 45216299816.0, + "step": 10818 + }, + { + "epoch": 1.2856803327391564, + "grad_norm": 0.2656706128627447, + "learning_rate": 7.380818107938006e-06, + "loss": 0.8297, + "num_tokens": 45220489796.0, + "step": 10819 + }, + { + "epoch": 1.2857991681521093, + "grad_norm": 0.271852687654809, + "learning_rate": 7.379232418687996e-06, + "loss": 0.8177, + "num_tokens": 45224679267.0, + "step": 10820 + }, + { + "epoch": 1.2859180035650624, + "grad_norm": 0.2827870629259425, + "learning_rate": 7.377646863527255e-06, + "loss": 0.7943, + "num_tokens": 45228837196.0, + "step": 10821 + }, + { + "epoch": 1.2860368389780155, + "grad_norm": 0.2883000785790766, + "learning_rate": 7.376061442514497e-06, + "loss": 0.7923, + "num_tokens": 45232984406.0, + "step": 10822 + }, + { + "epoch": 1.2861556743909686, + "grad_norm": 0.2723381269882246, + "learning_rate": 7.374476155708434e-06, + "loss": 0.8402, + "num_tokens": 45237172616.0, + "step": 10823 + }, + { + "epoch": 1.2862745098039214, + "grad_norm": 0.2716820706262749, + "learning_rate": 7.3728910031677835e-06, + "loss": 0.8011, + "num_tokens": 45241362642.0, + "step": 10824 + }, + { + "epoch": 1.2863933452168745, + "grad_norm": 0.27027795482233913, + "learning_rate": 7.371305984951234e-06, + "loss": 0.7943, + "num_tokens": 45245552815.0, + "step": 10825 + }, + { + "epoch": 1.2865121806298276, + "grad_norm": 0.2735755690645571, + "learning_rate": 7.3697211011175e-06, + "loss": 0.7976, + "num_tokens": 45249711323.0, + "step": 10826 + }, + { + "epoch": 1.2866310160427807, + "grad_norm": 0.2849382902084961, + "learning_rate": 7.3681363517252636e-06, + "loss": 0.8321, + "num_tokens": 45253862494.0, + "step": 10827 + }, + { + "epoch": 1.2867498514557338, + "grad_norm": 0.2682752096337917, + "learning_rate": 7.3665517368332205e-06, + "loss": 0.8126, + "num_tokens": 45258026931.0, + "step": 10828 + }, + { + "epoch": 1.286868686868687, + "grad_norm": 0.2781417001997476, + "learning_rate": 7.364967256500053e-06, + "loss": 0.8209, + "num_tokens": 45262214907.0, + "step": 10829 + }, + { + "epoch": 1.28698752228164, + "grad_norm": 0.29984057504376477, + "learning_rate": 7.363382910784436e-06, + "loss": 0.8013, + "num_tokens": 45266390340.0, + "step": 10830 + }, + { + "epoch": 1.287106357694593, + "grad_norm": 0.28385493170432413, + "learning_rate": 7.361798699745042e-06, + "loss": 0.8113, + "num_tokens": 45270580858.0, + "step": 10831 + }, + { + "epoch": 1.287225193107546, + "grad_norm": 0.29861469565651333, + "learning_rate": 7.3602146234405466e-06, + "loss": 0.7549, + "num_tokens": 45274753889.0, + "step": 10832 + }, + { + "epoch": 1.2873440285204991, + "grad_norm": 0.30059520949283686, + "learning_rate": 7.3586306819296085e-06, + "loss": 0.8401, + "num_tokens": 45278941606.0, + "step": 10833 + }, + { + "epoch": 1.2874628639334522, + "grad_norm": 0.2756297610373374, + "learning_rate": 7.357046875270892e-06, + "loss": 0.7731, + "num_tokens": 45283131581.0, + "step": 10834 + }, + { + "epoch": 1.287581699346405, + "grad_norm": 0.2849802919389072, + "learning_rate": 7.355463203523045e-06, + "loss": 0.8275, + "num_tokens": 45287300272.0, + "step": 10835 + }, + { + "epoch": 1.2877005347593582, + "grad_norm": 0.3343169695951357, + "learning_rate": 7.353879666744716e-06, + "loss": 0.8269, + "num_tokens": 45291461474.0, + "step": 10836 + }, + { + "epoch": 1.2878193701723113, + "grad_norm": 0.2657836522381967, + "learning_rate": 7.352296264994554e-06, + "loss": 0.8401, + "num_tokens": 45295650724.0, + "step": 10837 + }, + { + "epoch": 1.2879382055852644, + "grad_norm": 0.37142249033055535, + "learning_rate": 7.350712998331191e-06, + "loss": 0.8128, + "num_tokens": 45299839561.0, + "step": 10838 + }, + { + "epoch": 1.2880570409982175, + "grad_norm": 0.26674290962651964, + "learning_rate": 7.349129866813267e-06, + "loss": 0.8138, + "num_tokens": 45303989051.0, + "step": 10839 + }, + { + "epoch": 1.2881758764111706, + "grad_norm": 0.33726181152650686, + "learning_rate": 7.347546870499404e-06, + "loss": 0.8167, + "num_tokens": 45308178153.0, + "step": 10840 + }, + { + "epoch": 1.2882947118241237, + "grad_norm": 0.2875508356385044, + "learning_rate": 7.3459640094482344e-06, + "loss": 0.7794, + "num_tokens": 45312339480.0, + "step": 10841 + }, + { + "epoch": 1.2884135472370766, + "grad_norm": 0.32326881248309136, + "learning_rate": 7.3443812837183715e-06, + "loss": 0.8376, + "num_tokens": 45316515565.0, + "step": 10842 + }, + { + "epoch": 1.2885323826500297, + "grad_norm": 0.30228675557470946, + "learning_rate": 7.342798693368427e-06, + "loss": 0.7835, + "num_tokens": 45320681014.0, + "step": 10843 + }, + { + "epoch": 1.2886512180629828, + "grad_norm": 0.3315275609315774, + "learning_rate": 7.341216238457014e-06, + "loss": 0.8413, + "num_tokens": 45324865793.0, + "step": 10844 + }, + { + "epoch": 1.2887700534759359, + "grad_norm": 0.32677323772532463, + "learning_rate": 7.339633919042733e-06, + "loss": 0.8284, + "num_tokens": 45329042985.0, + "step": 10845 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 0.27180589359568913, + "learning_rate": 7.338051735184184e-06, + "loss": 0.8047, + "num_tokens": 45333180989.0, + "step": 10846 + }, + { + "epoch": 1.2890077243018419, + "grad_norm": 0.3246947648433961, + "learning_rate": 7.336469686939965e-06, + "loss": 0.7892, + "num_tokens": 45337369726.0, + "step": 10847 + }, + { + "epoch": 1.289126559714795, + "grad_norm": 0.28671108799370854, + "learning_rate": 7.334887774368653e-06, + "loss": 0.8378, + "num_tokens": 45341551224.0, + "step": 10848 + }, + { + "epoch": 1.289245395127748, + "grad_norm": 0.3123005727969871, + "learning_rate": 7.3333059975288435e-06, + "loss": 0.8264, + "num_tokens": 45345738482.0, + "step": 10849 + }, + { + "epoch": 1.2893642305407012, + "grad_norm": 0.2988805366115856, + "learning_rate": 7.331724356479107e-06, + "loss": 0.7918, + "num_tokens": 45349929472.0, + "step": 10850 + }, + { + "epoch": 1.2894830659536543, + "grad_norm": 0.27319596646969446, + "learning_rate": 7.330142851278022e-06, + "loss": 0.8068, + "num_tokens": 45354118121.0, + "step": 10851 + }, + { + "epoch": 1.2896019013666074, + "grad_norm": 0.31197125652468, + "learning_rate": 7.328561481984157e-06, + "loss": 0.8022, + "num_tokens": 45358307002.0, + "step": 10852 + }, + { + "epoch": 1.2897207367795602, + "grad_norm": 0.2773553867864544, + "learning_rate": 7.326980248656073e-06, + "loss": 0.833, + "num_tokens": 45362496883.0, + "step": 10853 + }, + { + "epoch": 1.2898395721925133, + "grad_norm": 0.285756628916862, + "learning_rate": 7.325399151352328e-06, + "loss": 0.8451, + "num_tokens": 45366685479.0, + "step": 10854 + }, + { + "epoch": 1.2899584076054664, + "grad_norm": 0.300509898018146, + "learning_rate": 7.3238181901314775e-06, + "loss": 0.7993, + "num_tokens": 45370832138.0, + "step": 10855 + }, + { + "epoch": 1.2900772430184195, + "grad_norm": 0.28255970654788376, + "learning_rate": 7.3222373650520695e-06, + "loss": 0.8269, + "num_tokens": 45375021533.0, + "step": 10856 + }, + { + "epoch": 1.2901960784313726, + "grad_norm": 0.2556313232157474, + "learning_rate": 7.320656676172644e-06, + "loss": 0.809, + "num_tokens": 45379209757.0, + "step": 10857 + }, + { + "epoch": 1.2903149138443255, + "grad_norm": 0.32473102875014825, + "learning_rate": 7.319076123551743e-06, + "loss": 0.8867, + "num_tokens": 45383378927.0, + "step": 10858 + }, + { + "epoch": 1.2904337492572786, + "grad_norm": 0.27585033630452355, + "learning_rate": 7.317495707247899e-06, + "loss": 0.7977, + "num_tokens": 45387563778.0, + "step": 10859 + }, + { + "epoch": 1.2905525846702317, + "grad_norm": 0.2976700841561475, + "learning_rate": 7.315915427319642e-06, + "loss": 0.7784, + "num_tokens": 45391751948.0, + "step": 10860 + }, + { + "epoch": 1.2906714200831848, + "grad_norm": 0.3031872135706316, + "learning_rate": 7.314335283825491e-06, + "loss": 0.8259, + "num_tokens": 45395931768.0, + "step": 10861 + }, + { + "epoch": 1.290790255496138, + "grad_norm": 0.289870875063957, + "learning_rate": 7.312755276823966e-06, + "loss": 0.7919, + "num_tokens": 45400121586.0, + "step": 10862 + }, + { + "epoch": 1.290909090909091, + "grad_norm": 0.2698551524889032, + "learning_rate": 7.311175406373577e-06, + "loss": 0.8314, + "num_tokens": 45404286777.0, + "step": 10863 + }, + { + "epoch": 1.291027926322044, + "grad_norm": 0.31503176625964785, + "learning_rate": 7.309595672532839e-06, + "loss": 0.7894, + "num_tokens": 45408476379.0, + "step": 10864 + }, + { + "epoch": 1.291146761734997, + "grad_norm": 0.2900793556295741, + "learning_rate": 7.308016075360249e-06, + "loss": 0.7961, + "num_tokens": 45412664429.0, + "step": 10865 + }, + { + "epoch": 1.29126559714795, + "grad_norm": 0.30285210935729906, + "learning_rate": 7.3064366149143095e-06, + "loss": 0.7574, + "num_tokens": 45416853792.0, + "step": 10866 + }, + { + "epoch": 1.2913844325609032, + "grad_norm": 0.3259044024485166, + "learning_rate": 7.304857291253506e-06, + "loss": 0.8213, + "num_tokens": 45421044373.0, + "step": 10867 + }, + { + "epoch": 1.2915032679738563, + "grad_norm": 0.366385412293211, + "learning_rate": 7.303278104436333e-06, + "loss": 0.8084, + "num_tokens": 45425231382.0, + "step": 10868 + }, + { + "epoch": 1.2916221033868092, + "grad_norm": 0.2660654043274278, + "learning_rate": 7.301699054521269e-06, + "loss": 0.8352, + "num_tokens": 45429420876.0, + "step": 10869 + }, + { + "epoch": 1.2917409387997623, + "grad_norm": 0.31338688867141506, + "learning_rate": 7.300120141566795e-06, + "loss": 0.7969, + "num_tokens": 45433610414.0, + "step": 10870 + }, + { + "epoch": 1.2918597742127154, + "grad_norm": 0.27274151111982214, + "learning_rate": 7.298541365631382e-06, + "loss": 0.8255, + "num_tokens": 45437779329.0, + "step": 10871 + }, + { + "epoch": 1.2919786096256685, + "grad_norm": 0.2809837610131963, + "learning_rate": 7.296962726773498e-06, + "loss": 0.8037, + "num_tokens": 45441969586.0, + "step": 10872 + }, + { + "epoch": 1.2920974450386216, + "grad_norm": 0.30160867211920644, + "learning_rate": 7.295384225051601e-06, + "loss": 0.7992, + "num_tokens": 45446159347.0, + "step": 10873 + }, + { + "epoch": 1.2922162804515747, + "grad_norm": 0.2673190871476567, + "learning_rate": 7.293805860524155e-06, + "loss": 0.826, + "num_tokens": 45450335996.0, + "step": 10874 + }, + { + "epoch": 1.2923351158645278, + "grad_norm": 0.31912735495860584, + "learning_rate": 7.292227633249609e-06, + "loss": 0.7975, + "num_tokens": 45454525232.0, + "step": 10875 + }, + { + "epoch": 1.2924539512774806, + "grad_norm": 0.3108310485204987, + "learning_rate": 7.2906495432864055e-06, + "loss": 0.7907, + "num_tokens": 45458674075.0, + "step": 10876 + }, + { + "epoch": 1.2925727866904337, + "grad_norm": 0.2834927892080846, + "learning_rate": 7.289071590692996e-06, + "loss": 0.8125, + "num_tokens": 45462852740.0, + "step": 10877 + }, + { + "epoch": 1.2926916221033868, + "grad_norm": 0.2891906210194395, + "learning_rate": 7.28749377552781e-06, + "loss": 0.8118, + "num_tokens": 45467032545.0, + "step": 10878 + }, + { + "epoch": 1.29281045751634, + "grad_norm": 0.2953366911220562, + "learning_rate": 7.285916097849284e-06, + "loss": 0.7942, + "num_tokens": 45471220636.0, + "step": 10879 + }, + { + "epoch": 1.2929292929292928, + "grad_norm": 0.29368080452059203, + "learning_rate": 7.284338557715842e-06, + "loss": 0.8041, + "num_tokens": 45475365920.0, + "step": 10880 + }, + { + "epoch": 1.293048128342246, + "grad_norm": 0.3102592993074253, + "learning_rate": 7.282761155185905e-06, + "loss": 0.8301, + "num_tokens": 45479553205.0, + "step": 10881 + }, + { + "epoch": 1.293166963755199, + "grad_norm": 0.2951396776318245, + "learning_rate": 7.281183890317887e-06, + "loss": 0.8163, + "num_tokens": 45483741022.0, + "step": 10882 + }, + { + "epoch": 1.2932857991681521, + "grad_norm": 0.2966793266101752, + "learning_rate": 7.279606763170206e-06, + "loss": 0.8437, + "num_tokens": 45487930867.0, + "step": 10883 + }, + { + "epoch": 1.2934046345811052, + "grad_norm": 0.28755955261186467, + "learning_rate": 7.278029773801269e-06, + "loss": 0.7938, + "num_tokens": 45492097486.0, + "step": 10884 + }, + { + "epoch": 1.2935234699940583, + "grad_norm": 0.278226192293006, + "learning_rate": 7.276452922269472e-06, + "loss": 0.7949, + "num_tokens": 45496286843.0, + "step": 10885 + }, + { + "epoch": 1.2936423054070114, + "grad_norm": 0.2618498785132971, + "learning_rate": 7.274876208633213e-06, + "loss": 0.8571, + "num_tokens": 45500477426.0, + "step": 10886 + }, + { + "epoch": 1.2937611408199643, + "grad_norm": 0.29539344803147427, + "learning_rate": 7.273299632950881e-06, + "loss": 0.7799, + "num_tokens": 45504666562.0, + "step": 10887 + }, + { + "epoch": 1.2938799762329174, + "grad_norm": 0.26659415721818075, + "learning_rate": 7.271723195280862e-06, + "loss": 0.8334, + "num_tokens": 45508831599.0, + "step": 10888 + }, + { + "epoch": 1.2939988116458705, + "grad_norm": 0.3246631794064178, + "learning_rate": 7.270146895681544e-06, + "loss": 0.7986, + "num_tokens": 45513020820.0, + "step": 10889 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.295045042699197, + "learning_rate": 7.268570734211296e-06, + "loss": 0.8152, + "num_tokens": 45517209307.0, + "step": 10890 + }, + { + "epoch": 1.2942364824717765, + "grad_norm": 0.338443421690692, + "learning_rate": 7.266994710928488e-06, + "loss": 0.8093, + "num_tokens": 45521383650.0, + "step": 10891 + }, + { + "epoch": 1.2943553178847296, + "grad_norm": 0.27068742865696505, + "learning_rate": 7.26541882589149e-06, + "loss": 0.7968, + "num_tokens": 45525570109.0, + "step": 10892 + }, + { + "epoch": 1.2944741532976827, + "grad_norm": 0.36441564192678394, + "learning_rate": 7.263843079158659e-06, + "loss": 0.8223, + "num_tokens": 45529736342.0, + "step": 10893 + }, + { + "epoch": 1.2945929887106358, + "grad_norm": 0.3064326077366642, + "learning_rate": 7.262267470788349e-06, + "loss": 0.7937, + "num_tokens": 45533925496.0, + "step": 10894 + }, + { + "epoch": 1.2947118241235889, + "grad_norm": 0.2914271242349151, + "learning_rate": 7.260692000838911e-06, + "loss": 0.7958, + "num_tokens": 45538114686.0, + "step": 10895 + }, + { + "epoch": 1.294830659536542, + "grad_norm": 0.3290102394981068, + "learning_rate": 7.259116669368693e-06, + "loss": 0.8476, + "num_tokens": 45542304400.0, + "step": 10896 + }, + { + "epoch": 1.294949494949495, + "grad_norm": 0.2840102611675502, + "learning_rate": 7.257541476436036e-06, + "loss": 0.8319, + "num_tokens": 45546494211.0, + "step": 10897 + }, + { + "epoch": 1.295068330362448, + "grad_norm": 0.3398247110403468, + "learning_rate": 7.25596642209927e-06, + "loss": 0.7951, + "num_tokens": 45550683157.0, + "step": 10898 + }, + { + "epoch": 1.295187165775401, + "grad_norm": 0.29309949675991565, + "learning_rate": 7.254391506416727e-06, + "loss": 0.8016, + "num_tokens": 45554871692.0, + "step": 10899 + }, + { + "epoch": 1.2953060011883542, + "grad_norm": 0.279711478327559, + "learning_rate": 7.252816729446728e-06, + "loss": 0.8239, + "num_tokens": 45559059188.0, + "step": 10900 + }, + { + "epoch": 1.2954248366013073, + "grad_norm": 0.2733661903852781, + "learning_rate": 7.251242091247594e-06, + "loss": 0.8205, + "num_tokens": 45563248021.0, + "step": 10901 + }, + { + "epoch": 1.2955436720142601, + "grad_norm": 0.29278065345789517, + "learning_rate": 7.2496675918776406e-06, + "loss": 0.851, + "num_tokens": 45567429068.0, + "step": 10902 + }, + { + "epoch": 1.2956625074272132, + "grad_norm": 0.2636623150142171, + "learning_rate": 7.248093231395181e-06, + "loss": 0.77, + "num_tokens": 45571620339.0, + "step": 10903 + }, + { + "epoch": 1.2957813428401663, + "grad_norm": 0.30931828373370723, + "learning_rate": 7.246519009858513e-06, + "loss": 0.8237, + "num_tokens": 45575808626.0, + "step": 10904 + }, + { + "epoch": 1.2959001782531194, + "grad_norm": 0.3038800515352799, + "learning_rate": 7.244944927325934e-06, + "loss": 0.7863, + "num_tokens": 45579998528.0, + "step": 10905 + }, + { + "epoch": 1.2960190136660725, + "grad_norm": 0.29458973890767365, + "learning_rate": 7.243370983855744e-06, + "loss": 0.8552, + "num_tokens": 45584186057.0, + "step": 10906 + }, + { + "epoch": 1.2961378490790256, + "grad_norm": 0.3051937123614404, + "learning_rate": 7.241797179506227e-06, + "loss": 0.7901, + "num_tokens": 45588342982.0, + "step": 10907 + }, + { + "epoch": 1.2962566844919787, + "grad_norm": 0.27805171304227255, + "learning_rate": 7.240223514335665e-06, + "loss": 0.8031, + "num_tokens": 45592507203.0, + "step": 10908 + }, + { + "epoch": 1.2963755199049316, + "grad_norm": 0.2942551068495878, + "learning_rate": 7.23864998840234e-06, + "loss": 0.7898, + "num_tokens": 45596695778.0, + "step": 10909 + }, + { + "epoch": 1.2964943553178847, + "grad_norm": 0.30103707253963424, + "learning_rate": 7.237076601764523e-06, + "loss": 0.7934, + "num_tokens": 45600883824.0, + "step": 10910 + }, + { + "epoch": 1.2966131907308378, + "grad_norm": 0.2994125849158749, + "learning_rate": 7.23550335448048e-06, + "loss": 0.8033, + "num_tokens": 45605073903.0, + "step": 10911 + }, + { + "epoch": 1.296732026143791, + "grad_norm": 0.28426322055998215, + "learning_rate": 7.233930246608478e-06, + "loss": 0.8222, + "num_tokens": 45609254690.0, + "step": 10912 + }, + { + "epoch": 1.2968508615567438, + "grad_norm": 0.3104006442036397, + "learning_rate": 7.232357278206774e-06, + "loss": 0.8442, + "num_tokens": 45613412255.0, + "step": 10913 + }, + { + "epoch": 1.2969696969696969, + "grad_norm": 0.2735052013735325, + "learning_rate": 7.230784449333617e-06, + "loss": 0.8185, + "num_tokens": 45617601881.0, + "step": 10914 + }, + { + "epoch": 1.29708853238265, + "grad_norm": 0.29262358499744184, + "learning_rate": 7.229211760047255e-06, + "loss": 0.8328, + "num_tokens": 45621772916.0, + "step": 10915 + }, + { + "epoch": 1.297207367795603, + "grad_norm": 0.2726387967386291, + "learning_rate": 7.227639210405933e-06, + "loss": 0.8115, + "num_tokens": 45625961985.0, + "step": 10916 + }, + { + "epoch": 1.2973262032085562, + "grad_norm": 0.2893014351361578, + "learning_rate": 7.226066800467883e-06, + "loss": 0.7952, + "num_tokens": 45630150291.0, + "step": 10917 + }, + { + "epoch": 1.2974450386215093, + "grad_norm": 0.2805195179341996, + "learning_rate": 7.2244945302913415e-06, + "loss": 0.8425, + "num_tokens": 45634308534.0, + "step": 10918 + }, + { + "epoch": 1.2975638740344624, + "grad_norm": 0.26751499496540115, + "learning_rate": 7.222922399934531e-06, + "loss": 0.8235, + "num_tokens": 45638498832.0, + "step": 10919 + }, + { + "epoch": 1.2976827094474153, + "grad_norm": 0.288402881849572, + "learning_rate": 7.221350409455677e-06, + "loss": 0.776, + "num_tokens": 45642678160.0, + "step": 10920 + }, + { + "epoch": 1.2978015448603684, + "grad_norm": 0.27450952010055396, + "learning_rate": 7.219778558912991e-06, + "loss": 0.8026, + "num_tokens": 45646842282.0, + "step": 10921 + }, + { + "epoch": 1.2979203802733215, + "grad_norm": 0.3014491980066265, + "learning_rate": 7.218206848364686e-06, + "loss": 0.7805, + "num_tokens": 45651006760.0, + "step": 10922 + }, + { + "epoch": 1.2980392156862746, + "grad_norm": 0.2869022609631239, + "learning_rate": 7.216635277868971e-06, + "loss": 0.8134, + "num_tokens": 45655149757.0, + "step": 10923 + }, + { + "epoch": 1.2981580510992274, + "grad_norm": 0.3314989151036789, + "learning_rate": 7.215063847484041e-06, + "loss": 0.8165, + "num_tokens": 45659308441.0, + "step": 10924 + }, + { + "epoch": 1.2982768865121805, + "grad_norm": 0.2694230082028582, + "learning_rate": 7.213492557268098e-06, + "loss": 0.7826, + "num_tokens": 45663497534.0, + "step": 10925 + }, + { + "epoch": 1.2983957219251336, + "grad_norm": 0.29647528728430733, + "learning_rate": 7.211921407279325e-06, + "loss": 0.8093, + "num_tokens": 45667685800.0, + "step": 10926 + }, + { + "epoch": 1.2985145573380867, + "grad_norm": 0.2688873260078566, + "learning_rate": 7.2103503975759045e-06, + "loss": 0.8139, + "num_tokens": 45671874330.0, + "step": 10927 + }, + { + "epoch": 1.2986333927510398, + "grad_norm": 0.2836968540807309, + "learning_rate": 7.208779528216028e-06, + "loss": 0.7933, + "num_tokens": 45676062717.0, + "step": 10928 + }, + { + "epoch": 1.298752228163993, + "grad_norm": 0.30907055425361185, + "learning_rate": 7.207208799257868e-06, + "loss": 0.8377, + "num_tokens": 45680213753.0, + "step": 10929 + }, + { + "epoch": 1.298871063576946, + "grad_norm": 0.2868662280252345, + "learning_rate": 7.205638210759585e-06, + "loss": 0.7927, + "num_tokens": 45684397699.0, + "step": 10930 + }, + { + "epoch": 1.298989898989899, + "grad_norm": 0.27782475154776437, + "learning_rate": 7.204067762779346e-06, + "loss": 0.8257, + "num_tokens": 45688586166.0, + "step": 10931 + }, + { + "epoch": 1.299108734402852, + "grad_norm": 0.3121729518368324, + "learning_rate": 7.202497455375317e-06, + "loss": 0.8155, + "num_tokens": 45692770681.0, + "step": 10932 + }, + { + "epoch": 1.2992275698158051, + "grad_norm": 0.33080737157498485, + "learning_rate": 7.200927288605639e-06, + "loss": 0.7894, + "num_tokens": 45696961048.0, + "step": 10933 + }, + { + "epoch": 1.2993464052287582, + "grad_norm": 0.29475124867946595, + "learning_rate": 7.199357262528477e-06, + "loss": 0.7762, + "num_tokens": 45701151427.0, + "step": 10934 + }, + { + "epoch": 1.299465240641711, + "grad_norm": 0.30610150090304244, + "learning_rate": 7.197787377201962e-06, + "loss": 0.8238, + "num_tokens": 45705341120.0, + "step": 10935 + }, + { + "epoch": 1.2995840760546642, + "grad_norm": 0.2960066301275036, + "learning_rate": 7.196217632684237e-06, + "loss": 0.8186, + "num_tokens": 45709524749.0, + "step": 10936 + }, + { + "epoch": 1.2997029114676173, + "grad_norm": 0.29749841565867224, + "learning_rate": 7.194648029033433e-06, + "loss": 0.8286, + "num_tokens": 45713693892.0, + "step": 10937 + }, + { + "epoch": 1.2998217468805704, + "grad_norm": 0.29290872311712585, + "learning_rate": 7.193078566307678e-06, + "loss": 0.8239, + "num_tokens": 45717884340.0, + "step": 10938 + }, + { + "epoch": 1.2999405822935235, + "grad_norm": 0.2922464438908613, + "learning_rate": 7.191509244565098e-06, + "loss": 0.7924, + "num_tokens": 45722073446.0, + "step": 10939 + }, + { + "epoch": 1.3000594177064766, + "grad_norm": 0.29483892846604226, + "learning_rate": 7.189940063863805e-06, + "loss": 0.8355, + "num_tokens": 45726244110.0, + "step": 10940 + }, + { + "epoch": 1.3001782531194297, + "grad_norm": 0.29236653927631323, + "learning_rate": 7.188371024261913e-06, + "loss": 0.8396, + "num_tokens": 45730376966.0, + "step": 10941 + }, + { + "epoch": 1.3002970885323826, + "grad_norm": 0.26553415362881944, + "learning_rate": 7.186802125817531e-06, + "loss": 0.8167, + "num_tokens": 45734566240.0, + "step": 10942 + }, + { + "epoch": 1.3004159239453357, + "grad_norm": 0.3354322537387792, + "learning_rate": 7.18523336858876e-06, + "loss": 0.8303, + "num_tokens": 45738755389.0, + "step": 10943 + }, + { + "epoch": 1.3005347593582888, + "grad_norm": 0.3096627711685396, + "learning_rate": 7.1836647526336945e-06, + "loss": 0.7871, + "num_tokens": 45742917057.0, + "step": 10944 + }, + { + "epoch": 1.3006535947712419, + "grad_norm": 0.2940059468148815, + "learning_rate": 7.182096278010424e-06, + "loss": 0.8547, + "num_tokens": 45747105874.0, + "step": 10945 + }, + { + "epoch": 1.300772430184195, + "grad_norm": 0.332796087648539, + "learning_rate": 7.180527944777041e-06, + "loss": 0.7843, + "num_tokens": 45751296159.0, + "step": 10946 + }, + { + "epoch": 1.3008912655971479, + "grad_norm": 0.2895568867170018, + "learning_rate": 7.178959752991619e-06, + "loss": 0.8329, + "num_tokens": 45755484759.0, + "step": 10947 + }, + { + "epoch": 1.301010101010101, + "grad_norm": 0.32010120188592694, + "learning_rate": 7.177391702712238e-06, + "loss": 0.7766, + "num_tokens": 45759674670.0, + "step": 10948 + }, + { + "epoch": 1.301128936423054, + "grad_norm": 0.29584005125489643, + "learning_rate": 7.175823793996966e-06, + "loss": 0.8091, + "num_tokens": 45763864164.0, + "step": 10949 + }, + { + "epoch": 1.3012477718360071, + "grad_norm": 0.32402206624272606, + "learning_rate": 7.174256026903869e-06, + "loss": 0.8036, + "num_tokens": 45768027829.0, + "step": 10950 + }, + { + "epoch": 1.3013666072489602, + "grad_norm": 0.272052803566575, + "learning_rate": 7.17268840149101e-06, + "loss": 0.8506, + "num_tokens": 45772181766.0, + "step": 10951 + }, + { + "epoch": 1.3014854426619133, + "grad_norm": 0.3457240227537816, + "learning_rate": 7.171120917816434e-06, + "loss": 0.7921, + "num_tokens": 45776371719.0, + "step": 10952 + }, + { + "epoch": 1.3016042780748662, + "grad_norm": 0.27320246065661785, + "learning_rate": 7.1695535759382e-06, + "loss": 0.8253, + "num_tokens": 45780561477.0, + "step": 10953 + }, + { + "epoch": 1.3017231134878193, + "grad_norm": 0.3154815249596651, + "learning_rate": 7.167986375914347e-06, + "loss": 0.8177, + "num_tokens": 45784751135.0, + "step": 10954 + }, + { + "epoch": 1.3018419489007724, + "grad_norm": 0.3040691743003497, + "learning_rate": 7.166419317802913e-06, + "loss": 0.8092, + "num_tokens": 45788909009.0, + "step": 10955 + }, + { + "epoch": 1.3019607843137255, + "grad_norm": 0.2865227194315885, + "learning_rate": 7.1648524016619395e-06, + "loss": 0.8375, + "num_tokens": 45793078065.0, + "step": 10956 + }, + { + "epoch": 1.3020796197266786, + "grad_norm": 0.2684087190818088, + "learning_rate": 7.163285627549443e-06, + "loss": 0.8132, + "num_tokens": 45797267222.0, + "step": 10957 + }, + { + "epoch": 1.3021984551396315, + "grad_norm": 0.2941345087747009, + "learning_rate": 7.161718995523452e-06, + "loss": 0.8016, + "num_tokens": 45801440902.0, + "step": 10958 + }, + { + "epoch": 1.3023172905525846, + "grad_norm": 0.28580577085136716, + "learning_rate": 7.160152505641984e-06, + "loss": 0.785, + "num_tokens": 45805630358.0, + "step": 10959 + }, + { + "epoch": 1.3024361259655377, + "grad_norm": 0.28066280411264954, + "learning_rate": 7.158586157963052e-06, + "loss": 0.7954, + "num_tokens": 45809820071.0, + "step": 10960 + }, + { + "epoch": 1.3025549613784908, + "grad_norm": 0.29024018592489514, + "learning_rate": 7.157019952544668e-06, + "loss": 0.8713, + "num_tokens": 45814008407.0, + "step": 10961 + }, + { + "epoch": 1.302673796791444, + "grad_norm": 0.282301999757956, + "learning_rate": 7.1554538894448256e-06, + "loss": 0.8562, + "num_tokens": 45818197836.0, + "step": 10962 + }, + { + "epoch": 1.302792632204397, + "grad_norm": 0.2763599035040421, + "learning_rate": 7.153887968721523e-06, + "loss": 0.816, + "num_tokens": 45822374400.0, + "step": 10963 + }, + { + "epoch": 1.30291146761735, + "grad_norm": 0.28756914282525126, + "learning_rate": 7.15232219043275e-06, + "loss": 0.7676, + "num_tokens": 45826563969.0, + "step": 10964 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.32167225145495504, + "learning_rate": 7.150756554636498e-06, + "loss": 0.812, + "num_tokens": 45830753983.0, + "step": 10965 + }, + { + "epoch": 1.303149138443256, + "grad_norm": 0.3075809537502223, + "learning_rate": 7.149191061390748e-06, + "loss": 0.8328, + "num_tokens": 45834872930.0, + "step": 10966 + }, + { + "epoch": 1.3032679738562092, + "grad_norm": 0.2845446634443348, + "learning_rate": 7.147625710753472e-06, + "loss": 0.8669, + "num_tokens": 45839061304.0, + "step": 10967 + }, + { + "epoch": 1.3033868092691623, + "grad_norm": 0.28016992198301366, + "learning_rate": 7.146060502782639e-06, + "loss": 0.8078, + "num_tokens": 45843250269.0, + "step": 10968 + }, + { + "epoch": 1.3035056446821152, + "grad_norm": 0.2901673840891659, + "learning_rate": 7.144495437536216e-06, + "loss": 0.7762, + "num_tokens": 45847440123.0, + "step": 10969 + }, + { + "epoch": 1.3036244800950683, + "grad_norm": 0.28379725422130564, + "learning_rate": 7.142930515072164e-06, + "loss": 0.8366, + "num_tokens": 45851629242.0, + "step": 10970 + }, + { + "epoch": 1.3037433155080214, + "grad_norm": 0.30828248478491305, + "learning_rate": 7.141365735448435e-06, + "loss": 0.7781, + "num_tokens": 45855819518.0, + "step": 10971 + }, + { + "epoch": 1.3038621509209745, + "grad_norm": 0.28776613475394014, + "learning_rate": 7.13980109872298e-06, + "loss": 0.8375, + "num_tokens": 45859992797.0, + "step": 10972 + }, + { + "epoch": 1.3039809863339276, + "grad_norm": 0.3112454897050743, + "learning_rate": 7.138236604953743e-06, + "loss": 0.7618, + "num_tokens": 45864183434.0, + "step": 10973 + }, + { + "epoch": 1.3040998217468807, + "grad_norm": 0.3095992372893845, + "learning_rate": 7.13667225419866e-06, + "loss": 0.8204, + "num_tokens": 45868341115.0, + "step": 10974 + }, + { + "epoch": 1.3042186571598338, + "grad_norm": 0.26543880945287424, + "learning_rate": 7.135108046515664e-06, + "loss": 0.8092, + "num_tokens": 45872529060.0, + "step": 10975 + }, + { + "epoch": 1.3043374925727866, + "grad_norm": 0.3086298284670499, + "learning_rate": 7.133543981962687e-06, + "loss": 0.8126, + "num_tokens": 45876689451.0, + "step": 10976 + }, + { + "epoch": 1.3044563279857397, + "grad_norm": 0.2829505825211704, + "learning_rate": 7.131980060597654e-06, + "loss": 0.7872, + "num_tokens": 45880853581.0, + "step": 10977 + }, + { + "epoch": 1.3045751633986928, + "grad_norm": 0.32345040421455346, + "learning_rate": 7.130416282478468e-06, + "loss": 0.8593, + "num_tokens": 45885040712.0, + "step": 10978 + }, + { + "epoch": 1.304693998811646, + "grad_norm": 0.28516544185182485, + "learning_rate": 7.1288526476630535e-06, + "loss": 0.8111, + "num_tokens": 45889216557.0, + "step": 10979 + }, + { + "epoch": 1.3048128342245988, + "grad_norm": 0.3116702698015308, + "learning_rate": 7.127289156209316e-06, + "loss": 0.7835, + "num_tokens": 45893405879.0, + "step": 10980 + }, + { + "epoch": 1.304931669637552, + "grad_norm": 0.28147727400253514, + "learning_rate": 7.125725808175154e-06, + "loss": 0.8799, + "num_tokens": 45897594203.0, + "step": 10981 + }, + { + "epoch": 1.305050505050505, + "grad_norm": 0.30673630952146524, + "learning_rate": 7.124162603618467e-06, + "loss": 0.8125, + "num_tokens": 45901771356.0, + "step": 10982 + }, + { + "epoch": 1.3051693404634581, + "grad_norm": 0.2926678347101508, + "learning_rate": 7.122599542597142e-06, + "loss": 0.7931, + "num_tokens": 45905954898.0, + "step": 10983 + }, + { + "epoch": 1.3052881758764112, + "grad_norm": 0.2808153951363909, + "learning_rate": 7.12103662516906e-06, + "loss": 0.8065, + "num_tokens": 45910143281.0, + "step": 10984 + }, + { + "epoch": 1.3054070112893643, + "grad_norm": 0.29534610073535233, + "learning_rate": 7.119473851392111e-06, + "loss": 0.8193, + "num_tokens": 45914333311.0, + "step": 10985 + }, + { + "epoch": 1.3055258467023174, + "grad_norm": 0.2883902827038459, + "learning_rate": 7.117911221324167e-06, + "loss": 0.8305, + "num_tokens": 45918522950.0, + "step": 10986 + }, + { + "epoch": 1.3056446821152703, + "grad_norm": 0.28916607314193393, + "learning_rate": 7.1163487350231e-06, + "loss": 0.8402, + "num_tokens": 45922709896.0, + "step": 10987 + }, + { + "epoch": 1.3057635175282234, + "grad_norm": 0.26869446614292036, + "learning_rate": 7.114786392546766e-06, + "loss": 0.7624, + "num_tokens": 45926899475.0, + "step": 10988 + }, + { + "epoch": 1.3058823529411765, + "grad_norm": 0.30532127042184787, + "learning_rate": 7.1132241939530255e-06, + "loss": 0.8429, + "num_tokens": 45931072393.0, + "step": 10989 + }, + { + "epoch": 1.3060011883541296, + "grad_norm": 0.2678735468000605, + "learning_rate": 7.111662139299735e-06, + "loss": 0.8145, + "num_tokens": 45935263159.0, + "step": 10990 + }, + { + "epoch": 1.3061200237670825, + "grad_norm": 0.2905209499696503, + "learning_rate": 7.110100228644745e-06, + "loss": 0.8265, + "num_tokens": 45939441094.0, + "step": 10991 + }, + { + "epoch": 1.3062388591800356, + "grad_norm": 0.2943149148782982, + "learning_rate": 7.108538462045897e-06, + "loss": 0.803, + "num_tokens": 45943612836.0, + "step": 10992 + }, + { + "epoch": 1.3063576945929887, + "grad_norm": 0.3227187670742338, + "learning_rate": 7.106976839561024e-06, + "loss": 0.8027, + "num_tokens": 45947801605.0, + "step": 10993 + }, + { + "epoch": 1.3064765300059418, + "grad_norm": 0.28956524955887114, + "learning_rate": 7.105415361247963e-06, + "loss": 0.8158, + "num_tokens": 45951991465.0, + "step": 10994 + }, + { + "epoch": 1.3065953654188949, + "grad_norm": 0.30169587558391053, + "learning_rate": 7.103854027164539e-06, + "loss": 0.7991, + "num_tokens": 45956180328.0, + "step": 10995 + }, + { + "epoch": 1.306714200831848, + "grad_norm": 0.31688379659835636, + "learning_rate": 7.102292837368572e-06, + "loss": 0.7894, + "num_tokens": 45960359881.0, + "step": 10996 + }, + { + "epoch": 1.306833036244801, + "grad_norm": 0.29052991797735694, + "learning_rate": 7.100731791917879e-06, + "loss": 0.8083, + "num_tokens": 45964547288.0, + "step": 10997 + }, + { + "epoch": 1.306951871657754, + "grad_norm": 0.3407638079379774, + "learning_rate": 7.099170890870273e-06, + "loss": 0.8203, + "num_tokens": 45968729955.0, + "step": 10998 + }, + { + "epoch": 1.307070707070707, + "grad_norm": 0.28277386081319567, + "learning_rate": 7.097610134283557e-06, + "loss": 0.8249, + "num_tokens": 45972904032.0, + "step": 10999 + }, + { + "epoch": 1.3071895424836601, + "grad_norm": 0.3278293710263739, + "learning_rate": 7.096049522215534e-06, + "loss": 0.7993, + "num_tokens": 45977094388.0, + "step": 11000 + }, + { + "epoch": 1.3073083778966132, + "grad_norm": 0.3061181548255329, + "learning_rate": 7.0944890547239945e-06, + "loss": 0.8469, + "num_tokens": 45981283854.0, + "step": 11001 + }, + { + "epoch": 1.3074272133095661, + "grad_norm": 0.26582695934523576, + "learning_rate": 7.092928731866733e-06, + "loss": 0.8201, + "num_tokens": 45985459291.0, + "step": 11002 + }, + { + "epoch": 1.3075460487225192, + "grad_norm": 0.2796189088359062, + "learning_rate": 7.091368553701527e-06, + "loss": 0.8141, + "num_tokens": 45989647354.0, + "step": 11003 + }, + { + "epoch": 1.3076648841354723, + "grad_norm": 0.29185477929699444, + "learning_rate": 7.0898085202861615e-06, + "loss": 0.8152, + "num_tokens": 45993767504.0, + "step": 11004 + }, + { + "epoch": 1.3077837195484254, + "grad_norm": 0.267902200162261, + "learning_rate": 7.088248631678405e-06, + "loss": 0.829, + "num_tokens": 45997954833.0, + "step": 11005 + }, + { + "epoch": 1.3079025549613785, + "grad_norm": 0.33338049342460874, + "learning_rate": 7.086688887936031e-06, + "loss": 0.8186, + "num_tokens": 46002120815.0, + "step": 11006 + }, + { + "epoch": 1.3080213903743316, + "grad_norm": 0.2723955776138593, + "learning_rate": 7.0851292891167965e-06, + "loss": 0.815, + "num_tokens": 46006250057.0, + "step": 11007 + }, + { + "epoch": 1.3081402257872847, + "grad_norm": 0.3145267039970784, + "learning_rate": 7.083569835278466e-06, + "loss": 0.8314, + "num_tokens": 46010415406.0, + "step": 11008 + }, + { + "epoch": 1.3082590612002376, + "grad_norm": 0.28021082277991494, + "learning_rate": 7.082010526478776e-06, + "loss": 0.8234, + "num_tokens": 46014584745.0, + "step": 11009 + }, + { + "epoch": 1.3083778966131907, + "grad_norm": 0.29108225709905344, + "learning_rate": 7.080451362775489e-06, + "loss": 0.8199, + "num_tokens": 46018773672.0, + "step": 11010 + }, + { + "epoch": 1.3084967320261438, + "grad_norm": 0.27123833979342904, + "learning_rate": 7.07889234422634e-06, + "loss": 0.8401, + "num_tokens": 46022963313.0, + "step": 11011 + }, + { + "epoch": 1.308615567439097, + "grad_norm": 0.29463422315727456, + "learning_rate": 7.077333470889065e-06, + "loss": 0.8274, + "num_tokens": 46027133230.0, + "step": 11012 + }, + { + "epoch": 1.3087344028520498, + "grad_norm": 0.2672028458549278, + "learning_rate": 7.075774742821398e-06, + "loss": 0.7906, + "num_tokens": 46031276544.0, + "step": 11013 + }, + { + "epoch": 1.3088532382650029, + "grad_norm": 0.3046993131787734, + "learning_rate": 7.074216160081058e-06, + "loss": 0.8155, + "num_tokens": 46035465926.0, + "step": 11014 + }, + { + "epoch": 1.308972073677956, + "grad_norm": 0.2912949006783124, + "learning_rate": 7.072657722725764e-06, + "loss": 0.8442, + "num_tokens": 46039654971.0, + "step": 11015 + }, + { + "epoch": 1.309090909090909, + "grad_norm": 0.2909510229343891, + "learning_rate": 7.071099430813231e-06, + "loss": 0.826, + "num_tokens": 46043839771.0, + "step": 11016 + }, + { + "epoch": 1.3092097445038622, + "grad_norm": 0.29702579332018664, + "learning_rate": 7.0695412844011735e-06, + "loss": 0.797, + "num_tokens": 46048028609.0, + "step": 11017 + }, + { + "epoch": 1.3093285799168153, + "grad_norm": 0.2764895686101262, + "learning_rate": 7.067983283547297e-06, + "loss": 0.8276, + "num_tokens": 46052217492.0, + "step": 11018 + }, + { + "epoch": 1.3094474153297684, + "grad_norm": 0.2844647496286468, + "learning_rate": 7.066425428309289e-06, + "loss": 0.8079, + "num_tokens": 46056358621.0, + "step": 11019 + }, + { + "epoch": 1.3095662507427213, + "grad_norm": 0.28611231124193603, + "learning_rate": 7.064867718744846e-06, + "loss": 0.7982, + "num_tokens": 46060546342.0, + "step": 11020 + }, + { + "epoch": 1.3096850861556744, + "grad_norm": 0.30666968101071806, + "learning_rate": 7.063310154911658e-06, + "loss": 0.8226, + "num_tokens": 46064734145.0, + "step": 11021 + }, + { + "epoch": 1.3098039215686275, + "grad_norm": 0.2947708704349179, + "learning_rate": 7.0617527368674e-06, + "loss": 0.8466, + "num_tokens": 46068918454.0, + "step": 11022 + }, + { + "epoch": 1.3099227569815806, + "grad_norm": 0.3174819872181726, + "learning_rate": 7.060195464669764e-06, + "loss": 0.8429, + "num_tokens": 46073033132.0, + "step": 11023 + }, + { + "epoch": 1.3100415923945334, + "grad_norm": 0.29131714205758386, + "learning_rate": 7.058638338376405e-06, + "loss": 0.8334, + "num_tokens": 46077221332.0, + "step": 11024 + }, + { + "epoch": 1.3101604278074865, + "grad_norm": 0.29213943724310804, + "learning_rate": 7.0570813580449945e-06, + "loss": 0.8392, + "num_tokens": 46081410178.0, + "step": 11025 + }, + { + "epoch": 1.3102792632204396, + "grad_norm": 0.2958950348644542, + "learning_rate": 7.055524523733191e-06, + "loss": 0.8466, + "num_tokens": 46085575591.0, + "step": 11026 + }, + { + "epoch": 1.3103980986333927, + "grad_norm": 0.29164842521570084, + "learning_rate": 7.053967835498652e-06, + "loss": 0.8293, + "num_tokens": 46089764893.0, + "step": 11027 + }, + { + "epoch": 1.3105169340463458, + "grad_norm": 0.30061266384597707, + "learning_rate": 7.0524112933990285e-06, + "loss": 0.8218, + "num_tokens": 46093956196.0, + "step": 11028 + }, + { + "epoch": 1.310635769459299, + "grad_norm": 0.3214952438559609, + "learning_rate": 7.050854897491959e-06, + "loss": 0.8119, + "num_tokens": 46098141674.0, + "step": 11029 + }, + { + "epoch": 1.310754604872252, + "grad_norm": 0.2638774918946159, + "learning_rate": 7.049298647835086e-06, + "loss": 0.8107, + "num_tokens": 46102312287.0, + "step": 11030 + }, + { + "epoch": 1.310873440285205, + "grad_norm": 0.273774123454623, + "learning_rate": 7.047742544486042e-06, + "loss": 0.8416, + "num_tokens": 46106500829.0, + "step": 11031 + }, + { + "epoch": 1.310992275698158, + "grad_norm": 0.2679220790095493, + "learning_rate": 7.0461865875024525e-06, + "loss": 0.8049, + "num_tokens": 46110683943.0, + "step": 11032 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 0.30378241736124406, + "learning_rate": 7.044630776941946e-06, + "loss": 0.7892, + "num_tokens": 46114853243.0, + "step": 11033 + }, + { + "epoch": 1.3112299465240642, + "grad_norm": 0.2970536373065255, + "learning_rate": 7.043075112862135e-06, + "loss": 0.8225, + "num_tokens": 46119028630.0, + "step": 11034 + }, + { + "epoch": 1.311348781937017, + "grad_norm": 0.28600558990387737, + "learning_rate": 7.041519595320625e-06, + "loss": 0.8347, + "num_tokens": 46123209793.0, + "step": 11035 + }, + { + "epoch": 1.3114676173499702, + "grad_norm": 0.28337534404092374, + "learning_rate": 7.039964224375033e-06, + "loss": 0.837, + "num_tokens": 46127399060.0, + "step": 11036 + }, + { + "epoch": 1.3115864527629233, + "grad_norm": 0.3171363919985874, + "learning_rate": 7.038409000082953e-06, + "loss": 0.8445, + "num_tokens": 46131588718.0, + "step": 11037 + }, + { + "epoch": 1.3117052881758764, + "grad_norm": 0.28616607889063206, + "learning_rate": 7.036853922501985e-06, + "loss": 0.8258, + "num_tokens": 46135775530.0, + "step": 11038 + }, + { + "epoch": 1.3118241235888295, + "grad_norm": 0.2877085307978932, + "learning_rate": 7.0352989916897165e-06, + "loss": 0.8109, + "num_tokens": 46139945554.0, + "step": 11039 + }, + { + "epoch": 1.3119429590017826, + "grad_norm": 0.264265801855324, + "learning_rate": 7.03374420770373e-06, + "loss": 0.8059, + "num_tokens": 46144135345.0, + "step": 11040 + }, + { + "epoch": 1.3120617944147357, + "grad_norm": 0.28992445239201736, + "learning_rate": 7.032189570601602e-06, + "loss": 0.8126, + "num_tokens": 46148324242.0, + "step": 11041 + }, + { + "epoch": 1.3121806298276886, + "grad_norm": 0.2981816418749121, + "learning_rate": 7.030635080440913e-06, + "loss": 0.8358, + "num_tokens": 46152514820.0, + "step": 11042 + }, + { + "epoch": 1.3122994652406417, + "grad_norm": 0.2660416667797747, + "learning_rate": 7.029080737279226e-06, + "loss": 0.8343, + "num_tokens": 46156704225.0, + "step": 11043 + }, + { + "epoch": 1.3124183006535948, + "grad_norm": 0.28071173528143745, + "learning_rate": 7.02752654117411e-06, + "loss": 0.8278, + "num_tokens": 46160875604.0, + "step": 11044 + }, + { + "epoch": 1.3125371360665479, + "grad_norm": 0.2572336535382716, + "learning_rate": 7.025972492183113e-06, + "loss": 0.8109, + "num_tokens": 46165064489.0, + "step": 11045 + }, + { + "epoch": 1.312655971479501, + "grad_norm": 0.28135539316427005, + "learning_rate": 7.024418590363793e-06, + "loss": 0.8175, + "num_tokens": 46169253231.0, + "step": 11046 + }, + { + "epoch": 1.3127748068924538, + "grad_norm": 0.2787493315615528, + "learning_rate": 7.022864835773691e-06, + "loss": 0.8041, + "num_tokens": 46173418853.0, + "step": 11047 + }, + { + "epoch": 1.312893642305407, + "grad_norm": 0.2671584450297924, + "learning_rate": 7.021311228470354e-06, + "loss": 0.8356, + "num_tokens": 46177606189.0, + "step": 11048 + }, + { + "epoch": 1.31301247771836, + "grad_norm": 0.2581835104483627, + "learning_rate": 7.019757768511318e-06, + "loss": 0.832, + "num_tokens": 46181794918.0, + "step": 11049 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 0.27845385392025324, + "learning_rate": 7.018204455954105e-06, + "loss": 0.8254, + "num_tokens": 46185983508.0, + "step": 11050 + }, + { + "epoch": 1.3132501485442662, + "grad_norm": 0.3207186258433129, + "learning_rate": 7.016651290856242e-06, + "loss": 0.8178, + "num_tokens": 46190169847.0, + "step": 11051 + }, + { + "epoch": 1.3133689839572193, + "grad_norm": 0.27108968091106456, + "learning_rate": 7.0150982732752535e-06, + "loss": 0.7906, + "num_tokens": 46194358903.0, + "step": 11052 + }, + { + "epoch": 1.3134878193701724, + "grad_norm": 0.31859433604782467, + "learning_rate": 7.013545403268647e-06, + "loss": 0.7847, + "num_tokens": 46198548509.0, + "step": 11053 + }, + { + "epoch": 1.3136066547831253, + "grad_norm": 0.30978095072828643, + "learning_rate": 7.011992680893933e-06, + "loss": 0.8312, + "num_tokens": 46202737673.0, + "step": 11054 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 0.25626523293807524, + "learning_rate": 7.0104401062086135e-06, + "loss": 0.7777, + "num_tokens": 46206926814.0, + "step": 11055 + }, + { + "epoch": 1.3138443256090315, + "grad_norm": 0.2834621365768738, + "learning_rate": 7.008887679270186e-06, + "loss": 0.8561, + "num_tokens": 46211114899.0, + "step": 11056 + }, + { + "epoch": 1.3139631610219846, + "grad_norm": 0.31033884527806155, + "learning_rate": 7.007335400136144e-06, + "loss": 0.836, + "num_tokens": 46215302268.0, + "step": 11057 + }, + { + "epoch": 1.3140819964349375, + "grad_norm": 0.27012768315507235, + "learning_rate": 7.005783268863971e-06, + "loss": 0.8065, + "num_tokens": 46219440086.0, + "step": 11058 + }, + { + "epoch": 1.3142008318478906, + "grad_norm": 0.28468805963522525, + "learning_rate": 7.004231285511147e-06, + "loss": 0.7982, + "num_tokens": 46223618347.0, + "step": 11059 + }, + { + "epoch": 1.3143196672608437, + "grad_norm": 0.27524988637457193, + "learning_rate": 7.002679450135148e-06, + "loss": 0.7948, + "num_tokens": 46227806480.0, + "step": 11060 + }, + { + "epoch": 1.3144385026737968, + "grad_norm": 0.28310639282595856, + "learning_rate": 7.001127762793447e-06, + "loss": 0.8226, + "num_tokens": 46231973181.0, + "step": 11061 + }, + { + "epoch": 1.31455733808675, + "grad_norm": 0.3052731999841713, + "learning_rate": 6.999576223543503e-06, + "loss": 0.8016, + "num_tokens": 46236162634.0, + "step": 11062 + }, + { + "epoch": 1.314676173499703, + "grad_norm": 0.26373537452728174, + "learning_rate": 6.998024832442778e-06, + "loss": 0.8225, + "num_tokens": 46240353498.0, + "step": 11063 + }, + { + "epoch": 1.314795008912656, + "grad_norm": 0.3179481698486249, + "learning_rate": 6.9964735895487245e-06, + "loss": 0.8279, + "num_tokens": 46244534507.0, + "step": 11064 + }, + { + "epoch": 1.314913844325609, + "grad_norm": 0.28426243949845303, + "learning_rate": 6.9949224949187896e-06, + "loss": 0.8288, + "num_tokens": 46248721855.0, + "step": 11065 + }, + { + "epoch": 1.315032679738562, + "grad_norm": 0.26763483431287477, + "learning_rate": 6.993371548610421e-06, + "loss": 0.8197, + "num_tokens": 46252896406.0, + "step": 11066 + }, + { + "epoch": 1.3151515151515152, + "grad_norm": 0.32661215632369306, + "learning_rate": 6.991820750681042e-06, + "loss": 0.8299, + "num_tokens": 46257084623.0, + "step": 11067 + }, + { + "epoch": 1.3152703505644683, + "grad_norm": 0.2792053679347586, + "learning_rate": 6.990270101188097e-06, + "loss": 0.8163, + "num_tokens": 46261251595.0, + "step": 11068 + }, + { + "epoch": 1.3153891859774212, + "grad_norm": 0.29585307030678976, + "learning_rate": 6.988719600189008e-06, + "loss": 0.8196, + "num_tokens": 46265422266.0, + "step": 11069 + }, + { + "epoch": 1.3155080213903743, + "grad_norm": 0.2728145179615217, + "learning_rate": 6.987169247741193e-06, + "loss": 0.8261, + "num_tokens": 46269611928.0, + "step": 11070 + }, + { + "epoch": 1.3156268568033274, + "grad_norm": 0.29115064276125063, + "learning_rate": 6.985619043902074e-06, + "loss": 0.8174, + "num_tokens": 46273801996.0, + "step": 11071 + }, + { + "epoch": 1.3157456922162805, + "grad_norm": 0.2896909494724146, + "learning_rate": 6.984068988729049e-06, + "loss": 0.8119, + "num_tokens": 46277990203.0, + "step": 11072 + }, + { + "epoch": 1.3158645276292336, + "grad_norm": 0.2698419031242779, + "learning_rate": 6.982519082279526e-06, + "loss": 0.7877, + "num_tokens": 46282151788.0, + "step": 11073 + }, + { + "epoch": 1.3159833630421867, + "grad_norm": 0.2781786167308577, + "learning_rate": 6.980969324610908e-06, + "loss": 0.8158, + "num_tokens": 46286329475.0, + "step": 11074 + }, + { + "epoch": 1.3161021984551398, + "grad_norm": 0.2864339127270782, + "learning_rate": 6.979419715780584e-06, + "loss": 0.8238, + "num_tokens": 46290519896.0, + "step": 11075 + }, + { + "epoch": 1.3162210338680926, + "grad_norm": 0.2785137068836877, + "learning_rate": 6.977870255845945e-06, + "loss": 0.8198, + "num_tokens": 46294707049.0, + "step": 11076 + }, + { + "epoch": 1.3163398692810457, + "grad_norm": 0.2818679336386098, + "learning_rate": 6.976320944864367e-06, + "loss": 0.8293, + "num_tokens": 46298895791.0, + "step": 11077 + }, + { + "epoch": 1.3164587046939988, + "grad_norm": 0.2797366713501001, + "learning_rate": 6.9747717828932285e-06, + "loss": 0.8148, + "num_tokens": 46303080969.0, + "step": 11078 + }, + { + "epoch": 1.316577540106952, + "grad_norm": 0.2775712390665972, + "learning_rate": 6.973222769989897e-06, + "loss": 0.8379, + "num_tokens": 46307269697.0, + "step": 11079 + }, + { + "epoch": 1.3166963755199048, + "grad_norm": 0.2748153973311709, + "learning_rate": 6.971673906211744e-06, + "loss": 0.769, + "num_tokens": 46311445962.0, + "step": 11080 + }, + { + "epoch": 1.316815210932858, + "grad_norm": 0.27534060491259327, + "learning_rate": 6.970125191616131e-06, + "loss": 0.8601, + "num_tokens": 46315635249.0, + "step": 11081 + }, + { + "epoch": 1.316934046345811, + "grad_norm": 0.28287034966264274, + "learning_rate": 6.9685766262604035e-06, + "loss": 0.8303, + "num_tokens": 46319824005.0, + "step": 11082 + }, + { + "epoch": 1.3170528817587641, + "grad_norm": 0.2791867669220905, + "learning_rate": 6.967028210201914e-06, + "loss": 0.8176, + "num_tokens": 46324012231.0, + "step": 11083 + }, + { + "epoch": 1.3171717171717172, + "grad_norm": 0.2910631138722453, + "learning_rate": 6.965479943498006e-06, + "loss": 0.8527, + "num_tokens": 46328153194.0, + "step": 11084 + }, + { + "epoch": 1.3172905525846703, + "grad_norm": 0.3099473194626323, + "learning_rate": 6.963931826206015e-06, + "loss": 0.8153, + "num_tokens": 46332342867.0, + "step": 11085 + }, + { + "epoch": 1.3174093879976234, + "grad_norm": 0.29125927552852543, + "learning_rate": 6.962383858383277e-06, + "loss": 0.797, + "num_tokens": 46336531805.0, + "step": 11086 + }, + { + "epoch": 1.3175282234105763, + "grad_norm": 0.3077069951774067, + "learning_rate": 6.960836040087115e-06, + "loss": 0.8242, + "num_tokens": 46340720783.0, + "step": 11087 + }, + { + "epoch": 1.3176470588235294, + "grad_norm": 0.31453885136589516, + "learning_rate": 6.959288371374852e-06, + "loss": 0.8312, + "num_tokens": 46344909894.0, + "step": 11088 + }, + { + "epoch": 1.3177658942364825, + "grad_norm": 0.2890566681743488, + "learning_rate": 6.957740852303802e-06, + "loss": 0.784, + "num_tokens": 46349098984.0, + "step": 11089 + }, + { + "epoch": 1.3178847296494356, + "grad_norm": 0.3394162850700896, + "learning_rate": 6.956193482931275e-06, + "loss": 0.793, + "num_tokens": 46353289094.0, + "step": 11090 + }, + { + "epoch": 1.3180035650623885, + "grad_norm": 0.28184163972185516, + "learning_rate": 6.954646263314577e-06, + "loss": 0.8131, + "num_tokens": 46357451196.0, + "step": 11091 + }, + { + "epoch": 1.3181224004753416, + "grad_norm": 0.38890094490367905, + "learning_rate": 6.9530991935110034e-06, + "loss": 0.8229, + "num_tokens": 46361615146.0, + "step": 11092 + }, + { + "epoch": 1.3182412358882947, + "grad_norm": 0.3141547903344015, + "learning_rate": 6.951552273577851e-06, + "loss": 0.803, + "num_tokens": 46365804393.0, + "step": 11093 + }, + { + "epoch": 1.3183600713012478, + "grad_norm": 0.3267746864549637, + "learning_rate": 6.950005503572407e-06, + "loss": 0.8066, + "num_tokens": 46369985246.0, + "step": 11094 + }, + { + "epoch": 1.3184789067142009, + "grad_norm": 0.29229561849664776, + "learning_rate": 6.948458883551952e-06, + "loss": 0.8031, + "num_tokens": 46374174293.0, + "step": 11095 + }, + { + "epoch": 1.318597742127154, + "grad_norm": 0.3147412323505003, + "learning_rate": 6.946912413573762e-06, + "loss": 0.8293, + "num_tokens": 46378363065.0, + "step": 11096 + }, + { + "epoch": 1.318716577540107, + "grad_norm": 0.2877135724623209, + "learning_rate": 6.9453660936951135e-06, + "loss": 0.8208, + "num_tokens": 46382528778.0, + "step": 11097 + }, + { + "epoch": 1.31883541295306, + "grad_norm": 0.2791091121185602, + "learning_rate": 6.943819923973259e-06, + "loss": 0.7862, + "num_tokens": 46386717405.0, + "step": 11098 + }, + { + "epoch": 1.318954248366013, + "grad_norm": 0.2825193265523204, + "learning_rate": 6.942273904465473e-06, + "loss": 0.7744, + "num_tokens": 46390906223.0, + "step": 11099 + }, + { + "epoch": 1.3190730837789661, + "grad_norm": 0.29059674254448087, + "learning_rate": 6.940728035229003e-06, + "loss": 0.8305, + "num_tokens": 46395096080.0, + "step": 11100 + }, + { + "epoch": 1.3191919191919192, + "grad_norm": 0.2984835559456906, + "learning_rate": 6.939182316321097e-06, + "loss": 0.7702, + "num_tokens": 46399286096.0, + "step": 11101 + }, + { + "epoch": 1.3193107546048721, + "grad_norm": 0.2945812251925424, + "learning_rate": 6.937636747799006e-06, + "loss": 0.8222, + "num_tokens": 46403475275.0, + "step": 11102 + }, + { + "epoch": 1.3194295900178252, + "grad_norm": 0.30679733545492943, + "learning_rate": 6.936091329719956e-06, + "loss": 0.8165, + "num_tokens": 46407665939.0, + "step": 11103 + }, + { + "epoch": 1.3195484254307783, + "grad_norm": 0.3124742751833219, + "learning_rate": 6.934546062141185e-06, + "loss": 0.8237, + "num_tokens": 46411855265.0, + "step": 11104 + }, + { + "epoch": 1.3196672608437314, + "grad_norm": 0.3071435601643704, + "learning_rate": 6.933000945119914e-06, + "loss": 0.8228, + "num_tokens": 46416030671.0, + "step": 11105 + }, + { + "epoch": 1.3197860962566845, + "grad_norm": 0.3290145691850842, + "learning_rate": 6.931455978713375e-06, + "loss": 0.7984, + "num_tokens": 46420190918.0, + "step": 11106 + }, + { + "epoch": 1.3199049316696376, + "grad_norm": 0.3252707070006413, + "learning_rate": 6.929911162978779e-06, + "loss": 0.8745, + "num_tokens": 46424364133.0, + "step": 11107 + }, + { + "epoch": 1.3200237670825907, + "grad_norm": 0.3027332539794466, + "learning_rate": 6.928366497973331e-06, + "loss": 0.8471, + "num_tokens": 46428508195.0, + "step": 11108 + }, + { + "epoch": 1.3201426024955436, + "grad_norm": 0.30293318768630784, + "learning_rate": 6.926821983754238e-06, + "loss": 0.807, + "num_tokens": 46432696790.0, + "step": 11109 + }, + { + "epoch": 1.3202614379084967, + "grad_norm": 0.30616524743380646, + "learning_rate": 6.925277620378698e-06, + "loss": 0.8283, + "num_tokens": 46436885271.0, + "step": 11110 + }, + { + "epoch": 1.3203802733214498, + "grad_norm": 0.28704270405001814, + "learning_rate": 6.923733407903902e-06, + "loss": 0.8277, + "num_tokens": 46441040977.0, + "step": 11111 + }, + { + "epoch": 1.320499108734403, + "grad_norm": 0.31000344541203656, + "learning_rate": 6.922189346387047e-06, + "loss": 0.8069, + "num_tokens": 46445207453.0, + "step": 11112 + }, + { + "epoch": 1.3206179441473558, + "grad_norm": 0.27380128328220105, + "learning_rate": 6.9206454358853045e-06, + "loss": 0.8071, + "num_tokens": 46449343702.0, + "step": 11113 + }, + { + "epoch": 1.3207367795603089, + "grad_norm": 0.35056801528417825, + "learning_rate": 6.919101676455855e-06, + "loss": 0.8472, + "num_tokens": 46453509595.0, + "step": 11114 + }, + { + "epoch": 1.320855614973262, + "grad_norm": 0.2699539233097659, + "learning_rate": 6.917558068155867e-06, + "loss": 0.8139, + "num_tokens": 46457637605.0, + "step": 11115 + }, + { + "epoch": 1.320974450386215, + "grad_norm": 0.31377583562839795, + "learning_rate": 6.916014611042507e-06, + "loss": 0.8369, + "num_tokens": 46461798020.0, + "step": 11116 + }, + { + "epoch": 1.3210932857991682, + "grad_norm": 0.28290262585843434, + "learning_rate": 6.914471305172934e-06, + "loss": 0.8212, + "num_tokens": 46465987105.0, + "step": 11117 + }, + { + "epoch": 1.3212121212121213, + "grad_norm": 0.2941383806478497, + "learning_rate": 6.9129281506043014e-06, + "loss": 0.8294, + "num_tokens": 46470175673.0, + "step": 11118 + }, + { + "epoch": 1.3213309566250744, + "grad_norm": 0.2782526582844557, + "learning_rate": 6.9113851473937585e-06, + "loss": 0.8277, + "num_tokens": 46474345156.0, + "step": 11119 + }, + { + "epoch": 1.3214497920380273, + "grad_norm": 0.28245943603392226, + "learning_rate": 6.909842295598449e-06, + "loss": 0.8127, + "num_tokens": 46478527789.0, + "step": 11120 + }, + { + "epoch": 1.3215686274509804, + "grad_norm": 0.3023635749972368, + "learning_rate": 6.9082995952755075e-06, + "loss": 0.8294, + "num_tokens": 46482705348.0, + "step": 11121 + }, + { + "epoch": 1.3216874628639335, + "grad_norm": 0.28336358357489777, + "learning_rate": 6.906757046482067e-06, + "loss": 0.8412, + "num_tokens": 46486895141.0, + "step": 11122 + }, + { + "epoch": 1.3218062982768866, + "grad_norm": 0.3207994399333931, + "learning_rate": 6.905214649275256e-06, + "loss": 0.842, + "num_tokens": 46491083629.0, + "step": 11123 + }, + { + "epoch": 1.3219251336898394, + "grad_norm": 0.2736562790805401, + "learning_rate": 6.903672403712182e-06, + "loss": 0.823, + "num_tokens": 46495274978.0, + "step": 11124 + }, + { + "epoch": 1.3220439691027925, + "grad_norm": 0.31871902738119934, + "learning_rate": 6.902130309849975e-06, + "loss": 0.8523, + "num_tokens": 46499465153.0, + "step": 11125 + }, + { + "epoch": 1.3221628045157456, + "grad_norm": 0.26739132548483446, + "learning_rate": 6.9005883677457374e-06, + "loss": 0.7736, + "num_tokens": 46503656169.0, + "step": 11126 + }, + { + "epoch": 1.3222816399286987, + "grad_norm": 0.3444776256909395, + "learning_rate": 6.899046577456572e-06, + "loss": 0.8348, + "num_tokens": 46507845539.0, + "step": 11127 + }, + { + "epoch": 1.3224004753416518, + "grad_norm": 0.2744455166320802, + "learning_rate": 6.897504939039582e-06, + "loss": 0.8161, + "num_tokens": 46511994453.0, + "step": 11128 + }, + { + "epoch": 1.322519310754605, + "grad_norm": 0.29954007330824867, + "learning_rate": 6.895963452551852e-06, + "loss": 0.7836, + "num_tokens": 46516183340.0, + "step": 11129 + }, + { + "epoch": 1.322638146167558, + "grad_norm": 0.2600945081756635, + "learning_rate": 6.894422118050466e-06, + "loss": 0.8255, + "num_tokens": 46520372269.0, + "step": 11130 + }, + { + "epoch": 1.322756981580511, + "grad_norm": 0.29531809605898707, + "learning_rate": 6.892880935592515e-06, + "loss": 0.8205, + "num_tokens": 46524562754.0, + "step": 11131 + }, + { + "epoch": 1.322875816993464, + "grad_norm": 0.2950094107368042, + "learning_rate": 6.89133990523507e-06, + "loss": 0.791, + "num_tokens": 46528700092.0, + "step": 11132 + }, + { + "epoch": 1.3229946524064171, + "grad_norm": 0.27296887054967534, + "learning_rate": 6.889799027035202e-06, + "loss": 0.8167, + "num_tokens": 46532888423.0, + "step": 11133 + }, + { + "epoch": 1.3231134878193702, + "grad_norm": 0.30095152456328056, + "learning_rate": 6.888258301049971e-06, + "loss": 0.7964, + "num_tokens": 46537077540.0, + "step": 11134 + }, + { + "epoch": 1.3232323232323233, + "grad_norm": 0.28107362012030807, + "learning_rate": 6.886717727336436e-06, + "loss": 0.8408, + "num_tokens": 46541238269.0, + "step": 11135 + }, + { + "epoch": 1.3233511586452762, + "grad_norm": 0.2858498713599876, + "learning_rate": 6.8851773059516516e-06, + "loss": 0.7787, + "num_tokens": 46545401809.0, + "step": 11136 + }, + { + "epoch": 1.3234699940582293, + "grad_norm": 0.2728948679003612, + "learning_rate": 6.88363703695266e-06, + "loss": 0.8225, + "num_tokens": 46549590414.0, + "step": 11137 + }, + { + "epoch": 1.3235888294711824, + "grad_norm": 0.26014163295146686, + "learning_rate": 6.8820969203965135e-06, + "loss": 0.8434, + "num_tokens": 46553738963.0, + "step": 11138 + }, + { + "epoch": 1.3237076648841355, + "grad_norm": 0.31165812893097644, + "learning_rate": 6.880556956340239e-06, + "loss": 0.7991, + "num_tokens": 46557929094.0, + "step": 11139 + }, + { + "epoch": 1.3238265002970886, + "grad_norm": 0.280497618342499, + "learning_rate": 6.8790171448408674e-06, + "loss": 0.816, + "num_tokens": 46562086626.0, + "step": 11140 + }, + { + "epoch": 1.3239453357100417, + "grad_norm": 0.33020791248549736, + "learning_rate": 6.877477485955425e-06, + "loss": 0.8427, + "num_tokens": 46566261623.0, + "step": 11141 + }, + { + "epoch": 1.3240641711229946, + "grad_norm": 0.2741095608102561, + "learning_rate": 6.87593797974093e-06, + "loss": 0.8023, + "num_tokens": 46570426199.0, + "step": 11142 + }, + { + "epoch": 1.3241830065359477, + "grad_norm": 0.28493725579259754, + "learning_rate": 6.874398626254395e-06, + "loss": 0.8116, + "num_tokens": 46574615917.0, + "step": 11143 + }, + { + "epoch": 1.3243018419489008, + "grad_norm": 0.2739174967629906, + "learning_rate": 6.872859425552827e-06, + "loss": 0.7975, + "num_tokens": 46578797561.0, + "step": 11144 + }, + { + "epoch": 1.3244206773618539, + "grad_norm": 0.28090751118487195, + "learning_rate": 6.871320377693228e-06, + "loss": 0.8136, + "num_tokens": 46582979902.0, + "step": 11145 + }, + { + "epoch": 1.324539512774807, + "grad_norm": 0.2816849887272721, + "learning_rate": 6.869781482732597e-06, + "loss": 0.8162, + "num_tokens": 46587169849.0, + "step": 11146 + }, + { + "epoch": 1.3246583481877598, + "grad_norm": 0.269120771352978, + "learning_rate": 6.868242740727922e-06, + "loss": 0.8346, + "num_tokens": 46591359474.0, + "step": 11147 + }, + { + "epoch": 1.324777183600713, + "grad_norm": 0.300764558516415, + "learning_rate": 6.866704151736188e-06, + "loss": 0.8313, + "num_tokens": 46595548185.0, + "step": 11148 + }, + { + "epoch": 1.324896019013666, + "grad_norm": 0.2713229165373344, + "learning_rate": 6.865165715814372e-06, + "loss": 0.8023, + "num_tokens": 46599701018.0, + "step": 11149 + }, + { + "epoch": 1.3250148544266191, + "grad_norm": 0.31403425481011105, + "learning_rate": 6.863627433019453e-06, + "loss": 0.8036, + "num_tokens": 46603889031.0, + "step": 11150 + }, + { + "epoch": 1.3251336898395722, + "grad_norm": 0.28500652508285546, + "learning_rate": 6.862089303408393e-06, + "loss": 0.8176, + "num_tokens": 46608078397.0, + "step": 11151 + }, + { + "epoch": 1.3252525252525253, + "grad_norm": 0.287383121481397, + "learning_rate": 6.860551327038158e-06, + "loss": 0.8355, + "num_tokens": 46612267198.0, + "step": 11152 + }, + { + "epoch": 1.3253713606654784, + "grad_norm": 0.3146719994795194, + "learning_rate": 6.859013503965703e-06, + "loss": 0.8415, + "num_tokens": 46616434889.0, + "step": 11153 + }, + { + "epoch": 1.3254901960784313, + "grad_norm": 0.26414507278568594, + "learning_rate": 6.857475834247982e-06, + "loss": 0.7861, + "num_tokens": 46620622208.0, + "step": 11154 + }, + { + "epoch": 1.3256090314913844, + "grad_norm": 0.27399112345200305, + "learning_rate": 6.855938317941934e-06, + "loss": 0.7899, + "num_tokens": 46624810626.0, + "step": 11155 + }, + { + "epoch": 1.3257278669043375, + "grad_norm": 0.2716930257951865, + "learning_rate": 6.854400955104498e-06, + "loss": 0.7984, + "num_tokens": 46628998216.0, + "step": 11156 + }, + { + "epoch": 1.3258467023172906, + "grad_norm": 0.271649626199424, + "learning_rate": 6.852863745792615e-06, + "loss": 0.791, + "num_tokens": 46633187052.0, + "step": 11157 + }, + { + "epoch": 1.3259655377302435, + "grad_norm": 0.3072654755242992, + "learning_rate": 6.851326690063209e-06, + "loss": 0.7987, + "num_tokens": 46637375245.0, + "step": 11158 + }, + { + "epoch": 1.3260843731431966, + "grad_norm": 0.2854014098196737, + "learning_rate": 6.8497897879732065e-06, + "loss": 0.813, + "num_tokens": 46641564857.0, + "step": 11159 + }, + { + "epoch": 1.3262032085561497, + "grad_norm": 0.2830566856138131, + "learning_rate": 6.848253039579515e-06, + "loss": 0.8318, + "num_tokens": 46645725719.0, + "step": 11160 + }, + { + "epoch": 1.3263220439691028, + "grad_norm": 0.28795788042707077, + "learning_rate": 6.8467164449390546e-06, + "loss": 0.8038, + "num_tokens": 46649916453.0, + "step": 11161 + }, + { + "epoch": 1.326440879382056, + "grad_norm": 0.2729287873424059, + "learning_rate": 6.84518000410872e-06, + "loss": 0.7955, + "num_tokens": 46654104410.0, + "step": 11162 + }, + { + "epoch": 1.326559714795009, + "grad_norm": 0.29643873990657804, + "learning_rate": 6.843643717145423e-06, + "loss": 0.8184, + "num_tokens": 46658268116.0, + "step": 11163 + }, + { + "epoch": 1.326678550207962, + "grad_norm": 0.2794756212426938, + "learning_rate": 6.842107584106055e-06, + "loss": 0.7913, + "num_tokens": 46662444074.0, + "step": 11164 + }, + { + "epoch": 1.326797385620915, + "grad_norm": 0.2803131470223942, + "learning_rate": 6.840571605047497e-06, + "loss": 0.8183, + "num_tokens": 46666634309.0, + "step": 11165 + }, + { + "epoch": 1.326916221033868, + "grad_norm": 0.2818077885037606, + "learning_rate": 6.839035780026637e-06, + "loss": 0.8506, + "num_tokens": 46670822328.0, + "step": 11166 + }, + { + "epoch": 1.3270350564468212, + "grad_norm": 0.3001570799798278, + "learning_rate": 6.83750010910035e-06, + "loss": 0.8146, + "num_tokens": 46675010885.0, + "step": 11167 + }, + { + "epoch": 1.3271538918597743, + "grad_norm": 0.25565681803904083, + "learning_rate": 6.8359645923255035e-06, + "loss": 0.803, + "num_tokens": 46679198877.0, + "step": 11168 + }, + { + "epoch": 1.3272727272727272, + "grad_norm": 0.33345772314726757, + "learning_rate": 6.834429229758976e-06, + "loss": 0.8065, + "num_tokens": 46683364601.0, + "step": 11169 + }, + { + "epoch": 1.3273915626856803, + "grad_norm": 0.26840404718382854, + "learning_rate": 6.832894021457613e-06, + "loss": 0.7938, + "num_tokens": 46687555598.0, + "step": 11170 + }, + { + "epoch": 1.3275103980986334, + "grad_norm": 0.29418228236059774, + "learning_rate": 6.831358967478274e-06, + "loss": 0.7788, + "num_tokens": 46691745888.0, + "step": 11171 + }, + { + "epoch": 1.3276292335115865, + "grad_norm": 0.30896208397347485, + "learning_rate": 6.829824067877809e-06, + "loss": 0.8419, + "num_tokens": 46695915315.0, + "step": 11172 + }, + { + "epoch": 1.3277480689245396, + "grad_norm": 0.28483985282156055, + "learning_rate": 6.8282893227130556e-06, + "loss": 0.8404, + "num_tokens": 46700097309.0, + "step": 11173 + }, + { + "epoch": 1.3278669043374927, + "grad_norm": 0.28677491891694284, + "learning_rate": 6.826754732040854e-06, + "loss": 0.8186, + "num_tokens": 46704286809.0, + "step": 11174 + }, + { + "epoch": 1.3279857397504458, + "grad_norm": 0.3028056997959525, + "learning_rate": 6.825220295918035e-06, + "loss": 0.8095, + "num_tokens": 46708476148.0, + "step": 11175 + }, + { + "epoch": 1.3281045751633986, + "grad_norm": 0.2909439123208654, + "learning_rate": 6.823686014401425e-06, + "loss": 0.8023, + "num_tokens": 46712665775.0, + "step": 11176 + }, + { + "epoch": 1.3282234105763517, + "grad_norm": 0.310963251958002, + "learning_rate": 6.822151887547841e-06, + "loss": 0.8095, + "num_tokens": 46716830502.0, + "step": 11177 + }, + { + "epoch": 1.3283422459893048, + "grad_norm": 0.27258077437990463, + "learning_rate": 6.820617915414099e-06, + "loss": 0.811, + "num_tokens": 46720978577.0, + "step": 11178 + }, + { + "epoch": 1.328461081402258, + "grad_norm": 0.27303979061487504, + "learning_rate": 6.819084098057007e-06, + "loss": 0.8298, + "num_tokens": 46725126018.0, + "step": 11179 + }, + { + "epoch": 1.3285799168152108, + "grad_norm": 0.27891825117150393, + "learning_rate": 6.817550435533364e-06, + "loss": 0.8219, + "num_tokens": 46729293052.0, + "step": 11180 + }, + { + "epoch": 1.328698752228164, + "grad_norm": 0.263934962889642, + "learning_rate": 6.816016927899972e-06, + "loss": 0.8354, + "num_tokens": 46733480187.0, + "step": 11181 + }, + { + "epoch": 1.328817587641117, + "grad_norm": 0.28880863048585376, + "learning_rate": 6.81448357521362e-06, + "loss": 0.8355, + "num_tokens": 46737669931.0, + "step": 11182 + }, + { + "epoch": 1.3289364230540701, + "grad_norm": 0.27517031609879605, + "learning_rate": 6.812950377531091e-06, + "loss": 0.8089, + "num_tokens": 46741844661.0, + "step": 11183 + }, + { + "epoch": 1.3290552584670232, + "grad_norm": 0.2887527773045765, + "learning_rate": 6.811417334909167e-06, + "loss": 0.8519, + "num_tokens": 46746034006.0, + "step": 11184 + }, + { + "epoch": 1.3291740938799763, + "grad_norm": 0.2866174907989232, + "learning_rate": 6.809884447404621e-06, + "loss": 0.814, + "num_tokens": 46750211081.0, + "step": 11185 + }, + { + "epoch": 1.3292929292929294, + "grad_norm": 0.27559747297526416, + "learning_rate": 6.808351715074225e-06, + "loss": 0.8024, + "num_tokens": 46754399055.0, + "step": 11186 + }, + { + "epoch": 1.3294117647058823, + "grad_norm": 0.2914863182094898, + "learning_rate": 6.806819137974729e-06, + "loss": 0.8697, + "num_tokens": 46758588144.0, + "step": 11187 + }, + { + "epoch": 1.3295306001188354, + "grad_norm": 0.27703157517253457, + "learning_rate": 6.805286716162904e-06, + "loss": 0.8039, + "num_tokens": 46762777897.0, + "step": 11188 + }, + { + "epoch": 1.3296494355317885, + "grad_norm": 0.3078903330849251, + "learning_rate": 6.803754449695492e-06, + "loss": 0.8357, + "num_tokens": 46766952783.0, + "step": 11189 + }, + { + "epoch": 1.3297682709447416, + "grad_norm": 0.3031317956156776, + "learning_rate": 6.802222338629243e-06, + "loss": 0.8283, + "num_tokens": 46771141170.0, + "step": 11190 + }, + { + "epoch": 1.3298871063576945, + "grad_norm": 0.3226001044930436, + "learning_rate": 6.800690383020896e-06, + "loss": 0.8219, + "num_tokens": 46775289483.0, + "step": 11191 + }, + { + "epoch": 1.3300059417706476, + "grad_norm": 0.2893911042279628, + "learning_rate": 6.799158582927177e-06, + "loss": 0.8319, + "num_tokens": 46779471098.0, + "step": 11192 + }, + { + "epoch": 1.3301247771836007, + "grad_norm": 0.33244971123534367, + "learning_rate": 6.797626938404821e-06, + "loss": 0.7779, + "num_tokens": 46783661087.0, + "step": 11193 + }, + { + "epoch": 1.3302436125965538, + "grad_norm": 0.2692142308254899, + "learning_rate": 6.796095449510545e-06, + "loss": 0.8024, + "num_tokens": 46787849992.0, + "step": 11194 + }, + { + "epoch": 1.3303624480095069, + "grad_norm": 0.3479313147366016, + "learning_rate": 6.79456411630107e-06, + "loss": 0.8351, + "num_tokens": 46792027578.0, + "step": 11195 + }, + { + "epoch": 1.33048128342246, + "grad_norm": 0.28632552344551004, + "learning_rate": 6.7930329388331084e-06, + "loss": 0.8384, + "num_tokens": 46796190592.0, + "step": 11196 + }, + { + "epoch": 1.330600118835413, + "grad_norm": 0.3198103964040438, + "learning_rate": 6.791501917163358e-06, + "loss": 0.8345, + "num_tokens": 46800380168.0, + "step": 11197 + }, + { + "epoch": 1.330718954248366, + "grad_norm": 0.30080183275329275, + "learning_rate": 6.789971051348523e-06, + "loss": 0.8229, + "num_tokens": 46804545139.0, + "step": 11198 + }, + { + "epoch": 1.330837789661319, + "grad_norm": 0.28434217634160214, + "learning_rate": 6.788440341445293e-06, + "loss": 0.7858, + "num_tokens": 46808735815.0, + "step": 11199 + }, + { + "epoch": 1.3309566250742721, + "grad_norm": 0.2842072036346117, + "learning_rate": 6.786909787510352e-06, + "loss": 0.8413, + "num_tokens": 46812915074.0, + "step": 11200 + }, + { + "epoch": 1.3310754604872252, + "grad_norm": 0.32429721449776305, + "learning_rate": 6.785379389600393e-06, + "loss": 0.8144, + "num_tokens": 46817105476.0, + "step": 11201 + }, + { + "epoch": 1.3311942959001781, + "grad_norm": 0.2951002191271755, + "learning_rate": 6.783849147772084e-06, + "loss": 0.8346, + "num_tokens": 46821282344.0, + "step": 11202 + }, + { + "epoch": 1.3313131313131312, + "grad_norm": 0.35004865434476257, + "learning_rate": 6.782319062082095e-06, + "loss": 0.8153, + "num_tokens": 46825446301.0, + "step": 11203 + }, + { + "epoch": 1.3314319667260843, + "grad_norm": 0.27741483438771225, + "learning_rate": 6.7807891325870916e-06, + "loss": 0.8591, + "num_tokens": 46829632672.0, + "step": 11204 + }, + { + "epoch": 1.3315508021390374, + "grad_norm": 0.317693651939263, + "learning_rate": 6.779259359343732e-06, + "loss": 0.8034, + "num_tokens": 46833823177.0, + "step": 11205 + }, + { + "epoch": 1.3316696375519905, + "grad_norm": 0.3011101691525971, + "learning_rate": 6.77772974240867e-06, + "loss": 0.8139, + "num_tokens": 46837980071.0, + "step": 11206 + }, + { + "epoch": 1.3317884729649436, + "grad_norm": 0.3263918158329054, + "learning_rate": 6.77620028183855e-06, + "loss": 0.8088, + "num_tokens": 46842169842.0, + "step": 11207 + }, + { + "epoch": 1.3319073083778967, + "grad_norm": 0.29529951392298187, + "learning_rate": 6.774670977690014e-06, + "loss": 0.8577, + "num_tokens": 46846343373.0, + "step": 11208 + }, + { + "epoch": 1.3320261437908496, + "grad_norm": 0.2879305764107243, + "learning_rate": 6.773141830019699e-06, + "loss": 0.8411, + "num_tokens": 46850522959.0, + "step": 11209 + }, + { + "epoch": 1.3321449792038027, + "grad_norm": 0.3032082476201738, + "learning_rate": 6.771612838884234e-06, + "loss": 0.8403, + "num_tokens": 46854711925.0, + "step": 11210 + }, + { + "epoch": 1.3322638146167558, + "grad_norm": 0.26339002132752487, + "learning_rate": 6.77008400434024e-06, + "loss": 0.8099, + "num_tokens": 46858888979.0, + "step": 11211 + }, + { + "epoch": 1.332382650029709, + "grad_norm": 0.30106667038962787, + "learning_rate": 6.768555326444344e-06, + "loss": 0.845, + "num_tokens": 46863079401.0, + "step": 11212 + }, + { + "epoch": 1.3325014854426618, + "grad_norm": 0.28714002319459203, + "learning_rate": 6.76702680525314e-06, + "loss": 0.8334, + "num_tokens": 46867256001.0, + "step": 11213 + }, + { + "epoch": 1.3326203208556149, + "grad_norm": 0.2853664987299066, + "learning_rate": 6.765498440823252e-06, + "loss": 0.7759, + "num_tokens": 46871444903.0, + "step": 11214 + }, + { + "epoch": 1.332739156268568, + "grad_norm": 0.3075783521006999, + "learning_rate": 6.7639702332112744e-06, + "loss": 0.8117, + "num_tokens": 46875598062.0, + "step": 11215 + }, + { + "epoch": 1.332857991681521, + "grad_norm": 0.29000689602210467, + "learning_rate": 6.762442182473799e-06, + "loss": 0.845, + "num_tokens": 46879787901.0, + "step": 11216 + }, + { + "epoch": 1.3329768270944742, + "grad_norm": 0.3015614307163158, + "learning_rate": 6.760914288667421e-06, + "loss": 0.7971, + "num_tokens": 46883977021.0, + "step": 11217 + }, + { + "epoch": 1.3330956625074273, + "grad_norm": 0.29453542310851727, + "learning_rate": 6.759386551848717e-06, + "loss": 0.8125, + "num_tokens": 46888166636.0, + "step": 11218 + }, + { + "epoch": 1.3332144979203804, + "grad_norm": 0.3080516508443111, + "learning_rate": 6.757858972074263e-06, + "loss": 0.8092, + "num_tokens": 46892348199.0, + "step": 11219 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.30512342242281637, + "learning_rate": 6.756331549400636e-06, + "loss": 0.8477, + "num_tokens": 46896538094.0, + "step": 11220 + }, + { + "epoch": 1.3334521687462864, + "grad_norm": 0.31951409193873637, + "learning_rate": 6.7548042838844045e-06, + "loss": 0.789, + "num_tokens": 46900725540.0, + "step": 11221 + }, + { + "epoch": 1.3335710041592395, + "grad_norm": 0.2848664991484293, + "learning_rate": 6.7532771755821235e-06, + "loss": 0.8095, + "num_tokens": 46904915940.0, + "step": 11222 + }, + { + "epoch": 1.3336898395721926, + "grad_norm": 0.30914257355810726, + "learning_rate": 6.751750224550344e-06, + "loss": 0.8053, + "num_tokens": 46909105392.0, + "step": 11223 + }, + { + "epoch": 1.3338086749851454, + "grad_norm": 0.2755457597309919, + "learning_rate": 6.750223430845619e-06, + "loss": 0.8379, + "num_tokens": 46913295349.0, + "step": 11224 + }, + { + "epoch": 1.3339275103980985, + "grad_norm": 0.2921444855184, + "learning_rate": 6.748696794524487e-06, + "loss": 0.8317, + "num_tokens": 46917484998.0, + "step": 11225 + }, + { + "epoch": 1.3340463458110516, + "grad_norm": 0.2744770836210916, + "learning_rate": 6.747170315643485e-06, + "loss": 0.7629, + "num_tokens": 46921660145.0, + "step": 11226 + }, + { + "epoch": 1.3341651812240047, + "grad_norm": 0.2830252933235466, + "learning_rate": 6.74564399425915e-06, + "loss": 0.8119, + "num_tokens": 46925849487.0, + "step": 11227 + }, + { + "epoch": 1.3342840166369578, + "grad_norm": 0.26455061718663353, + "learning_rate": 6.744117830428001e-06, + "loss": 0.7935, + "num_tokens": 46930039496.0, + "step": 11228 + }, + { + "epoch": 1.334402852049911, + "grad_norm": 0.29907137179760185, + "learning_rate": 6.742591824206559e-06, + "loss": 0.8444, + "num_tokens": 46934206146.0, + "step": 11229 + }, + { + "epoch": 1.334521687462864, + "grad_norm": 0.26993810639824206, + "learning_rate": 6.741065975651335e-06, + "loss": 0.819, + "num_tokens": 46938371286.0, + "step": 11230 + }, + { + "epoch": 1.334640522875817, + "grad_norm": 0.2859493151219402, + "learning_rate": 6.739540284818837e-06, + "loss": 0.8447, + "num_tokens": 46942560323.0, + "step": 11231 + }, + { + "epoch": 1.33475935828877, + "grad_norm": 0.31719584579176974, + "learning_rate": 6.738014751765568e-06, + "loss": 0.787, + "num_tokens": 46946749758.0, + "step": 11232 + }, + { + "epoch": 1.3348781937017231, + "grad_norm": 0.26185566840459856, + "learning_rate": 6.736489376548023e-06, + "loss": 0.8073, + "num_tokens": 46950917824.0, + "step": 11233 + }, + { + "epoch": 1.3349970291146762, + "grad_norm": 0.29870985649054865, + "learning_rate": 6.734964159222693e-06, + "loss": 0.8367, + "num_tokens": 46955107484.0, + "step": 11234 + }, + { + "epoch": 1.3351158645276293, + "grad_norm": 0.2564334876137959, + "learning_rate": 6.733439099846057e-06, + "loss": 0.8122, + "num_tokens": 46959290279.0, + "step": 11235 + }, + { + "epoch": 1.3352346999405822, + "grad_norm": 0.28143875039897387, + "learning_rate": 6.731914198474599e-06, + "loss": 0.8317, + "num_tokens": 46963479701.0, + "step": 11236 + }, + { + "epoch": 1.3353535353535353, + "grad_norm": 0.26384877664556267, + "learning_rate": 6.730389455164789e-06, + "loss": 0.8606, + "num_tokens": 46967667840.0, + "step": 11237 + }, + { + "epoch": 1.3354723707664884, + "grad_norm": 0.2664865511460966, + "learning_rate": 6.728864869973093e-06, + "loss": 0.8465, + "num_tokens": 46971846119.0, + "step": 11238 + }, + { + "epoch": 1.3355912061794415, + "grad_norm": 0.2661927800065847, + "learning_rate": 6.727340442955972e-06, + "loss": 0.8342, + "num_tokens": 46976034917.0, + "step": 11239 + }, + { + "epoch": 1.3357100415923946, + "grad_norm": 0.28291713722280953, + "learning_rate": 6.725816174169881e-06, + "loss": 0.7973, + "num_tokens": 46980191379.0, + "step": 11240 + }, + { + "epoch": 1.3358288770053477, + "grad_norm": 0.26731967063664397, + "learning_rate": 6.7242920636712675e-06, + "loss": 0.7993, + "num_tokens": 46984375462.0, + "step": 11241 + }, + { + "epoch": 1.3359477124183008, + "grad_norm": 0.2673307359678491, + "learning_rate": 6.722768111516575e-06, + "loss": 0.8466, + "num_tokens": 46988564741.0, + "step": 11242 + }, + { + "epoch": 1.3360665478312537, + "grad_norm": 0.26451542772107967, + "learning_rate": 6.721244317762247e-06, + "loss": 0.8373, + "num_tokens": 46992752656.0, + "step": 11243 + }, + { + "epoch": 1.3361853832442068, + "grad_norm": 0.26817929727240725, + "learning_rate": 6.719720682464704e-06, + "loss": 0.8168, + "num_tokens": 46996940011.0, + "step": 11244 + }, + { + "epoch": 1.3363042186571599, + "grad_norm": 0.3052996247214081, + "learning_rate": 6.718197205680373e-06, + "loss": 0.8256, + "num_tokens": 47001124145.0, + "step": 11245 + }, + { + "epoch": 1.336423054070113, + "grad_norm": 0.2604905365323677, + "learning_rate": 6.716673887465679e-06, + "loss": 0.8157, + "num_tokens": 47005313643.0, + "step": 11246 + }, + { + "epoch": 1.3365418894830658, + "grad_norm": 0.2605427387351676, + "learning_rate": 6.7151507278770325e-06, + "loss": 0.8124, + "num_tokens": 47009502086.0, + "step": 11247 + }, + { + "epoch": 1.336660724896019, + "grad_norm": 0.27928296702844907, + "learning_rate": 6.713627726970846e-06, + "loss": 0.8051, + "num_tokens": 47013691040.0, + "step": 11248 + }, + { + "epoch": 1.336779560308972, + "grad_norm": 0.266644206482086, + "learning_rate": 6.712104884803515e-06, + "loss": 0.7759, + "num_tokens": 47017879348.0, + "step": 11249 + }, + { + "epoch": 1.3368983957219251, + "grad_norm": 0.29205684430179196, + "learning_rate": 6.710582201431437e-06, + "loss": 0.8074, + "num_tokens": 47022068641.0, + "step": 11250 + }, + { + "epoch": 1.3370172311348782, + "grad_norm": 0.26601513939978505, + "learning_rate": 6.7090596769110006e-06, + "loss": 0.8525, + "num_tokens": 47026244270.0, + "step": 11251 + }, + { + "epoch": 1.3371360665478313, + "grad_norm": 0.3109665166546938, + "learning_rate": 6.7075373112985944e-06, + "loss": 0.8235, + "num_tokens": 47030402064.0, + "step": 11252 + }, + { + "epoch": 1.3372549019607844, + "grad_norm": 0.2856779159180345, + "learning_rate": 6.706015104650598e-06, + "loss": 0.838, + "num_tokens": 47034563053.0, + "step": 11253 + }, + { + "epoch": 1.3373737373737373, + "grad_norm": 0.2874433131128667, + "learning_rate": 6.704493057023378e-06, + "loss": 0.8022, + "num_tokens": 47038752374.0, + "step": 11254 + }, + { + "epoch": 1.3374925727866904, + "grad_norm": 0.2849511868580638, + "learning_rate": 6.702971168473303e-06, + "loss": 0.8008, + "num_tokens": 47042942629.0, + "step": 11255 + }, + { + "epoch": 1.3376114081996435, + "grad_norm": 0.2846978058182856, + "learning_rate": 6.701449439056735e-06, + "loss": 0.8218, + "num_tokens": 47047131965.0, + "step": 11256 + }, + { + "epoch": 1.3377302436125966, + "grad_norm": 0.2696064092299577, + "learning_rate": 6.699927868830025e-06, + "loss": 0.7998, + "num_tokens": 47051291547.0, + "step": 11257 + }, + { + "epoch": 1.3378490790255495, + "grad_norm": 0.29358767651121215, + "learning_rate": 6.698406457849531e-06, + "loss": 0.7799, + "num_tokens": 47055474319.0, + "step": 11258 + }, + { + "epoch": 1.3379679144385026, + "grad_norm": 0.28809234267484674, + "learning_rate": 6.696885206171586e-06, + "loss": 0.7941, + "num_tokens": 47059663358.0, + "step": 11259 + }, + { + "epoch": 1.3380867498514557, + "grad_norm": 0.27481138603936944, + "learning_rate": 6.695364113852532e-06, + "loss": 0.7948, + "num_tokens": 47063853774.0, + "step": 11260 + }, + { + "epoch": 1.3382055852644088, + "grad_norm": 0.2963049391475148, + "learning_rate": 6.693843180948698e-06, + "loss": 0.7841, + "num_tokens": 47068014693.0, + "step": 11261 + }, + { + "epoch": 1.338324420677362, + "grad_norm": 0.2821740941494923, + "learning_rate": 6.692322407516411e-06, + "loss": 0.8145, + "num_tokens": 47072203637.0, + "step": 11262 + }, + { + "epoch": 1.338443256090315, + "grad_norm": 0.28676709827802077, + "learning_rate": 6.690801793611991e-06, + "loss": 0.8255, + "num_tokens": 47076391898.0, + "step": 11263 + }, + { + "epoch": 1.338562091503268, + "grad_norm": 0.29636597165639206, + "learning_rate": 6.6892813392917485e-06, + "loss": 0.8089, + "num_tokens": 47080568997.0, + "step": 11264 + }, + { + "epoch": 1.338680926916221, + "grad_norm": 0.3009275185712489, + "learning_rate": 6.6877610446119935e-06, + "loss": 0.7928, + "num_tokens": 47084758130.0, + "step": 11265 + }, + { + "epoch": 1.338799762329174, + "grad_norm": 0.2599223237216056, + "learning_rate": 6.686240909629028e-06, + "loss": 0.8342, + "num_tokens": 47088946389.0, + "step": 11266 + }, + { + "epoch": 1.3389185977421272, + "grad_norm": 0.31521214431792394, + "learning_rate": 6.684720934399146e-06, + "loss": 0.8152, + "num_tokens": 47093135872.0, + "step": 11267 + }, + { + "epoch": 1.3390374331550803, + "grad_norm": 0.26923881118632187, + "learning_rate": 6.683201118978637e-06, + "loss": 0.8528, + "num_tokens": 47097297660.0, + "step": 11268 + }, + { + "epoch": 1.3391562685680332, + "grad_norm": 0.2763467503053058, + "learning_rate": 6.68168146342379e-06, + "loss": 0.802, + "num_tokens": 47101456509.0, + "step": 11269 + }, + { + "epoch": 1.3392751039809863, + "grad_norm": 0.3044426475998552, + "learning_rate": 6.6801619677908745e-06, + "loss": 0.8068, + "num_tokens": 47105647298.0, + "step": 11270 + }, + { + "epoch": 1.3393939393939394, + "grad_norm": 0.264039326269988, + "learning_rate": 6.678642632136168e-06, + "loss": 0.8354, + "num_tokens": 47109833044.0, + "step": 11271 + }, + { + "epoch": 1.3395127748068925, + "grad_norm": 0.27336511760598103, + "learning_rate": 6.6771234565159375e-06, + "loss": 0.7783, + "num_tokens": 47114023000.0, + "step": 11272 + }, + { + "epoch": 1.3396316102198456, + "grad_norm": 0.29128253615757044, + "learning_rate": 6.6756044409864386e-06, + "loss": 0.8532, + "num_tokens": 47118197799.0, + "step": 11273 + }, + { + "epoch": 1.3397504456327987, + "grad_norm": 0.26158379826147904, + "learning_rate": 6.6740855856039335e-06, + "loss": 0.8054, + "num_tokens": 47122386893.0, + "step": 11274 + }, + { + "epoch": 1.3398692810457518, + "grad_norm": 0.2804531701946332, + "learning_rate": 6.672566890424664e-06, + "loss": 0.8284, + "num_tokens": 47126577448.0, + "step": 11275 + }, + { + "epoch": 1.3399881164587046, + "grad_norm": 0.2830384579121013, + "learning_rate": 6.671048355504866e-06, + "loss": 0.8327, + "num_tokens": 47130766723.0, + "step": 11276 + }, + { + "epoch": 1.3401069518716577, + "grad_norm": 0.27436048203983693, + "learning_rate": 6.669529980900791e-06, + "loss": 0.8228, + "num_tokens": 47134941156.0, + "step": 11277 + }, + { + "epoch": 1.3402257872846108, + "grad_norm": 0.282637465801749, + "learning_rate": 6.6680117666686606e-06, + "loss": 0.8064, + "num_tokens": 47139129961.0, + "step": 11278 + }, + { + "epoch": 1.340344622697564, + "grad_norm": 0.2760221380119018, + "learning_rate": 6.666493712864706e-06, + "loss": 0.8276, + "num_tokens": 47143306757.0, + "step": 11279 + }, + { + "epoch": 1.3404634581105168, + "grad_norm": 0.2843260626945736, + "learning_rate": 6.664975819545136e-06, + "loss": 0.7738, + "num_tokens": 47147470283.0, + "step": 11280 + }, + { + "epoch": 1.34058229352347, + "grad_norm": 0.3093153340704913, + "learning_rate": 6.663458086766171e-06, + "loss": 0.8057, + "num_tokens": 47151661042.0, + "step": 11281 + }, + { + "epoch": 1.340701128936423, + "grad_norm": 0.26894614534157313, + "learning_rate": 6.661940514584013e-06, + "loss": 0.7621, + "num_tokens": 47155849999.0, + "step": 11282 + }, + { + "epoch": 1.3408199643493761, + "grad_norm": 0.32061751087978824, + "learning_rate": 6.660423103054863e-06, + "loss": 0.7599, + "num_tokens": 47160027398.0, + "step": 11283 + }, + { + "epoch": 1.3409387997623292, + "grad_norm": 0.29216016590620314, + "learning_rate": 6.658905852234925e-06, + "loss": 0.8039, + "num_tokens": 47164216415.0, + "step": 11284 + }, + { + "epoch": 1.3410576351752823, + "grad_norm": 0.24951798956865837, + "learning_rate": 6.657388762180375e-06, + "loss": 0.8363, + "num_tokens": 47168405455.0, + "step": 11285 + }, + { + "epoch": 1.3411764705882354, + "grad_norm": 0.28422791086087495, + "learning_rate": 6.655871832947405e-06, + "loss": 0.7983, + "num_tokens": 47172594199.0, + "step": 11286 + }, + { + "epoch": 1.3412953060011883, + "grad_norm": 0.27414940815886196, + "learning_rate": 6.654355064592186e-06, + "loss": 0.7837, + "num_tokens": 47176783450.0, + "step": 11287 + }, + { + "epoch": 1.3414141414141414, + "grad_norm": 0.29625769109328026, + "learning_rate": 6.652838457170894e-06, + "loss": 0.8024, + "num_tokens": 47180937342.0, + "step": 11288 + }, + { + "epoch": 1.3415329768270945, + "grad_norm": 0.31422057652726876, + "learning_rate": 6.651322010739691e-06, + "loss": 0.8134, + "num_tokens": 47185113418.0, + "step": 11289 + }, + { + "epoch": 1.3416518122400476, + "grad_norm": 0.29586474834899784, + "learning_rate": 6.649805725354739e-06, + "loss": 0.8057, + "num_tokens": 47189301767.0, + "step": 11290 + }, + { + "epoch": 1.3417706476530005, + "grad_norm": 0.30932009895888035, + "learning_rate": 6.648289601072186e-06, + "loss": 0.8043, + "num_tokens": 47193463227.0, + "step": 11291 + }, + { + "epoch": 1.3418894830659536, + "grad_norm": 0.2751717863107897, + "learning_rate": 6.646773637948185e-06, + "loss": 0.8096, + "num_tokens": 47197652882.0, + "step": 11292 + }, + { + "epoch": 1.3420083184789067, + "grad_norm": 0.3069831046251126, + "learning_rate": 6.645257836038874e-06, + "loss": 0.7664, + "num_tokens": 47201841514.0, + "step": 11293 + }, + { + "epoch": 1.3421271538918598, + "grad_norm": 0.3031846301067641, + "learning_rate": 6.643742195400391e-06, + "loss": 0.8181, + "num_tokens": 47206029959.0, + "step": 11294 + }, + { + "epoch": 1.3422459893048129, + "grad_norm": 0.2577262400528067, + "learning_rate": 6.64222671608886e-06, + "loss": 0.8402, + "num_tokens": 47210219809.0, + "step": 11295 + }, + { + "epoch": 1.342364824717766, + "grad_norm": 0.3186022795994702, + "learning_rate": 6.640711398160411e-06, + "loss": 0.8401, + "num_tokens": 47214409249.0, + "step": 11296 + }, + { + "epoch": 1.342483660130719, + "grad_norm": 0.2791527133462178, + "learning_rate": 6.639196241671155e-06, + "loss": 0.8135, + "num_tokens": 47218598778.0, + "step": 11297 + }, + { + "epoch": 1.342602495543672, + "grad_norm": 0.29390634371474506, + "learning_rate": 6.637681246677209e-06, + "loss": 0.8237, + "num_tokens": 47222767046.0, + "step": 11298 + }, + { + "epoch": 1.342721330956625, + "grad_norm": 0.302355050208079, + "learning_rate": 6.636166413234675e-06, + "loss": 0.7855, + "num_tokens": 47226956326.0, + "step": 11299 + }, + { + "epoch": 1.3428401663695781, + "grad_norm": 0.26440275939103464, + "learning_rate": 6.634651741399652e-06, + "loss": 0.851, + "num_tokens": 47231113547.0, + "step": 11300 + }, + { + "epoch": 1.3429590017825312, + "grad_norm": 0.26223987800739856, + "learning_rate": 6.633137231228239e-06, + "loss": 0.7895, + "num_tokens": 47235302363.0, + "step": 11301 + }, + { + "epoch": 1.3430778371954841, + "grad_norm": 0.299108331026681, + "learning_rate": 6.6316228827765115e-06, + "loss": 0.8237, + "num_tokens": 47239462387.0, + "step": 11302 + }, + { + "epoch": 1.3431966726084372, + "grad_norm": 0.2767975178309282, + "learning_rate": 6.630108696100564e-06, + "loss": 0.7794, + "num_tokens": 47243639721.0, + "step": 11303 + }, + { + "epoch": 1.3433155080213903, + "grad_norm": 0.30692746989821856, + "learning_rate": 6.628594671256463e-06, + "loss": 0.8453, + "num_tokens": 47247820156.0, + "step": 11304 + }, + { + "epoch": 1.3434343434343434, + "grad_norm": 0.2778534037837299, + "learning_rate": 6.627080808300284e-06, + "loss": 0.8063, + "num_tokens": 47251994284.0, + "step": 11305 + }, + { + "epoch": 1.3435531788472965, + "grad_norm": 0.3414087137414391, + "learning_rate": 6.62556710728809e-06, + "loss": 0.8346, + "num_tokens": 47256165976.0, + "step": 11306 + }, + { + "epoch": 1.3436720142602496, + "grad_norm": 0.2784592520958034, + "learning_rate": 6.6240535682759345e-06, + "loss": 0.8012, + "num_tokens": 47260320137.0, + "step": 11307 + }, + { + "epoch": 1.3437908496732027, + "grad_norm": 0.3369073468952834, + "learning_rate": 6.622540191319866e-06, + "loss": 0.7987, + "num_tokens": 47264498315.0, + "step": 11308 + }, + { + "epoch": 1.3439096850861556, + "grad_norm": 0.28776015647525055, + "learning_rate": 6.621026976475938e-06, + "loss": 0.8256, + "num_tokens": 47268642936.0, + "step": 11309 + }, + { + "epoch": 1.3440285204991087, + "grad_norm": 0.30213544393561853, + "learning_rate": 6.619513923800192e-06, + "loss": 0.7774, + "num_tokens": 47272830937.0, + "step": 11310 + }, + { + "epoch": 1.3441473559120618, + "grad_norm": 0.27758436410268594, + "learning_rate": 6.61800103334865e-06, + "loss": 0.8082, + "num_tokens": 47277008488.0, + "step": 11311 + }, + { + "epoch": 1.344266191325015, + "grad_norm": 0.26263401223711574, + "learning_rate": 6.616488305177346e-06, + "loss": 0.7971, + "num_tokens": 47281173124.0, + "step": 11312 + }, + { + "epoch": 1.3443850267379678, + "grad_norm": 0.2599331984322958, + "learning_rate": 6.614975739342304e-06, + "loss": 0.8136, + "num_tokens": 47285362523.0, + "step": 11313 + }, + { + "epoch": 1.3445038621509209, + "grad_norm": 0.2999242234601297, + "learning_rate": 6.613463335899533e-06, + "loss": 0.8297, + "num_tokens": 47289513079.0, + "step": 11314 + }, + { + "epoch": 1.344622697563874, + "grad_norm": 0.267921755167025, + "learning_rate": 6.611951094905049e-06, + "loss": 0.8255, + "num_tokens": 47293703394.0, + "step": 11315 + }, + { + "epoch": 1.344741532976827, + "grad_norm": 0.27310817721623554, + "learning_rate": 6.610439016414849e-06, + "loss": 0.8032, + "num_tokens": 47297891730.0, + "step": 11316 + }, + { + "epoch": 1.3448603683897802, + "grad_norm": 0.29634911678027615, + "learning_rate": 6.6089271004849365e-06, + "loss": 0.7971, + "num_tokens": 47302080969.0, + "step": 11317 + }, + { + "epoch": 1.3449792038027333, + "grad_norm": 0.2867826005047819, + "learning_rate": 6.607415347171298e-06, + "loss": 0.8174, + "num_tokens": 47306269798.0, + "step": 11318 + }, + { + "epoch": 1.3450980392156864, + "grad_norm": 0.31784844920175204, + "learning_rate": 6.605903756529922e-06, + "loss": 0.8175, + "num_tokens": 47310459392.0, + "step": 11319 + }, + { + "epoch": 1.3452168746286393, + "grad_norm": 0.26078811995100387, + "learning_rate": 6.604392328616787e-06, + "loss": 0.7707, + "num_tokens": 47314649209.0, + "step": 11320 + }, + { + "epoch": 1.3453357100415924, + "grad_norm": 0.3514287685236722, + "learning_rate": 6.602881063487866e-06, + "loss": 0.8027, + "num_tokens": 47318837527.0, + "step": 11321 + }, + { + "epoch": 1.3454545454545455, + "grad_norm": 0.26866532328848663, + "learning_rate": 6.601369961199127e-06, + "loss": 0.83, + "num_tokens": 47323008273.0, + "step": 11322 + }, + { + "epoch": 1.3455733808674986, + "grad_norm": 0.32797115577551916, + "learning_rate": 6.599859021806527e-06, + "loss": 0.8599, + "num_tokens": 47327176222.0, + "step": 11323 + }, + { + "epoch": 1.3456922162804514, + "grad_norm": 0.269972395412621, + "learning_rate": 6.598348245366027e-06, + "loss": 0.832, + "num_tokens": 47331365247.0, + "step": 11324 + }, + { + "epoch": 1.3458110516934045, + "grad_norm": 0.3347846112065501, + "learning_rate": 6.596837631933575e-06, + "loss": 0.8072, + "num_tokens": 47335547312.0, + "step": 11325 + }, + { + "epoch": 1.3459298871063576, + "grad_norm": 0.2883585270804078, + "learning_rate": 6.5953271815651095e-06, + "loss": 0.834, + "num_tokens": 47339736090.0, + "step": 11326 + }, + { + "epoch": 1.3460487225193107, + "grad_norm": 0.33501075268779246, + "learning_rate": 6.5938168943165734e-06, + "loss": 0.8146, + "num_tokens": 47343906273.0, + "step": 11327 + }, + { + "epoch": 1.3461675579322638, + "grad_norm": 0.3198036303712465, + "learning_rate": 6.592306770243897e-06, + "loss": 0.8372, + "num_tokens": 47348077617.0, + "step": 11328 + }, + { + "epoch": 1.346286393345217, + "grad_norm": 0.2847314306150854, + "learning_rate": 6.590796809403002e-06, + "loss": 0.8101, + "num_tokens": 47352266351.0, + "step": 11329 + }, + { + "epoch": 1.34640522875817, + "grad_norm": 0.3257097021969457, + "learning_rate": 6.589287011849807e-06, + "loss": 0.806, + "num_tokens": 47356454533.0, + "step": 11330 + }, + { + "epoch": 1.346524064171123, + "grad_norm": 0.30349904578342574, + "learning_rate": 6.587777377640228e-06, + "loss": 0.8396, + "num_tokens": 47360644345.0, + "step": 11331 + }, + { + "epoch": 1.346642899584076, + "grad_norm": 0.26561432306329996, + "learning_rate": 6.586267906830177e-06, + "loss": 0.8631, + "num_tokens": 47364833602.0, + "step": 11332 + }, + { + "epoch": 1.3467617349970291, + "grad_norm": 0.3201371717727609, + "learning_rate": 6.584758599475542e-06, + "loss": 0.8261, + "num_tokens": 47369022615.0, + "step": 11333 + }, + { + "epoch": 1.3468805704099822, + "grad_norm": 0.2724343716302907, + "learning_rate": 6.583249455632223e-06, + "loss": 0.832, + "num_tokens": 47373212090.0, + "step": 11334 + }, + { + "epoch": 1.3469994058229353, + "grad_norm": 0.2871919433675737, + "learning_rate": 6.581740475356112e-06, + "loss": 0.8203, + "num_tokens": 47377401075.0, + "step": 11335 + }, + { + "epoch": 1.3471182412358882, + "grad_norm": 0.2903031480557955, + "learning_rate": 6.580231658703089e-06, + "loss": 0.8351, + "num_tokens": 47381572461.0, + "step": 11336 + }, + { + "epoch": 1.3472370766488413, + "grad_norm": 0.25096714345044996, + "learning_rate": 6.578723005729037e-06, + "loss": 0.8019, + "num_tokens": 47385761224.0, + "step": 11337 + }, + { + "epoch": 1.3473559120617944, + "grad_norm": 0.26614542122828583, + "learning_rate": 6.577214516489816e-06, + "loss": 0.7943, + "num_tokens": 47389919428.0, + "step": 11338 + }, + { + "epoch": 1.3474747474747475, + "grad_norm": 0.2688959238152933, + "learning_rate": 6.575706191041296e-06, + "loss": 0.8204, + "num_tokens": 47394094452.0, + "step": 11339 + }, + { + "epoch": 1.3475935828877006, + "grad_norm": 0.28807093182366633, + "learning_rate": 6.574198029439331e-06, + "loss": 0.7928, + "num_tokens": 47398242863.0, + "step": 11340 + }, + { + "epoch": 1.3477124183006537, + "grad_norm": 0.283927669222138, + "learning_rate": 6.572690031739779e-06, + "loss": 0.7742, + "num_tokens": 47402432708.0, + "step": 11341 + }, + { + "epoch": 1.3478312537136068, + "grad_norm": 0.2938537472341821, + "learning_rate": 6.571182197998489e-06, + "loss": 0.8252, + "num_tokens": 47406620483.0, + "step": 11342 + }, + { + "epoch": 1.3479500891265597, + "grad_norm": 0.29098146804385777, + "learning_rate": 6.5696745282712935e-06, + "loss": 0.7711, + "num_tokens": 47410805369.0, + "step": 11343 + }, + { + "epoch": 1.3480689245395128, + "grad_norm": 0.3376737216762632, + "learning_rate": 6.568167022614028e-06, + "loss": 0.7981, + "num_tokens": 47414995926.0, + "step": 11344 + }, + { + "epoch": 1.3481877599524659, + "grad_norm": 0.26865589586223326, + "learning_rate": 6.566659681082524e-06, + "loss": 0.7692, + "num_tokens": 47419185285.0, + "step": 11345 + }, + { + "epoch": 1.348306595365419, + "grad_norm": 0.33053621021173535, + "learning_rate": 6.565152503732595e-06, + "loss": 0.7979, + "num_tokens": 47423373266.0, + "step": 11346 + }, + { + "epoch": 1.3484254307783718, + "grad_norm": 0.2874059824349285, + "learning_rate": 6.563645490620072e-06, + "loss": 0.8467, + "num_tokens": 47427533987.0, + "step": 11347 + }, + { + "epoch": 1.348544266191325, + "grad_norm": 0.26630688438953515, + "learning_rate": 6.562138641800753e-06, + "loss": 0.8184, + "num_tokens": 47431721967.0, + "step": 11348 + }, + { + "epoch": 1.348663101604278, + "grad_norm": 0.287979166109018, + "learning_rate": 6.560631957330443e-06, + "loss": 0.8283, + "num_tokens": 47435889939.0, + "step": 11349 + }, + { + "epoch": 1.3487819370172311, + "grad_norm": 0.28800255006251413, + "learning_rate": 6.5591254372649446e-06, + "loss": 0.8481, + "num_tokens": 47440058141.0, + "step": 11350 + }, + { + "epoch": 1.3489007724301842, + "grad_norm": 0.2844183538475698, + "learning_rate": 6.557619081660045e-06, + "loss": 0.818, + "num_tokens": 47444248797.0, + "step": 11351 + }, + { + "epoch": 1.3490196078431373, + "grad_norm": 0.30334330597590475, + "learning_rate": 6.556112890571531e-06, + "loss": 0.8022, + "num_tokens": 47448438771.0, + "step": 11352 + }, + { + "epoch": 1.3491384432560904, + "grad_norm": 0.2625435913171001, + "learning_rate": 6.554606864055181e-06, + "loss": 0.8173, + "num_tokens": 47452626111.0, + "step": 11353 + }, + { + "epoch": 1.3492572786690433, + "grad_norm": 0.3170453030228126, + "learning_rate": 6.553101002166768e-06, + "loss": 0.8089, + "num_tokens": 47456814576.0, + "step": 11354 + }, + { + "epoch": 1.3493761140819964, + "grad_norm": 0.2625860077506233, + "learning_rate": 6.5515953049620626e-06, + "loss": 0.7597, + "num_tokens": 47460952383.0, + "step": 11355 + }, + { + "epoch": 1.3494949494949495, + "grad_norm": 0.3344800077937551, + "learning_rate": 6.550089772496823e-06, + "loss": 0.8384, + "num_tokens": 47465141543.0, + "step": 11356 + }, + { + "epoch": 1.3496137849079026, + "grad_norm": 0.27181199486787383, + "learning_rate": 6.548584404826803e-06, + "loss": 0.8238, + "num_tokens": 47469331586.0, + "step": 11357 + }, + { + "epoch": 1.3497326203208555, + "grad_norm": 0.2983233533051422, + "learning_rate": 6.547079202007758e-06, + "loss": 0.7944, + "num_tokens": 47473521562.0, + "step": 11358 + }, + { + "epoch": 1.3498514557338086, + "grad_norm": 0.2982724117341554, + "learning_rate": 6.545574164095419e-06, + "loss": 0.781, + "num_tokens": 47477710941.0, + "step": 11359 + }, + { + "epoch": 1.3499702911467617, + "grad_norm": 0.28318869354556264, + "learning_rate": 6.544069291145533e-06, + "loss": 0.8036, + "num_tokens": 47481882185.0, + "step": 11360 + }, + { + "epoch": 1.3500891265597148, + "grad_norm": 0.317295015302225, + "learning_rate": 6.5425645832138265e-06, + "loss": 0.8006, + "num_tokens": 47486071212.0, + "step": 11361 + }, + { + "epoch": 1.350207961972668, + "grad_norm": 0.28172881403514716, + "learning_rate": 6.541060040356024e-06, + "loss": 0.8101, + "num_tokens": 47490218804.0, + "step": 11362 + }, + { + "epoch": 1.350326797385621, + "grad_norm": 0.28191029101766735, + "learning_rate": 6.5395556626278474e-06, + "loss": 0.806, + "num_tokens": 47494388083.0, + "step": 11363 + }, + { + "epoch": 1.350445632798574, + "grad_norm": 0.29351834628721213, + "learning_rate": 6.538051450085003e-06, + "loss": 0.8174, + "num_tokens": 47498566155.0, + "step": 11364 + }, + { + "epoch": 1.350564468211527, + "grad_norm": 0.29339677961456856, + "learning_rate": 6.536547402783197e-06, + "loss": 0.7826, + "num_tokens": 47502737281.0, + "step": 11365 + }, + { + "epoch": 1.35068330362448, + "grad_norm": 0.3350505370353463, + "learning_rate": 6.535043520778134e-06, + "loss": 0.8248, + "num_tokens": 47506927357.0, + "step": 11366 + }, + { + "epoch": 1.3508021390374332, + "grad_norm": 0.43542385619402435, + "learning_rate": 6.533539804125506e-06, + "loss": 0.8301, + "num_tokens": 47511095763.0, + "step": 11367 + }, + { + "epoch": 1.3509209744503863, + "grad_norm": 0.29580311861208214, + "learning_rate": 6.532036252881004e-06, + "loss": 0.8048, + "num_tokens": 47515284797.0, + "step": 11368 + }, + { + "epoch": 1.3510398098633392, + "grad_norm": 0.25197647666171574, + "learning_rate": 6.530532867100304e-06, + "loss": 0.7959, + "num_tokens": 47519474093.0, + "step": 11369 + }, + { + "epoch": 1.3511586452762923, + "grad_norm": 0.274919818045181, + "learning_rate": 6.5290296468390825e-06, + "loss": 0.7839, + "num_tokens": 47523663572.0, + "step": 11370 + }, + { + "epoch": 1.3512774806892454, + "grad_norm": 0.25396286608326435, + "learning_rate": 6.5275265921530085e-06, + "loss": 0.8061, + "num_tokens": 47527852895.0, + "step": 11371 + }, + { + "epoch": 1.3513963161021985, + "grad_norm": 0.26216058709983225, + "learning_rate": 6.526023703097746e-06, + "loss": 0.8074, + "num_tokens": 47532031733.0, + "step": 11372 + }, + { + "epoch": 1.3515151515151516, + "grad_norm": 0.27283873887373805, + "learning_rate": 6.524520979728957e-06, + "loss": 0.8423, + "num_tokens": 47536221560.0, + "step": 11373 + }, + { + "epoch": 1.3516339869281047, + "grad_norm": 0.2672681979407828, + "learning_rate": 6.523018422102286e-06, + "loss": 0.8212, + "num_tokens": 47540411448.0, + "step": 11374 + }, + { + "epoch": 1.3517528223410578, + "grad_norm": 0.2745661880953608, + "learning_rate": 6.521516030273381e-06, + "loss": 0.8181, + "num_tokens": 47544600106.0, + "step": 11375 + }, + { + "epoch": 1.3518716577540106, + "grad_norm": 0.31133986850683076, + "learning_rate": 6.520013804297877e-06, + "loss": 0.8278, + "num_tokens": 47548789605.0, + "step": 11376 + }, + { + "epoch": 1.3519904931669637, + "grad_norm": 0.2599581834950979, + "learning_rate": 6.518511744231409e-06, + "loss": 0.8315, + "num_tokens": 47552974954.0, + "step": 11377 + }, + { + "epoch": 1.3521093285799168, + "grad_norm": 0.3420421066764704, + "learning_rate": 6.517009850129605e-06, + "loss": 0.807, + "num_tokens": 47557135547.0, + "step": 11378 + }, + { + "epoch": 1.35222816399287, + "grad_norm": 0.2800032475689739, + "learning_rate": 6.5155081220480845e-06, + "loss": 0.8249, + "num_tokens": 47561324152.0, + "step": 11379 + }, + { + "epoch": 1.3523469994058228, + "grad_norm": 0.3016704832261553, + "learning_rate": 6.514006560042458e-06, + "loss": 0.8506, + "num_tokens": 47565498458.0, + "step": 11380 + }, + { + "epoch": 1.352465834818776, + "grad_norm": 0.28426306154370634, + "learning_rate": 6.512505164168338e-06, + "loss": 0.801, + "num_tokens": 47569661871.0, + "step": 11381 + }, + { + "epoch": 1.352584670231729, + "grad_norm": 0.289654629802696, + "learning_rate": 6.511003934481324e-06, + "loss": 0.8374, + "num_tokens": 47573850601.0, + "step": 11382 + }, + { + "epoch": 1.3527035056446821, + "grad_norm": 0.31265014396312046, + "learning_rate": 6.509502871037011e-06, + "loss": 0.8181, + "num_tokens": 47578015915.0, + "step": 11383 + }, + { + "epoch": 1.3528223410576352, + "grad_norm": 0.2630229121345309, + "learning_rate": 6.508001973890994e-06, + "loss": 0.8255, + "num_tokens": 47582203878.0, + "step": 11384 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.3245503510979161, + "learning_rate": 6.5065012430988435e-06, + "loss": 0.8196, + "num_tokens": 47586385231.0, + "step": 11385 + }, + { + "epoch": 1.3530600118835414, + "grad_norm": 0.2982341686625569, + "learning_rate": 6.505000678716149e-06, + "loss": 0.8822, + "num_tokens": 47590567631.0, + "step": 11386 + }, + { + "epoch": 1.3531788472964943, + "grad_norm": 0.3086853667924785, + "learning_rate": 6.503500280798478e-06, + "loss": 0.8073, + "num_tokens": 47594757637.0, + "step": 11387 + }, + { + "epoch": 1.3532976827094474, + "grad_norm": 0.31650471568001387, + "learning_rate": 6.502000049401394e-06, + "loss": 0.8563, + "num_tokens": 47598946108.0, + "step": 11388 + }, + { + "epoch": 1.3534165181224005, + "grad_norm": 0.325593973932179, + "learning_rate": 6.5004999845804605e-06, + "loss": 0.8499, + "num_tokens": 47603133761.0, + "step": 11389 + }, + { + "epoch": 1.3535353535353536, + "grad_norm": 0.3059354192868425, + "learning_rate": 6.4990000863912204e-06, + "loss": 0.8046, + "num_tokens": 47607322501.0, + "step": 11390 + }, + { + "epoch": 1.3536541889483065, + "grad_norm": 0.28155407410637623, + "learning_rate": 6.497500354889223e-06, + "loss": 0.8035, + "num_tokens": 47611486999.0, + "step": 11391 + }, + { + "epoch": 1.3537730243612596, + "grad_norm": 0.35576950525379963, + "learning_rate": 6.496000790130015e-06, + "loss": 0.8304, + "num_tokens": 47615674849.0, + "step": 11392 + }, + { + "epoch": 1.3538918597742127, + "grad_norm": 0.26190538645730976, + "learning_rate": 6.494501392169125e-06, + "loss": 0.7898, + "num_tokens": 47619861033.0, + "step": 11393 + }, + { + "epoch": 1.3540106951871658, + "grad_norm": 0.29169816036478696, + "learning_rate": 6.493002161062087e-06, + "loss": 0.8091, + "num_tokens": 47624025250.0, + "step": 11394 + }, + { + "epoch": 1.3541295306001189, + "grad_norm": 0.26557760071667313, + "learning_rate": 6.491503096864412e-06, + "loss": 0.7959, + "num_tokens": 47628213536.0, + "step": 11395 + }, + { + "epoch": 1.354248366013072, + "grad_norm": 0.2672048426132931, + "learning_rate": 6.490004199631621e-06, + "loss": 0.8029, + "num_tokens": 47632401410.0, + "step": 11396 + }, + { + "epoch": 1.354367201426025, + "grad_norm": 0.26887526060111516, + "learning_rate": 6.48850546941922e-06, + "loss": 0.8193, + "num_tokens": 47636591566.0, + "step": 11397 + }, + { + "epoch": 1.354486036838978, + "grad_norm": 0.2633825806309956, + "learning_rate": 6.48700690628272e-06, + "loss": 0.8179, + "num_tokens": 47640704718.0, + "step": 11398 + }, + { + "epoch": 1.354604872251931, + "grad_norm": 0.26976961912432496, + "learning_rate": 6.4855085102776165e-06, + "loss": 0.82, + "num_tokens": 47644895307.0, + "step": 11399 + }, + { + "epoch": 1.3547237076648841, + "grad_norm": 0.2608949001609507, + "learning_rate": 6.484010281459392e-06, + "loss": 0.7802, + "num_tokens": 47649084045.0, + "step": 11400 + }, + { + "epoch": 1.3548425430778372, + "grad_norm": 0.2627348008883106, + "learning_rate": 6.482512219883535e-06, + "loss": 0.8021, + "num_tokens": 47653260477.0, + "step": 11401 + }, + { + "epoch": 1.3549613784907901, + "grad_norm": 0.2769284258928272, + "learning_rate": 6.481014325605526e-06, + "loss": 0.8439, + "num_tokens": 47657422415.0, + "step": 11402 + }, + { + "epoch": 1.3550802139037432, + "grad_norm": 0.2668809780969118, + "learning_rate": 6.479516598680835e-06, + "loss": 0.7991, + "num_tokens": 47661610096.0, + "step": 11403 + }, + { + "epoch": 1.3551990493166963, + "grad_norm": 0.2717664126648671, + "learning_rate": 6.478019039164929e-06, + "loss": 0.7832, + "num_tokens": 47665799731.0, + "step": 11404 + }, + { + "epoch": 1.3553178847296494, + "grad_norm": 0.27315692850988693, + "learning_rate": 6.476521647113266e-06, + "loss": 0.8276, + "num_tokens": 47669990059.0, + "step": 11405 + }, + { + "epoch": 1.3554367201426025, + "grad_norm": 0.2642250448330912, + "learning_rate": 6.475024422581302e-06, + "loss": 0.8239, + "num_tokens": 47674176477.0, + "step": 11406 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.266511012308956, + "learning_rate": 6.473527365624483e-06, + "loss": 0.8247, + "num_tokens": 47678354336.0, + "step": 11407 + }, + { + "epoch": 1.3556743909685087, + "grad_norm": 0.29275846061393307, + "learning_rate": 6.472030476298248e-06, + "loss": 0.8097, + "num_tokens": 47682543598.0, + "step": 11408 + }, + { + "epoch": 1.3557932263814616, + "grad_norm": 0.28258546883290364, + "learning_rate": 6.470533754658036e-06, + "loss": 0.8088, + "num_tokens": 47686734182.0, + "step": 11409 + }, + { + "epoch": 1.3559120617944147, + "grad_norm": 0.275149184082832, + "learning_rate": 6.469037200759272e-06, + "loss": 0.8192, + "num_tokens": 47690899329.0, + "step": 11410 + }, + { + "epoch": 1.3560308972073678, + "grad_norm": 0.28305611805168807, + "learning_rate": 6.467540814657382e-06, + "loss": 0.7994, + "num_tokens": 47695088087.0, + "step": 11411 + }, + { + "epoch": 1.356149732620321, + "grad_norm": 0.27246930619696985, + "learning_rate": 6.466044596407779e-06, + "loss": 0.8016, + "num_tokens": 47699277703.0, + "step": 11412 + }, + { + "epoch": 1.3562685680332738, + "grad_norm": 0.2975135270396717, + "learning_rate": 6.464548546065873e-06, + "loss": 0.8076, + "num_tokens": 47703467999.0, + "step": 11413 + }, + { + "epoch": 1.3563874034462269, + "grad_norm": 0.25099078004914427, + "learning_rate": 6.463052663687071e-06, + "loss": 0.7957, + "num_tokens": 47707633466.0, + "step": 11414 + }, + { + "epoch": 1.35650623885918, + "grad_norm": 0.3218514283851087, + "learning_rate": 6.4615569493267714e-06, + "loss": 0.8006, + "num_tokens": 47711822380.0, + "step": 11415 + }, + { + "epoch": 1.356625074272133, + "grad_norm": 0.29046757246690574, + "learning_rate": 6.460061403040356e-06, + "loss": 0.807, + "num_tokens": 47715995836.0, + "step": 11416 + }, + { + "epoch": 1.3567439096850862, + "grad_norm": 0.3165002147834696, + "learning_rate": 6.458566024883221e-06, + "loss": 0.8253, + "num_tokens": 47720183628.0, + "step": 11417 + }, + { + "epoch": 1.3568627450980393, + "grad_norm": 0.2717339667965982, + "learning_rate": 6.45707081491074e-06, + "loss": 0.8287, + "num_tokens": 47724373745.0, + "step": 11418 + }, + { + "epoch": 1.3569815805109924, + "grad_norm": 0.3080862149523097, + "learning_rate": 6.455575773178287e-06, + "loss": 0.8191, + "num_tokens": 47728561433.0, + "step": 11419 + }, + { + "epoch": 1.3571004159239453, + "grad_norm": 0.2736283436013952, + "learning_rate": 6.454080899741231e-06, + "loss": 0.8448, + "num_tokens": 47732725840.0, + "step": 11420 + }, + { + "epoch": 1.3572192513368984, + "grad_norm": 0.3150476988751064, + "learning_rate": 6.452586194654926e-06, + "loss": 0.8311, + "num_tokens": 47736893302.0, + "step": 11421 + }, + { + "epoch": 1.3573380867498515, + "grad_norm": 0.25677400177308146, + "learning_rate": 6.45109165797473e-06, + "loss": 0.8504, + "num_tokens": 47741082684.0, + "step": 11422 + }, + { + "epoch": 1.3574569221628046, + "grad_norm": 0.28605095767912425, + "learning_rate": 6.449597289755987e-06, + "loss": 0.8367, + "num_tokens": 47745272934.0, + "step": 11423 + }, + { + "epoch": 1.3575757575757577, + "grad_norm": 0.2875089842349269, + "learning_rate": 6.448103090054044e-06, + "loss": 0.8043, + "num_tokens": 47749461678.0, + "step": 11424 + }, + { + "epoch": 1.3576945929887105, + "grad_norm": 0.2936374238810244, + "learning_rate": 6.446609058924238e-06, + "loss": 0.8408, + "num_tokens": 47753651020.0, + "step": 11425 + }, + { + "epoch": 1.3578134284016636, + "grad_norm": 0.2828266700698034, + "learning_rate": 6.44511519642189e-06, + "loss": 0.7745, + "num_tokens": 47757839132.0, + "step": 11426 + }, + { + "epoch": 1.3579322638146167, + "grad_norm": 0.28066976347134387, + "learning_rate": 6.4436215026023276e-06, + "loss": 0.8244, + "num_tokens": 47762027390.0, + "step": 11427 + }, + { + "epoch": 1.3580510992275698, + "grad_norm": 0.291619160766523, + "learning_rate": 6.442127977520867e-06, + "loss": 0.7992, + "num_tokens": 47766198389.0, + "step": 11428 + }, + { + "epoch": 1.358169934640523, + "grad_norm": 0.2962831386663559, + "learning_rate": 6.440634621232813e-06, + "loss": 0.7997, + "num_tokens": 47770361123.0, + "step": 11429 + }, + { + "epoch": 1.358288770053476, + "grad_norm": 0.2706300455790927, + "learning_rate": 6.439141433793481e-06, + "loss": 0.8203, + "num_tokens": 47774520523.0, + "step": 11430 + }, + { + "epoch": 1.358407605466429, + "grad_norm": 0.31669043051845464, + "learning_rate": 6.437648415258162e-06, + "loss": 0.8137, + "num_tokens": 47778710223.0, + "step": 11431 + }, + { + "epoch": 1.358526440879382, + "grad_norm": 0.28597455963145907, + "learning_rate": 6.436155565682145e-06, + "loss": 0.7977, + "num_tokens": 47782898685.0, + "step": 11432 + }, + { + "epoch": 1.3586452762923351, + "grad_norm": 0.2965733806831938, + "learning_rate": 6.4346628851207195e-06, + "loss": 0.8043, + "num_tokens": 47787088192.0, + "step": 11433 + }, + { + "epoch": 1.3587641117052882, + "grad_norm": 0.26626154206273656, + "learning_rate": 6.4331703736291605e-06, + "loss": 0.8357, + "num_tokens": 47791277157.0, + "step": 11434 + }, + { + "epoch": 1.3588829471182413, + "grad_norm": 0.2711286110264063, + "learning_rate": 6.431678031262746e-06, + "loss": 0.8186, + "num_tokens": 47795444231.0, + "step": 11435 + }, + { + "epoch": 1.3590017825311942, + "grad_norm": 0.30816138107834984, + "learning_rate": 6.4301858580767395e-06, + "loss": 0.7972, + "num_tokens": 47799634729.0, + "step": 11436 + }, + { + "epoch": 1.3591206179441473, + "grad_norm": 0.28386929358964436, + "learning_rate": 6.428693854126401e-06, + "loss": 0.8055, + "num_tokens": 47803824761.0, + "step": 11437 + }, + { + "epoch": 1.3592394533571004, + "grad_norm": 0.2866567015151178, + "learning_rate": 6.427202019466985e-06, + "loss": 0.7338, + "num_tokens": 47808012579.0, + "step": 11438 + }, + { + "epoch": 1.3593582887700535, + "grad_norm": 0.27993653174603406, + "learning_rate": 6.42571035415374e-06, + "loss": 0.8233, + "num_tokens": 47812200965.0, + "step": 11439 + }, + { + "epoch": 1.3594771241830066, + "grad_norm": 0.268506610854707, + "learning_rate": 6.4242188582419065e-06, + "loss": 0.8272, + "num_tokens": 47816388944.0, + "step": 11440 + }, + { + "epoch": 1.3595959595959597, + "grad_norm": 0.2902749580606804, + "learning_rate": 6.422727531786719e-06, + "loss": 0.8387, + "num_tokens": 47820578244.0, + "step": 11441 + }, + { + "epoch": 1.3597147950089128, + "grad_norm": 0.27062299927260636, + "learning_rate": 6.421236374843407e-06, + "loss": 0.814, + "num_tokens": 47824766967.0, + "step": 11442 + }, + { + "epoch": 1.3598336304218657, + "grad_norm": 0.29630541122754966, + "learning_rate": 6.4197453874671956e-06, + "loss": 0.842, + "num_tokens": 47828956215.0, + "step": 11443 + }, + { + "epoch": 1.3599524658348188, + "grad_norm": 0.2700567151606391, + "learning_rate": 6.418254569713296e-06, + "loss": 0.79, + "num_tokens": 47833145569.0, + "step": 11444 + }, + { + "epoch": 1.3600713012477719, + "grad_norm": 0.2654683251586604, + "learning_rate": 6.416763921636921e-06, + "loss": 0.8125, + "num_tokens": 47837334882.0, + "step": 11445 + }, + { + "epoch": 1.360190136660725, + "grad_norm": 0.29537199082248433, + "learning_rate": 6.415273443293275e-06, + "loss": 0.7973, + "num_tokens": 47841524000.0, + "step": 11446 + }, + { + "epoch": 1.3603089720736778, + "grad_norm": 0.27197429067823625, + "learning_rate": 6.413783134737559e-06, + "loss": 0.8002, + "num_tokens": 47845680566.0, + "step": 11447 + }, + { + "epoch": 1.360427807486631, + "grad_norm": 0.2710806560238293, + "learning_rate": 6.412292996024951e-06, + "loss": 0.8214, + "num_tokens": 47849868994.0, + "step": 11448 + }, + { + "epoch": 1.360546642899584, + "grad_norm": 0.29378275774685914, + "learning_rate": 6.41080302721065e-06, + "loss": 0.7873, + "num_tokens": 47854057654.0, + "step": 11449 + }, + { + "epoch": 1.3606654783125371, + "grad_norm": 0.2726621774108659, + "learning_rate": 6.40931322834983e-06, + "loss": 0.7949, + "num_tokens": 47858245454.0, + "step": 11450 + }, + { + "epoch": 1.3607843137254902, + "grad_norm": 0.2814050437886072, + "learning_rate": 6.40782359949766e-06, + "loss": 0.8155, + "num_tokens": 47862433360.0, + "step": 11451 + }, + { + "epoch": 1.3609031491384433, + "grad_norm": 0.3123555352527649, + "learning_rate": 6.406334140709314e-06, + "loss": 0.787, + "num_tokens": 47866609113.0, + "step": 11452 + }, + { + "epoch": 1.3610219845513964, + "grad_norm": 0.2739689739874955, + "learning_rate": 6.404844852039942e-06, + "loss": 0.8354, + "num_tokens": 47870797575.0, + "step": 11453 + }, + { + "epoch": 1.3611408199643493, + "grad_norm": 0.29217213505155865, + "learning_rate": 6.403355733544698e-06, + "loss": 0.8332, + "num_tokens": 47874960834.0, + "step": 11454 + }, + { + "epoch": 1.3612596553773024, + "grad_norm": 0.26395450958834193, + "learning_rate": 6.401866785278739e-06, + "loss": 0.8273, + "num_tokens": 47879150757.0, + "step": 11455 + }, + { + "epoch": 1.3613784907902555, + "grad_norm": 0.2619463593164096, + "learning_rate": 6.400378007297197e-06, + "loss": 0.8314, + "num_tokens": 47883338818.0, + "step": 11456 + }, + { + "epoch": 1.3614973262032086, + "grad_norm": 0.2763711265382287, + "learning_rate": 6.398889399655212e-06, + "loss": 0.8457, + "num_tokens": 47887508811.0, + "step": 11457 + }, + { + "epoch": 1.3616161616161615, + "grad_norm": 0.27359603691877443, + "learning_rate": 6.397400962407907e-06, + "loss": 0.8652, + "num_tokens": 47891695965.0, + "step": 11458 + }, + { + "epoch": 1.3617349970291146, + "grad_norm": 0.2623308984904516, + "learning_rate": 6.395912695610407e-06, + "loss": 0.8334, + "num_tokens": 47895882738.0, + "step": 11459 + }, + { + "epoch": 1.3618538324420677, + "grad_norm": 0.2628410601659636, + "learning_rate": 6.3944245993178255e-06, + "loss": 0.8217, + "num_tokens": 47900048252.0, + "step": 11460 + }, + { + "epoch": 1.3619726678550208, + "grad_norm": 0.29386021791753153, + "learning_rate": 6.392936673585269e-06, + "loss": 0.8069, + "num_tokens": 47904235811.0, + "step": 11461 + }, + { + "epoch": 1.362091503267974, + "grad_norm": 0.2725077549628102, + "learning_rate": 6.391448918467853e-06, + "loss": 0.8466, + "num_tokens": 47908417310.0, + "step": 11462 + }, + { + "epoch": 1.362210338680927, + "grad_norm": 0.27508477517256436, + "learning_rate": 6.38996133402066e-06, + "loss": 0.8258, + "num_tokens": 47912563618.0, + "step": 11463 + }, + { + "epoch": 1.36232917409388, + "grad_norm": 0.2669115051072702, + "learning_rate": 6.388473920298786e-06, + "loss": 0.8016, + "num_tokens": 47916746697.0, + "step": 11464 + }, + { + "epoch": 1.362448009506833, + "grad_norm": 0.27087203155742035, + "learning_rate": 6.3869866773573165e-06, + "loss": 0.8225, + "num_tokens": 47920936152.0, + "step": 11465 + }, + { + "epoch": 1.362566844919786, + "grad_norm": 0.2767174766972031, + "learning_rate": 6.385499605251325e-06, + "loss": 0.8162, + "num_tokens": 47925120320.0, + "step": 11466 + }, + { + "epoch": 1.3626856803327392, + "grad_norm": 0.25622648266703263, + "learning_rate": 6.384012704035887e-06, + "loss": 0.7989, + "num_tokens": 47929310282.0, + "step": 11467 + }, + { + "epoch": 1.3628045157456923, + "grad_norm": 0.26162364307441965, + "learning_rate": 6.382525973766063e-06, + "loss": 0.8456, + "num_tokens": 47933498981.0, + "step": 11468 + }, + { + "epoch": 1.3629233511586452, + "grad_norm": 0.27329361236673655, + "learning_rate": 6.3810394144969166e-06, + "loss": 0.7963, + "num_tokens": 47937659363.0, + "step": 11469 + }, + { + "epoch": 1.3630421865715983, + "grad_norm": 0.26386952431730926, + "learning_rate": 6.379553026283497e-06, + "loss": 0.7964, + "num_tokens": 47941849290.0, + "step": 11470 + }, + { + "epoch": 1.3631610219845514, + "grad_norm": 0.2796085232276842, + "learning_rate": 6.378066809180852e-06, + "loss": 0.8261, + "num_tokens": 47946018008.0, + "step": 11471 + }, + { + "epoch": 1.3632798573975045, + "grad_norm": 0.30405446394677177, + "learning_rate": 6.376580763244018e-06, + "loss": 0.8311, + "num_tokens": 47950186462.0, + "step": 11472 + }, + { + "epoch": 1.3633986928104576, + "grad_norm": 0.2568284740164794, + "learning_rate": 6.375094888528035e-06, + "loss": 0.8269, + "num_tokens": 47954362473.0, + "step": 11473 + }, + { + "epoch": 1.3635175282234107, + "grad_norm": 0.28108269452299733, + "learning_rate": 6.373609185087919e-06, + "loss": 0.8446, + "num_tokens": 47958551185.0, + "step": 11474 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.2659689464785289, + "learning_rate": 6.372123652978701e-06, + "loss": 0.7643, + "num_tokens": 47962740982.0, + "step": 11475 + }, + { + "epoch": 1.3637551990493166, + "grad_norm": 0.25791291091571833, + "learning_rate": 6.37063829225539e-06, + "loss": 0.7907, + "num_tokens": 47966931020.0, + "step": 11476 + }, + { + "epoch": 1.3638740344622697, + "grad_norm": 0.27056059999776144, + "learning_rate": 6.3691531029729955e-06, + "loss": 0.8564, + "num_tokens": 47971120202.0, + "step": 11477 + }, + { + "epoch": 1.3639928698752228, + "grad_norm": 0.2641394292881598, + "learning_rate": 6.367668085186524e-06, + "loss": 0.7489, + "num_tokens": 47975309319.0, + "step": 11478 + }, + { + "epoch": 1.364111705288176, + "grad_norm": 0.26288240025011483, + "learning_rate": 6.36618323895096e-06, + "loss": 0.8106, + "num_tokens": 47979472724.0, + "step": 11479 + }, + { + "epoch": 1.3642305407011288, + "grad_norm": 0.25401460510679047, + "learning_rate": 6.3646985643212945e-06, + "loss": 0.79, + "num_tokens": 47983635426.0, + "step": 11480 + }, + { + "epoch": 1.364349376114082, + "grad_norm": 0.26531277769915085, + "learning_rate": 6.363214061352519e-06, + "loss": 0.8327, + "num_tokens": 47987826169.0, + "step": 11481 + }, + { + "epoch": 1.364468211527035, + "grad_norm": 0.26479625996973927, + "learning_rate": 6.361729730099601e-06, + "loss": 0.8321, + "num_tokens": 47992016379.0, + "step": 11482 + }, + { + "epoch": 1.3645870469399881, + "grad_norm": 0.26476490024247273, + "learning_rate": 6.3602455706175205e-06, + "loss": 0.8166, + "num_tokens": 47996188478.0, + "step": 11483 + }, + { + "epoch": 1.3647058823529412, + "grad_norm": 0.2652224830040159, + "learning_rate": 6.358761582961228e-06, + "loss": 0.8163, + "num_tokens": 48000358099.0, + "step": 11484 + }, + { + "epoch": 1.3648247177658943, + "grad_norm": 0.31280964049578425, + "learning_rate": 6.357277767185688e-06, + "loss": 0.8216, + "num_tokens": 48004511569.0, + "step": 11485 + }, + { + "epoch": 1.3649435531788474, + "grad_norm": 0.2657314395727229, + "learning_rate": 6.3557941233458446e-06, + "loss": 0.8184, + "num_tokens": 48008680697.0, + "step": 11486 + }, + { + "epoch": 1.3650623885918003, + "grad_norm": 0.3401761817863252, + "learning_rate": 6.354310651496653e-06, + "loss": 0.8034, + "num_tokens": 48012866601.0, + "step": 11487 + }, + { + "epoch": 1.3651812240047534, + "grad_norm": 0.27838451061087927, + "learning_rate": 6.352827351693047e-06, + "loss": 0.8067, + "num_tokens": 48017054505.0, + "step": 11488 + }, + { + "epoch": 1.3653000594177065, + "grad_norm": 0.35337787495632905, + "learning_rate": 6.3513442239899525e-06, + "loss": 0.81, + "num_tokens": 48021243528.0, + "step": 11489 + }, + { + "epoch": 1.3654188948306596, + "grad_norm": 0.299910628617636, + "learning_rate": 6.349861268442301e-06, + "loss": 0.7797, + "num_tokens": 48025429000.0, + "step": 11490 + }, + { + "epoch": 1.3655377302436125, + "grad_norm": 0.3290450034290859, + "learning_rate": 6.348378485105009e-06, + "loss": 0.7854, + "num_tokens": 48029616588.0, + "step": 11491 + }, + { + "epoch": 1.3656565656565656, + "grad_norm": 0.29111187798955623, + "learning_rate": 6.346895874032988e-06, + "loss": 0.8157, + "num_tokens": 48033805664.0, + "step": 11492 + }, + { + "epoch": 1.3657754010695187, + "grad_norm": 0.29101346830302144, + "learning_rate": 6.3454134352811455e-06, + "loss": 0.7977, + "num_tokens": 48037989832.0, + "step": 11493 + }, + { + "epoch": 1.3658942364824718, + "grad_norm": 0.2729293369036999, + "learning_rate": 6.34393116890438e-06, + "loss": 0.8066, + "num_tokens": 48042178195.0, + "step": 11494 + }, + { + "epoch": 1.3660130718954249, + "grad_norm": 0.28544997796778443, + "learning_rate": 6.342449074957588e-06, + "loss": 0.8166, + "num_tokens": 48046367986.0, + "step": 11495 + }, + { + "epoch": 1.366131907308378, + "grad_norm": 0.2814171774877566, + "learning_rate": 6.340967153495654e-06, + "loss": 0.8353, + "num_tokens": 48050556953.0, + "step": 11496 + }, + { + "epoch": 1.366250742721331, + "grad_norm": 0.28841097245461605, + "learning_rate": 6.3394854045734574e-06, + "loss": 0.8195, + "num_tokens": 48054746839.0, + "step": 11497 + }, + { + "epoch": 1.366369578134284, + "grad_norm": 0.2639403061546902, + "learning_rate": 6.338003828245876e-06, + "loss": 0.7939, + "num_tokens": 48058937459.0, + "step": 11498 + }, + { + "epoch": 1.366488413547237, + "grad_norm": 0.27824755169962556, + "learning_rate": 6.336522424567774e-06, + "loss": 0.8205, + "num_tokens": 48063126053.0, + "step": 11499 + }, + { + "epoch": 1.3666072489601901, + "grad_norm": 0.2807794833335551, + "learning_rate": 6.335041193594013e-06, + "loss": 0.797, + "num_tokens": 48067267425.0, + "step": 11500 + }, + { + "epoch": 1.3667260843731432, + "grad_norm": 0.27678364184122467, + "learning_rate": 6.333560135379451e-06, + "loss": 0.7966, + "num_tokens": 48071457589.0, + "step": 11501 + }, + { + "epoch": 1.3668449197860961, + "grad_norm": 0.3280849741513503, + "learning_rate": 6.332079249978935e-06, + "loss": 0.8622, + "num_tokens": 48075603959.0, + "step": 11502 + }, + { + "epoch": 1.3669637551990492, + "grad_norm": 0.2779985182793534, + "learning_rate": 6.3305985374473055e-06, + "loss": 0.8306, + "num_tokens": 48079756042.0, + "step": 11503 + }, + { + "epoch": 1.3670825906120023, + "grad_norm": 0.29789492488109476, + "learning_rate": 6.329117997839405e-06, + "loss": 0.815, + "num_tokens": 48083888326.0, + "step": 11504 + }, + { + "epoch": 1.3672014260249554, + "grad_norm": 0.264647372751406, + "learning_rate": 6.32763763121005e-06, + "loss": 0.7876, + "num_tokens": 48088077602.0, + "step": 11505 + }, + { + "epoch": 1.3673202614379085, + "grad_norm": 0.26740368568995004, + "learning_rate": 6.326157437614074e-06, + "loss": 0.8113, + "num_tokens": 48092267301.0, + "step": 11506 + }, + { + "epoch": 1.3674390968508616, + "grad_norm": 0.2817286728030408, + "learning_rate": 6.324677417106291e-06, + "loss": 0.8254, + "num_tokens": 48096456909.0, + "step": 11507 + }, + { + "epoch": 1.3675579322638147, + "grad_norm": 0.2628948888485417, + "learning_rate": 6.32319756974151e-06, + "loss": 0.8128, + "num_tokens": 48100646542.0, + "step": 11508 + }, + { + "epoch": 1.3676767676767676, + "grad_norm": 0.2518650980402919, + "learning_rate": 6.321717895574542e-06, + "loss": 0.8105, + "num_tokens": 48104836744.0, + "step": 11509 + }, + { + "epoch": 1.3677956030897207, + "grad_norm": 0.28839049138895106, + "learning_rate": 6.320238394660172e-06, + "loss": 0.8517, + "num_tokens": 48109025317.0, + "step": 11510 + }, + { + "epoch": 1.3679144385026738, + "grad_norm": 0.26295039404880577, + "learning_rate": 6.318759067053197e-06, + "loss": 0.8083, + "num_tokens": 48113175228.0, + "step": 11511 + }, + { + "epoch": 1.368033273915627, + "grad_norm": 0.3416004744959827, + "learning_rate": 6.3172799128083985e-06, + "loss": 0.796, + "num_tokens": 48117365383.0, + "step": 11512 + }, + { + "epoch": 1.3681521093285798, + "grad_norm": 0.262212340446659, + "learning_rate": 6.3158009319805624e-06, + "loss": 0.8061, + "num_tokens": 48121555348.0, + "step": 11513 + }, + { + "epoch": 1.3682709447415329, + "grad_norm": 0.31711508085491935, + "learning_rate": 6.3143221246244565e-06, + "loss": 0.753, + "num_tokens": 48125745660.0, + "step": 11514 + }, + { + "epoch": 1.368389780154486, + "grad_norm": 0.2914873136361001, + "learning_rate": 6.312843490794842e-06, + "loss": 0.777, + "num_tokens": 48129935123.0, + "step": 11515 + }, + { + "epoch": 1.368508615567439, + "grad_norm": 0.27578573037692694, + "learning_rate": 6.311365030546481e-06, + "loss": 0.8201, + "num_tokens": 48134084157.0, + "step": 11516 + }, + { + "epoch": 1.3686274509803922, + "grad_norm": 0.3099202282189282, + "learning_rate": 6.309886743934126e-06, + "loss": 0.8555, + "num_tokens": 48138274310.0, + "step": 11517 + }, + { + "epoch": 1.3687462863933453, + "grad_norm": 0.3041446947768801, + "learning_rate": 6.308408631012519e-06, + "loss": 0.8035, + "num_tokens": 48142464013.0, + "step": 11518 + }, + { + "epoch": 1.3688651218062984, + "grad_norm": 0.27726996480686217, + "learning_rate": 6.3069306918364105e-06, + "loss": 0.7999, + "num_tokens": 48146653809.0, + "step": 11519 + }, + { + "epoch": 1.3689839572192513, + "grad_norm": 0.27634925413428707, + "learning_rate": 6.305452926460521e-06, + "loss": 0.7968, + "num_tokens": 48150842985.0, + "step": 11520 + }, + { + "epoch": 1.3691027926322044, + "grad_norm": 0.2953807957883713, + "learning_rate": 6.303975334939584e-06, + "loss": 0.7818, + "num_tokens": 48155011503.0, + "step": 11521 + }, + { + "epoch": 1.3692216280451575, + "grad_norm": 0.2904202681954042, + "learning_rate": 6.302497917328316e-06, + "loss": 0.8363, + "num_tokens": 48159199534.0, + "step": 11522 + }, + { + "epoch": 1.3693404634581106, + "grad_norm": 0.28833162488654557, + "learning_rate": 6.301020673681435e-06, + "loss": 0.8187, + "num_tokens": 48163381487.0, + "step": 11523 + }, + { + "epoch": 1.3694592988710637, + "grad_norm": 0.27273231239494916, + "learning_rate": 6.299543604053644e-06, + "loss": 0.7988, + "num_tokens": 48167571201.0, + "step": 11524 + }, + { + "epoch": 1.3695781342840165, + "grad_norm": 0.2798940205631205, + "learning_rate": 6.2980667084996485e-06, + "loss": 0.8116, + "num_tokens": 48171746236.0, + "step": 11525 + }, + { + "epoch": 1.3696969696969696, + "grad_norm": 0.27663123625069497, + "learning_rate": 6.296589987074138e-06, + "loss": 0.8637, + "num_tokens": 48175907328.0, + "step": 11526 + }, + { + "epoch": 1.3698158051099227, + "grad_norm": 0.28549056415083607, + "learning_rate": 6.2951134398318035e-06, + "loss": 0.8036, + "num_tokens": 48180098142.0, + "step": 11527 + }, + { + "epoch": 1.3699346405228758, + "grad_norm": 0.27723342620690744, + "learning_rate": 6.2936370668273265e-06, + "loss": 0.7888, + "num_tokens": 48184288022.0, + "step": 11528 + }, + { + "epoch": 1.370053475935829, + "grad_norm": 0.2883643650911478, + "learning_rate": 6.29216086811538e-06, + "loss": 0.8405, + "num_tokens": 48188444856.0, + "step": 11529 + }, + { + "epoch": 1.370172311348782, + "grad_norm": 0.2923801242024212, + "learning_rate": 6.290684843750639e-06, + "loss": 0.8036, + "num_tokens": 48192634028.0, + "step": 11530 + }, + { + "epoch": 1.3702911467617351, + "grad_norm": 0.2592404808830675, + "learning_rate": 6.289208993787754e-06, + "loss": 0.7916, + "num_tokens": 48196823932.0, + "step": 11531 + }, + { + "epoch": 1.370409982174688, + "grad_norm": 0.2918707871747622, + "learning_rate": 6.28773331828139e-06, + "loss": 0.823, + "num_tokens": 48201002824.0, + "step": 11532 + }, + { + "epoch": 1.3705288175876411, + "grad_norm": 0.2754380748755416, + "learning_rate": 6.286257817286195e-06, + "loss": 0.8225, + "num_tokens": 48205135590.0, + "step": 11533 + }, + { + "epoch": 1.3706476530005942, + "grad_norm": 0.2910763886818235, + "learning_rate": 6.284782490856811e-06, + "loss": 0.807, + "num_tokens": 48209306050.0, + "step": 11534 + }, + { + "epoch": 1.3707664884135473, + "grad_norm": 0.28742867092472413, + "learning_rate": 6.283307339047875e-06, + "loss": 0.838, + "num_tokens": 48213496393.0, + "step": 11535 + }, + { + "epoch": 1.3708853238265002, + "grad_norm": 0.2821474745333817, + "learning_rate": 6.2818323619140135e-06, + "loss": 0.8159, + "num_tokens": 48217686000.0, + "step": 11536 + }, + { + "epoch": 1.3710041592394533, + "grad_norm": 0.2748731031182935, + "learning_rate": 6.28035755950985e-06, + "loss": 0.8019, + "num_tokens": 48221875679.0, + "step": 11537 + }, + { + "epoch": 1.3711229946524064, + "grad_norm": 0.2868575499179919, + "learning_rate": 6.278882931890005e-06, + "loss": 0.8152, + "num_tokens": 48226034640.0, + "step": 11538 + }, + { + "epoch": 1.3712418300653595, + "grad_norm": 0.2598787267006003, + "learning_rate": 6.277408479109089e-06, + "loss": 0.7628, + "num_tokens": 48230222618.0, + "step": 11539 + }, + { + "epoch": 1.3713606654783126, + "grad_norm": 0.27435758027036494, + "learning_rate": 6.275934201221706e-06, + "loss": 0.8191, + "num_tokens": 48234402152.0, + "step": 11540 + }, + { + "epoch": 1.3714795008912657, + "grad_norm": 0.26307369641312195, + "learning_rate": 6.27446009828245e-06, + "loss": 0.8069, + "num_tokens": 48238583240.0, + "step": 11541 + }, + { + "epoch": 1.3715983363042188, + "grad_norm": 0.25879613430613585, + "learning_rate": 6.272986170345914e-06, + "loss": 0.7898, + "num_tokens": 48242773542.0, + "step": 11542 + }, + { + "epoch": 1.3717171717171717, + "grad_norm": 0.27709513773176864, + "learning_rate": 6.2715124174666834e-06, + "loss": 0.8384, + "num_tokens": 48246951367.0, + "step": 11543 + }, + { + "epoch": 1.3718360071301248, + "grad_norm": 0.281411911935403, + "learning_rate": 6.270038839699329e-06, + "loss": 0.7858, + "num_tokens": 48251140507.0, + "step": 11544 + }, + { + "epoch": 1.3719548425430779, + "grad_norm": 0.26112153936228344, + "learning_rate": 6.268565437098439e-06, + "loss": 0.7784, + "num_tokens": 48255330021.0, + "step": 11545 + }, + { + "epoch": 1.372073677956031, + "grad_norm": 0.28490079340688235, + "learning_rate": 6.267092209718562e-06, + "loss": 0.8241, + "num_tokens": 48259501590.0, + "step": 11546 + }, + { + "epoch": 1.3721925133689838, + "grad_norm": 0.2905863640534911, + "learning_rate": 6.265619157614265e-06, + "loss": 0.774, + "num_tokens": 48263691023.0, + "step": 11547 + }, + { + "epoch": 1.372311348781937, + "grad_norm": 0.2952583515231399, + "learning_rate": 6.264146280840097e-06, + "loss": 0.8318, + "num_tokens": 48267879764.0, + "step": 11548 + }, + { + "epoch": 1.37243018419489, + "grad_norm": 0.28595222248614677, + "learning_rate": 6.262673579450605e-06, + "loss": 0.8214, + "num_tokens": 48272070587.0, + "step": 11549 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 0.28688998721436215, + "learning_rate": 6.261201053500325e-06, + "loss": 0.8193, + "num_tokens": 48276258401.0, + "step": 11550 + }, + { + "epoch": 1.3726678550207962, + "grad_norm": 0.29631391223180137, + "learning_rate": 6.259728703043796e-06, + "loss": 0.8404, + "num_tokens": 48280447924.0, + "step": 11551 + }, + { + "epoch": 1.3727866904337493, + "grad_norm": 0.28434876562833855, + "learning_rate": 6.258256528135538e-06, + "loss": 0.8688, + "num_tokens": 48284597064.0, + "step": 11552 + }, + { + "epoch": 1.3729055258467024, + "grad_norm": 0.27126155977502303, + "learning_rate": 6.256784528830073e-06, + "loss": 0.7718, + "num_tokens": 48288788451.0, + "step": 11553 + }, + { + "epoch": 1.3730243612596553, + "grad_norm": 0.27129721312653426, + "learning_rate": 6.2553127051819155e-06, + "loss": 0.7798, + "num_tokens": 48292971552.0, + "step": 11554 + }, + { + "epoch": 1.3731431966726084, + "grad_norm": 0.27592602989677206, + "learning_rate": 6.25384105724557e-06, + "loss": 0.806, + "num_tokens": 48297162859.0, + "step": 11555 + }, + { + "epoch": 1.3732620320855615, + "grad_norm": 0.2860449163216841, + "learning_rate": 6.252369585075537e-06, + "loss": 0.8277, + "num_tokens": 48301352425.0, + "step": 11556 + }, + { + "epoch": 1.3733808674985146, + "grad_norm": 0.2571437673239434, + "learning_rate": 6.250898288726311e-06, + "loss": 0.8581, + "num_tokens": 48305539890.0, + "step": 11557 + }, + { + "epoch": 1.3734997029114675, + "grad_norm": 0.28075697156938795, + "learning_rate": 6.249427168252378e-06, + "loss": 0.8191, + "num_tokens": 48309684063.0, + "step": 11558 + }, + { + "epoch": 1.3736185383244206, + "grad_norm": 0.2639292034388915, + "learning_rate": 6.247956223708219e-06, + "loss": 0.7716, + "num_tokens": 48313872758.0, + "step": 11559 + }, + { + "epoch": 1.3737373737373737, + "grad_norm": 0.27590955445646725, + "learning_rate": 6.2464854551483075e-06, + "loss": 0.7933, + "num_tokens": 48318059267.0, + "step": 11560 + }, + { + "epoch": 1.3738562091503268, + "grad_norm": 0.2688361209266329, + "learning_rate": 6.245014862627115e-06, + "loss": 0.8597, + "num_tokens": 48322249180.0, + "step": 11561 + }, + { + "epoch": 1.37397504456328, + "grad_norm": 0.26869253565956375, + "learning_rate": 6.243544446199098e-06, + "loss": 0.7894, + "num_tokens": 48326362226.0, + "step": 11562 + }, + { + "epoch": 1.374093879976233, + "grad_norm": 0.31152875109738565, + "learning_rate": 6.2420742059187074e-06, + "loss": 0.8106, + "num_tokens": 48330525656.0, + "step": 11563 + }, + { + "epoch": 1.374212715389186, + "grad_norm": 0.268715256249847, + "learning_rate": 6.2406041418403995e-06, + "loss": 0.7942, + "num_tokens": 48334657680.0, + "step": 11564 + }, + { + "epoch": 1.374331550802139, + "grad_norm": 0.3086097461976024, + "learning_rate": 6.239134254018612e-06, + "loss": 0.8283, + "num_tokens": 48338823673.0, + "step": 11565 + }, + { + "epoch": 1.374450386215092, + "grad_norm": 0.2823907836380608, + "learning_rate": 6.237664542507778e-06, + "loss": 0.7993, + "num_tokens": 48342995654.0, + "step": 11566 + }, + { + "epoch": 1.3745692216280452, + "grad_norm": 0.274439241444952, + "learning_rate": 6.2361950073623335e-06, + "loss": 0.7999, + "num_tokens": 48347185210.0, + "step": 11567 + }, + { + "epoch": 1.3746880570409983, + "grad_norm": 0.2611705922019705, + "learning_rate": 6.234725648636691e-06, + "loss": 0.8088, + "num_tokens": 48351372911.0, + "step": 11568 + }, + { + "epoch": 1.3748068924539512, + "grad_norm": 0.2778970441484161, + "learning_rate": 6.2332564663852665e-06, + "loss": 0.8438, + "num_tokens": 48355561143.0, + "step": 11569 + }, + { + "epoch": 1.3749257278669043, + "grad_norm": 0.26245177899369493, + "learning_rate": 6.231787460662474e-06, + "loss": 0.8069, + "num_tokens": 48359750970.0, + "step": 11570 + }, + { + "epoch": 1.3750445632798574, + "grad_norm": 0.29639669621890347, + "learning_rate": 6.230318631522714e-06, + "loss": 0.8121, + "num_tokens": 48363939974.0, + "step": 11571 + }, + { + "epoch": 1.3751633986928105, + "grad_norm": 0.27684988133775623, + "learning_rate": 6.228849979020385e-06, + "loss": 0.8412, + "num_tokens": 48368117859.0, + "step": 11572 + }, + { + "epoch": 1.3752822341057636, + "grad_norm": 0.2530942002043001, + "learning_rate": 6.22738150320987e-06, + "loss": 0.8357, + "num_tokens": 48372306733.0, + "step": 11573 + }, + { + "epoch": 1.3754010695187167, + "grad_norm": 0.3014200111033193, + "learning_rate": 6.2259132041455525e-06, + "loss": 0.8257, + "num_tokens": 48376482197.0, + "step": 11574 + }, + { + "epoch": 1.3755199049316698, + "grad_norm": 0.27194411342239355, + "learning_rate": 6.22444508188181e-06, + "loss": 0.8271, + "num_tokens": 48380651369.0, + "step": 11575 + }, + { + "epoch": 1.3756387403446226, + "grad_norm": 0.2841856987911746, + "learning_rate": 6.222977136473015e-06, + "loss": 0.8188, + "num_tokens": 48384832154.0, + "step": 11576 + }, + { + "epoch": 1.3757575757575757, + "grad_norm": 0.25990239624226563, + "learning_rate": 6.221509367973532e-06, + "loss": 0.8186, + "num_tokens": 48388993492.0, + "step": 11577 + }, + { + "epoch": 1.3758764111705288, + "grad_norm": 0.28080176469946433, + "learning_rate": 6.220041776437708e-06, + "loss": 0.8151, + "num_tokens": 48393184292.0, + "step": 11578 + }, + { + "epoch": 1.375995246583482, + "grad_norm": 0.2901835389221518, + "learning_rate": 6.2185743619199e-06, + "loss": 0.8018, + "num_tokens": 48397373963.0, + "step": 11579 + }, + { + "epoch": 1.3761140819964348, + "grad_norm": 0.31855039634978316, + "learning_rate": 6.2171071244744504e-06, + "loss": 0.8323, + "num_tokens": 48401564689.0, + "step": 11580 + }, + { + "epoch": 1.376232917409388, + "grad_norm": 0.31476337260730974, + "learning_rate": 6.215640064155695e-06, + "loss": 0.8256, + "num_tokens": 48405738933.0, + "step": 11581 + }, + { + "epoch": 1.376351752822341, + "grad_norm": 0.3148495014608969, + "learning_rate": 6.214173181017964e-06, + "loss": 0.8281, + "num_tokens": 48409928617.0, + "step": 11582 + }, + { + "epoch": 1.3764705882352941, + "grad_norm": 0.2742307583374493, + "learning_rate": 6.212706475115582e-06, + "loss": 0.7937, + "num_tokens": 48414109789.0, + "step": 11583 + }, + { + "epoch": 1.3765894236482472, + "grad_norm": 0.28010682991827185, + "learning_rate": 6.211239946502865e-06, + "loss": 0.7945, + "num_tokens": 48418241774.0, + "step": 11584 + }, + { + "epoch": 1.3767082590612003, + "grad_norm": 0.2761630369118625, + "learning_rate": 6.209773595234124e-06, + "loss": 0.8345, + "num_tokens": 48422428413.0, + "step": 11585 + }, + { + "epoch": 1.3768270944741534, + "grad_norm": 0.29122015461071454, + "learning_rate": 6.208307421363663e-06, + "loss": 0.792, + "num_tokens": 48426591452.0, + "step": 11586 + }, + { + "epoch": 1.3769459298871063, + "grad_norm": 0.2595396919121454, + "learning_rate": 6.20684142494578e-06, + "loss": 0.782, + "num_tokens": 48430781278.0, + "step": 11587 + }, + { + "epoch": 1.3770647653000594, + "grad_norm": 0.27200194256299204, + "learning_rate": 6.205375606034764e-06, + "loss": 0.8151, + "num_tokens": 48434971341.0, + "step": 11588 + }, + { + "epoch": 1.3771836007130125, + "grad_norm": 0.30578201790104986, + "learning_rate": 6.2039099646849e-06, + "loss": 0.8103, + "num_tokens": 48439160893.0, + "step": 11589 + }, + { + "epoch": 1.3773024361259656, + "grad_norm": 0.2565218129503793, + "learning_rate": 6.202444500950466e-06, + "loss": 0.7942, + "num_tokens": 48443349267.0, + "step": 11590 + }, + { + "epoch": 1.3774212715389185, + "grad_norm": 0.2698844055051415, + "learning_rate": 6.200979214885734e-06, + "loss": 0.797, + "num_tokens": 48447539124.0, + "step": 11591 + }, + { + "epoch": 1.3775401069518716, + "grad_norm": 0.27142702250813033, + "learning_rate": 6.199514106544966e-06, + "loss": 0.8187, + "num_tokens": 48451729216.0, + "step": 11592 + }, + { + "epoch": 1.3776589423648247, + "grad_norm": 0.27005012296702846, + "learning_rate": 6.198049175982427e-06, + "loss": 0.7995, + "num_tokens": 48455895716.0, + "step": 11593 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 0.27371734490917743, + "learning_rate": 6.196584423252354e-06, + "loss": 0.8285, + "num_tokens": 48460083422.0, + "step": 11594 + }, + { + "epoch": 1.3778966131907309, + "grad_norm": 0.2751170966160357, + "learning_rate": 6.195119848409006e-06, + "loss": 0.8199, + "num_tokens": 48464272176.0, + "step": 11595 + }, + { + "epoch": 1.378015448603684, + "grad_norm": 0.2822379583050002, + "learning_rate": 6.193655451506615e-06, + "loss": 0.8133, + "num_tokens": 48468461318.0, + "step": 11596 + }, + { + "epoch": 1.378134284016637, + "grad_norm": 0.27872068521683857, + "learning_rate": 6.1921912325994115e-06, + "loss": 0.7665, + "num_tokens": 48472638648.0, + "step": 11597 + }, + { + "epoch": 1.37825311942959, + "grad_norm": 0.26935938965726686, + "learning_rate": 6.190727191741628e-06, + "loss": 0.8156, + "num_tokens": 48476824530.0, + "step": 11598 + }, + { + "epoch": 1.378371954842543, + "grad_norm": 0.24593801375127447, + "learning_rate": 6.189263328987471e-06, + "loss": 0.7961, + "num_tokens": 48481006812.0, + "step": 11599 + }, + { + "epoch": 1.3784907902554961, + "grad_norm": 0.2752156772772311, + "learning_rate": 6.187799644391162e-06, + "loss": 0.7847, + "num_tokens": 48485152536.0, + "step": 11600 + }, + { + "epoch": 1.3786096256684492, + "grad_norm": 0.26888706397447293, + "learning_rate": 6.1863361380068965e-06, + "loss": 0.8406, + "num_tokens": 48489342611.0, + "step": 11601 + }, + { + "epoch": 1.3787284610814021, + "grad_norm": 0.25937322767766596, + "learning_rate": 6.184872809888883e-06, + "loss": 0.8063, + "num_tokens": 48493508769.0, + "step": 11602 + }, + { + "epoch": 1.3788472964943552, + "grad_norm": 0.269448688885482, + "learning_rate": 6.183409660091314e-06, + "loss": 0.8059, + "num_tokens": 48497697445.0, + "step": 11603 + }, + { + "epoch": 1.3789661319073083, + "grad_norm": 0.2684007788914602, + "learning_rate": 6.181946688668367e-06, + "loss": 0.7874, + "num_tokens": 48501869275.0, + "step": 11604 + }, + { + "epoch": 1.3790849673202614, + "grad_norm": 0.2642518945319889, + "learning_rate": 6.180483895674223e-06, + "loss": 0.7959, + "num_tokens": 48506059553.0, + "step": 11605 + }, + { + "epoch": 1.3792038027332145, + "grad_norm": 0.2659154851294897, + "learning_rate": 6.1790212811630564e-06, + "loss": 0.7963, + "num_tokens": 48510241039.0, + "step": 11606 + }, + { + "epoch": 1.3793226381461676, + "grad_norm": 0.2509688289844939, + "learning_rate": 6.177558845189029e-06, + "loss": 0.7735, + "num_tokens": 48514431361.0, + "step": 11607 + }, + { + "epoch": 1.3794414735591207, + "grad_norm": 0.2594383476958796, + "learning_rate": 6.17609658780631e-06, + "loss": 0.8051, + "num_tokens": 48518614180.0, + "step": 11608 + }, + { + "epoch": 1.3795603089720736, + "grad_norm": 0.26096360669384805, + "learning_rate": 6.174634509069038e-06, + "loss": 0.8121, + "num_tokens": 48522804537.0, + "step": 11609 + }, + { + "epoch": 1.3796791443850267, + "grad_norm": 0.26890936586941067, + "learning_rate": 6.173172609031368e-06, + "loss": 0.8012, + "num_tokens": 48526983497.0, + "step": 11610 + }, + { + "epoch": 1.3797979797979798, + "grad_norm": 0.2550262881144769, + "learning_rate": 6.171710887747435e-06, + "loss": 0.8028, + "num_tokens": 48531172445.0, + "step": 11611 + }, + { + "epoch": 1.379916815210933, + "grad_norm": 0.2875505225494928, + "learning_rate": 6.170249345271373e-06, + "loss": 0.7704, + "num_tokens": 48535361498.0, + "step": 11612 + }, + { + "epoch": 1.380035650623886, + "grad_norm": 0.2685361220031916, + "learning_rate": 6.168787981657307e-06, + "loss": 0.8741, + "num_tokens": 48539526695.0, + "step": 11613 + }, + { + "epoch": 1.3801544860368389, + "grad_norm": 0.2895896114075103, + "learning_rate": 6.1673267969593584e-06, + "loss": 0.775, + "num_tokens": 48543697601.0, + "step": 11614 + }, + { + "epoch": 1.380273321449792, + "grad_norm": 0.2741459628739024, + "learning_rate": 6.165865791231635e-06, + "loss": 0.8149, + "num_tokens": 48547887710.0, + "step": 11615 + }, + { + "epoch": 1.380392156862745, + "grad_norm": 0.29843535135141014, + "learning_rate": 6.164404964528249e-06, + "loss": 0.8127, + "num_tokens": 48552077172.0, + "step": 11616 + }, + { + "epoch": 1.3805109922756982, + "grad_norm": 0.2860125019490078, + "learning_rate": 6.1629443169032956e-06, + "loss": 0.8307, + "num_tokens": 48556267158.0, + "step": 11617 + }, + { + "epoch": 1.3806298276886513, + "grad_norm": 0.2944052243793328, + "learning_rate": 6.161483848410868e-06, + "loss": 0.8089, + "num_tokens": 48560426771.0, + "step": 11618 + }, + { + "epoch": 1.3807486631016044, + "grad_norm": 0.2912845846267707, + "learning_rate": 6.160023559105057e-06, + "loss": 0.8208, + "num_tokens": 48564572040.0, + "step": 11619 + }, + { + "epoch": 1.3808674985145573, + "grad_norm": 0.3001039333517108, + "learning_rate": 6.158563449039931e-06, + "loss": 0.8333, + "num_tokens": 48568761963.0, + "step": 11620 + }, + { + "epoch": 1.3809863339275104, + "grad_norm": 0.26681839123809414, + "learning_rate": 6.1571035182695735e-06, + "loss": 0.7975, + "num_tokens": 48572951376.0, + "step": 11621 + }, + { + "epoch": 1.3811051693404635, + "grad_norm": 0.27589078373603204, + "learning_rate": 6.1556437668480466e-06, + "loss": 0.8087, + "num_tokens": 48577113932.0, + "step": 11622 + }, + { + "epoch": 1.3812240047534166, + "grad_norm": 0.26295415794191423, + "learning_rate": 6.15418419482941e-06, + "loss": 0.7777, + "num_tokens": 48581303485.0, + "step": 11623 + }, + { + "epoch": 1.3813428401663697, + "grad_norm": 0.2633778402043057, + "learning_rate": 6.152724802267721e-06, + "loss": 0.8251, + "num_tokens": 48585492088.0, + "step": 11624 + }, + { + "epoch": 1.3814616755793225, + "grad_norm": 0.2918930490486255, + "learning_rate": 6.151265589217017e-06, + "loss": 0.7784, + "num_tokens": 48589681081.0, + "step": 11625 + }, + { + "epoch": 1.3815805109922756, + "grad_norm": 0.2785157841914869, + "learning_rate": 6.149806555731339e-06, + "loss": 0.7839, + "num_tokens": 48593845624.0, + "step": 11626 + }, + { + "epoch": 1.3816993464052287, + "grad_norm": 0.28958557129458895, + "learning_rate": 6.148347701864728e-06, + "loss": 0.8067, + "num_tokens": 48598018557.0, + "step": 11627 + }, + { + "epoch": 1.3818181818181818, + "grad_norm": 0.26713222149802945, + "learning_rate": 6.146889027671204e-06, + "loss": 0.8255, + "num_tokens": 48602181782.0, + "step": 11628 + }, + { + "epoch": 1.381937017231135, + "grad_norm": 0.27532260459997676, + "learning_rate": 6.1454305332047916e-06, + "loss": 0.8035, + "num_tokens": 48606359251.0, + "step": 11629 + }, + { + "epoch": 1.382055852644088, + "grad_norm": 0.2582284258234224, + "learning_rate": 6.143972218519499e-06, + "loss": 0.8065, + "num_tokens": 48610549408.0, + "step": 11630 + }, + { + "epoch": 1.3821746880570411, + "grad_norm": 0.2715168644361417, + "learning_rate": 6.14251408366933e-06, + "loss": 0.8058, + "num_tokens": 48614738417.0, + "step": 11631 + }, + { + "epoch": 1.382293523469994, + "grad_norm": 0.298946991154319, + "learning_rate": 6.14105612870829e-06, + "loss": 0.8209, + "num_tokens": 48618914678.0, + "step": 11632 + }, + { + "epoch": 1.3824123588829471, + "grad_norm": 0.2486638228142931, + "learning_rate": 6.139598353690366e-06, + "loss": 0.8248, + "num_tokens": 48623075561.0, + "step": 11633 + }, + { + "epoch": 1.3825311942959002, + "grad_norm": 0.3081527566943555, + "learning_rate": 6.138140758669556e-06, + "loss": 0.8082, + "num_tokens": 48627265041.0, + "step": 11634 + }, + { + "epoch": 1.3826500297088533, + "grad_norm": 0.25810311661814234, + "learning_rate": 6.136683343699828e-06, + "loss": 0.8062, + "num_tokens": 48631437878.0, + "step": 11635 + }, + { + "epoch": 1.3827688651218062, + "grad_norm": 0.3386774071952336, + "learning_rate": 6.13522610883516e-06, + "loss": 0.8264, + "num_tokens": 48635626831.0, + "step": 11636 + }, + { + "epoch": 1.3828877005347593, + "grad_norm": 0.26670181924049957, + "learning_rate": 6.133769054129517e-06, + "loss": 0.8444, + "num_tokens": 48639815142.0, + "step": 11637 + }, + { + "epoch": 1.3830065359477124, + "grad_norm": 0.3148391893191338, + "learning_rate": 6.132312179636858e-06, + "loss": 0.8062, + "num_tokens": 48644004536.0, + "step": 11638 + }, + { + "epoch": 1.3831253713606655, + "grad_norm": 0.27348588778242827, + "learning_rate": 6.130855485411138e-06, + "loss": 0.7851, + "num_tokens": 48648179378.0, + "step": 11639 + }, + { + "epoch": 1.3832442067736186, + "grad_norm": 0.2649445463564252, + "learning_rate": 6.129398971506302e-06, + "loss": 0.7947, + "num_tokens": 48652369394.0, + "step": 11640 + }, + { + "epoch": 1.3833630421865717, + "grad_norm": 0.30411662160064484, + "learning_rate": 6.12794263797629e-06, + "loss": 0.8328, + "num_tokens": 48656559140.0, + "step": 11641 + }, + { + "epoch": 1.3834818775995248, + "grad_norm": 0.31830767967942036, + "learning_rate": 6.126486484875035e-06, + "loss": 0.8259, + "num_tokens": 48660747884.0, + "step": 11642 + }, + { + "epoch": 1.3836007130124777, + "grad_norm": 0.2681089195617238, + "learning_rate": 6.125030512256463e-06, + "loss": 0.8009, + "num_tokens": 48664937316.0, + "step": 11643 + }, + { + "epoch": 1.3837195484254308, + "grad_norm": 0.2828257596007282, + "learning_rate": 6.123574720174495e-06, + "loss": 0.7936, + "num_tokens": 48669077225.0, + "step": 11644 + }, + { + "epoch": 1.3838383838383839, + "grad_norm": 0.2682054837624244, + "learning_rate": 6.122119108683041e-06, + "loss": 0.8179, + "num_tokens": 48673239043.0, + "step": 11645 + }, + { + "epoch": 1.383957219251337, + "grad_norm": 0.2708213721961007, + "learning_rate": 6.12066367783601e-06, + "loss": 0.7835, + "num_tokens": 48677420568.0, + "step": 11646 + }, + { + "epoch": 1.3840760546642898, + "grad_norm": 0.26727336744838026, + "learning_rate": 6.1192084276872994e-06, + "loss": 0.8256, + "num_tokens": 48681608501.0, + "step": 11647 + }, + { + "epoch": 1.384194890077243, + "grad_norm": 0.27343123837531913, + "learning_rate": 6.117753358290803e-06, + "loss": 0.7951, + "num_tokens": 48685798031.0, + "step": 11648 + }, + { + "epoch": 1.384313725490196, + "grad_norm": 0.27546343798022094, + "learning_rate": 6.1162984697004095e-06, + "loss": 0.8029, + "num_tokens": 48689987574.0, + "step": 11649 + }, + { + "epoch": 1.3844325609031491, + "grad_norm": 0.3391149530950515, + "learning_rate": 6.114843761969998e-06, + "loss": 0.8375, + "num_tokens": 48694176246.0, + "step": 11650 + }, + { + "epoch": 1.3845513963161022, + "grad_norm": 0.2661301094564209, + "learning_rate": 6.113389235153434e-06, + "loss": 0.8079, + "num_tokens": 48698365169.0, + "step": 11651 + }, + { + "epoch": 1.3846702317290553, + "grad_norm": 0.3214614474240505, + "learning_rate": 6.111934889304587e-06, + "loss": 0.8607, + "num_tokens": 48702553989.0, + "step": 11652 + }, + { + "epoch": 1.3847890671420084, + "grad_norm": 0.2790444485022628, + "learning_rate": 6.11048072447732e-06, + "loss": 0.8224, + "num_tokens": 48706743588.0, + "step": 11653 + }, + { + "epoch": 1.3849079025549613, + "grad_norm": 0.30627277956898585, + "learning_rate": 6.109026740725486e-06, + "loss": 0.7857, + "num_tokens": 48710932115.0, + "step": 11654 + }, + { + "epoch": 1.3850267379679144, + "grad_norm": 0.2914487511498976, + "learning_rate": 6.107572938102928e-06, + "loss": 0.8118, + "num_tokens": 48715075350.0, + "step": 11655 + }, + { + "epoch": 1.3851455733808675, + "grad_norm": 0.33030663336876026, + "learning_rate": 6.106119316663486e-06, + "loss": 0.8289, + "num_tokens": 48719228327.0, + "step": 11656 + }, + { + "epoch": 1.3852644087938206, + "grad_norm": 0.27089783932753103, + "learning_rate": 6.104665876460991e-06, + "loss": 0.8154, + "num_tokens": 48723413647.0, + "step": 11657 + }, + { + "epoch": 1.3853832442067735, + "grad_norm": 0.2794478748900525, + "learning_rate": 6.103212617549267e-06, + "loss": 0.7936, + "num_tokens": 48727603559.0, + "step": 11658 + }, + { + "epoch": 1.3855020796197266, + "grad_norm": 0.27863084005534955, + "learning_rate": 6.101759539982139e-06, + "loss": 0.8296, + "num_tokens": 48731791766.0, + "step": 11659 + }, + { + "epoch": 1.3856209150326797, + "grad_norm": 0.3107070156700952, + "learning_rate": 6.1003066438134205e-06, + "loss": 0.8205, + "num_tokens": 48735979483.0, + "step": 11660 + }, + { + "epoch": 1.3857397504456328, + "grad_norm": 0.28331610129953505, + "learning_rate": 6.098853929096909e-06, + "loss": 0.8066, + "num_tokens": 48740142623.0, + "step": 11661 + }, + { + "epoch": 1.385858585858586, + "grad_norm": 0.2921647481354928, + "learning_rate": 6.09740139588641e-06, + "loss": 0.784, + "num_tokens": 48744279239.0, + "step": 11662 + }, + { + "epoch": 1.385977421271539, + "grad_norm": 0.271027719259743, + "learning_rate": 6.0959490442357115e-06, + "loss": 0.7815, + "num_tokens": 48748439432.0, + "step": 11663 + }, + { + "epoch": 1.386096256684492, + "grad_norm": 0.32966264067487827, + "learning_rate": 6.094496874198596e-06, + "loss": 0.823, + "num_tokens": 48752628032.0, + "step": 11664 + }, + { + "epoch": 1.386215092097445, + "grad_norm": 0.2702395472241995, + "learning_rate": 6.093044885828857e-06, + "loss": 0.8148, + "num_tokens": 48756784496.0, + "step": 11665 + }, + { + "epoch": 1.386333927510398, + "grad_norm": 0.29934617509349476, + "learning_rate": 6.091593079180251e-06, + "loss": 0.8013, + "num_tokens": 48760975608.0, + "step": 11666 + }, + { + "epoch": 1.3864527629233512, + "grad_norm": 0.3064735666760007, + "learning_rate": 6.0901414543065516e-06, + "loss": 0.8307, + "num_tokens": 48765165960.0, + "step": 11667 + }, + { + "epoch": 1.3865715983363043, + "grad_norm": 0.28805796081388807, + "learning_rate": 6.088690011261513e-06, + "loss": 0.8143, + "num_tokens": 48769340713.0, + "step": 11668 + }, + { + "epoch": 1.3866904337492572, + "grad_norm": 0.28660136638711203, + "learning_rate": 6.0872387500988895e-06, + "loss": 0.824, + "num_tokens": 48773530895.0, + "step": 11669 + }, + { + "epoch": 1.3868092691622103, + "grad_norm": 0.28516858934967265, + "learning_rate": 6.085787670872424e-06, + "loss": 0.8109, + "num_tokens": 48777717999.0, + "step": 11670 + }, + { + "epoch": 1.3869281045751634, + "grad_norm": 0.3008442107685668, + "learning_rate": 6.084336773635857e-06, + "loss": 0.8097, + "num_tokens": 48781890279.0, + "step": 11671 + }, + { + "epoch": 1.3870469399881165, + "grad_norm": 0.27816263090759197, + "learning_rate": 6.08288605844292e-06, + "loss": 0.8319, + "num_tokens": 48786075461.0, + "step": 11672 + }, + { + "epoch": 1.3871657754010696, + "grad_norm": 0.3017593319400989, + "learning_rate": 6.081435525347338e-06, + "loss": 0.7933, + "num_tokens": 48790264731.0, + "step": 11673 + }, + { + "epoch": 1.3872846108140227, + "grad_norm": 0.2658688204003623, + "learning_rate": 6.0799851744028286e-06, + "loss": 0.8331, + "num_tokens": 48794453783.0, + "step": 11674 + }, + { + "epoch": 1.3874034462269758, + "grad_norm": 0.3169900363627196, + "learning_rate": 6.0785350056631024e-06, + "loss": 0.7824, + "num_tokens": 48798631514.0, + "step": 11675 + }, + { + "epoch": 1.3875222816399286, + "grad_norm": 0.26327835926648435, + "learning_rate": 6.0770850191818645e-06, + "loss": 0.8194, + "num_tokens": 48802820894.0, + "step": 11676 + }, + { + "epoch": 1.3876411170528817, + "grad_norm": 0.3043785998807094, + "learning_rate": 6.0756352150128125e-06, + "loss": 0.8518, + "num_tokens": 48807009455.0, + "step": 11677 + }, + { + "epoch": 1.3877599524658348, + "grad_norm": 0.2895170482087746, + "learning_rate": 6.074185593209641e-06, + "loss": 0.869, + "num_tokens": 48811198390.0, + "step": 11678 + }, + { + "epoch": 1.387878787878788, + "grad_norm": 0.3306381796749513, + "learning_rate": 6.072736153826029e-06, + "loss": 0.8406, + "num_tokens": 48815347112.0, + "step": 11679 + }, + { + "epoch": 1.3879976232917408, + "grad_norm": 0.28983633131266673, + "learning_rate": 6.071286896915658e-06, + "loss": 0.8431, + "num_tokens": 48819504771.0, + "step": 11680 + }, + { + "epoch": 1.388116458704694, + "grad_norm": 0.32926240935160983, + "learning_rate": 6.069837822532196e-06, + "loss": 0.795, + "num_tokens": 48823661762.0, + "step": 11681 + }, + { + "epoch": 1.388235294117647, + "grad_norm": 0.27543283321075324, + "learning_rate": 6.068388930729313e-06, + "loss": 0.815, + "num_tokens": 48827838836.0, + "step": 11682 + }, + { + "epoch": 1.3883541295306001, + "grad_norm": 0.30065801923993524, + "learning_rate": 6.066940221560656e-06, + "loss": 0.8396, + "num_tokens": 48832003535.0, + "step": 11683 + }, + { + "epoch": 1.3884729649435532, + "grad_norm": 0.29070444272001084, + "learning_rate": 6.0654916950798835e-06, + "loss": 0.8501, + "num_tokens": 48836193281.0, + "step": 11684 + }, + { + "epoch": 1.3885918003565063, + "grad_norm": 0.2732164102464452, + "learning_rate": 6.0640433513406395e-06, + "loss": 0.8081, + "num_tokens": 48840371812.0, + "step": 11685 + }, + { + "epoch": 1.3887106357694594, + "grad_norm": 0.296238140152377, + "learning_rate": 6.062595190396557e-06, + "loss": 0.8204, + "num_tokens": 48844559448.0, + "step": 11686 + }, + { + "epoch": 1.3888294711824123, + "grad_norm": 0.26948689187176306, + "learning_rate": 6.0611472123012736e-06, + "loss": 0.8036, + "num_tokens": 48848749785.0, + "step": 11687 + }, + { + "epoch": 1.3889483065953654, + "grad_norm": 0.3652299153322071, + "learning_rate": 6.059699417108402e-06, + "loss": 0.8041, + "num_tokens": 48852939005.0, + "step": 11688 + }, + { + "epoch": 1.3890671420083185, + "grad_norm": 0.2781021348415912, + "learning_rate": 6.0582518048715666e-06, + "loss": 0.7957, + "num_tokens": 48857127227.0, + "step": 11689 + }, + { + "epoch": 1.3891859774212716, + "grad_norm": 0.3426988339457182, + "learning_rate": 6.0568043756443715e-06, + "loss": 0.783, + "num_tokens": 48861310431.0, + "step": 11690 + }, + { + "epoch": 1.3893048128342245, + "grad_norm": 0.27905057944926265, + "learning_rate": 6.0553571294804255e-06, + "loss": 0.8068, + "num_tokens": 48865499604.0, + "step": 11691 + }, + { + "epoch": 1.3894236482471776, + "grad_norm": 0.30672581658359976, + "learning_rate": 6.053910066433328e-06, + "loss": 0.8007, + "num_tokens": 48869672423.0, + "step": 11692 + }, + { + "epoch": 1.3895424836601307, + "grad_norm": 0.2903089339151702, + "learning_rate": 6.052463186556662e-06, + "loss": 0.8371, + "num_tokens": 48873861670.0, + "step": 11693 + }, + { + "epoch": 1.3896613190730838, + "grad_norm": 0.30767427950202586, + "learning_rate": 6.0510164899040105e-06, + "loss": 0.8228, + "num_tokens": 48878045246.0, + "step": 11694 + }, + { + "epoch": 1.3897801544860369, + "grad_norm": 0.2722404690002333, + "learning_rate": 6.049569976528951e-06, + "loss": 0.7851, + "num_tokens": 48882235681.0, + "step": 11695 + }, + { + "epoch": 1.38989898989899, + "grad_norm": 0.2790218205736037, + "learning_rate": 6.048123646485054e-06, + "loss": 0.7726, + "num_tokens": 48886423859.0, + "step": 11696 + }, + { + "epoch": 1.390017825311943, + "grad_norm": 0.3071368119331151, + "learning_rate": 6.046677499825879e-06, + "loss": 0.8432, + "num_tokens": 48890591347.0, + "step": 11697 + }, + { + "epoch": 1.390136660724896, + "grad_norm": 0.27628589264750664, + "learning_rate": 6.045231536604987e-06, + "loss": 0.7947, + "num_tokens": 48894763311.0, + "step": 11698 + }, + { + "epoch": 1.390255496137849, + "grad_norm": 0.29700124187805765, + "learning_rate": 6.043785756875922e-06, + "loss": 0.8097, + "num_tokens": 48898951070.0, + "step": 11699 + }, + { + "epoch": 1.3903743315508021, + "grad_norm": 0.28737062170588235, + "learning_rate": 6.042340160692226e-06, + "loss": 0.8186, + "num_tokens": 48903139858.0, + "step": 11700 + }, + { + "epoch": 1.3904931669637552, + "grad_norm": 0.28752317147272677, + "learning_rate": 6.0408947481074386e-06, + "loss": 0.8398, + "num_tokens": 48907311493.0, + "step": 11701 + }, + { + "epoch": 1.3906120023767081, + "grad_norm": 0.2852154786792859, + "learning_rate": 6.039449519175085e-06, + "loss": 0.8384, + "num_tokens": 48911501419.0, + "step": 11702 + }, + { + "epoch": 1.3907308377896612, + "grad_norm": 0.25622498526233717, + "learning_rate": 6.038004473948689e-06, + "loss": 0.8037, + "num_tokens": 48915663412.0, + "step": 11703 + }, + { + "epoch": 1.3908496732026143, + "grad_norm": 0.29680380894304415, + "learning_rate": 6.036559612481763e-06, + "loss": 0.8177, + "num_tokens": 48919849477.0, + "step": 11704 + }, + { + "epoch": 1.3909685086155674, + "grad_norm": 0.2532722719818583, + "learning_rate": 6.035114934827815e-06, + "loss": 0.8059, + "num_tokens": 48924039116.0, + "step": 11705 + }, + { + "epoch": 1.3910873440285205, + "grad_norm": 0.265498595415978, + "learning_rate": 6.0336704410403494e-06, + "loss": 0.8238, + "num_tokens": 48928201324.0, + "step": 11706 + }, + { + "epoch": 1.3912061794414736, + "grad_norm": 0.2930497100229, + "learning_rate": 6.03222613117286e-06, + "loss": 0.7848, + "num_tokens": 48932358020.0, + "step": 11707 + }, + { + "epoch": 1.3913250148544267, + "grad_norm": 0.2682333693451301, + "learning_rate": 6.0307820052788345e-06, + "loss": 0.7789, + "num_tokens": 48936548809.0, + "step": 11708 + }, + { + "epoch": 1.3914438502673796, + "grad_norm": 0.2895956175829536, + "learning_rate": 6.029338063411746e-06, + "loss": 0.8412, + "num_tokens": 48940737991.0, + "step": 11709 + }, + { + "epoch": 1.3915626856803327, + "grad_norm": 0.25026058226183767, + "learning_rate": 6.027894305625079e-06, + "loss": 0.7814, + "num_tokens": 48944923756.0, + "step": 11710 + }, + { + "epoch": 1.3916815210932858, + "grad_norm": 0.3299249103239956, + "learning_rate": 6.026450731972297e-06, + "loss": 0.8283, + "num_tokens": 48949112824.0, + "step": 11711 + }, + { + "epoch": 1.391800356506239, + "grad_norm": 0.3019386233043615, + "learning_rate": 6.025007342506861e-06, + "loss": 0.819, + "num_tokens": 48953301840.0, + "step": 11712 + }, + { + "epoch": 1.391919191919192, + "grad_norm": 0.30781545245859704, + "learning_rate": 6.023564137282227e-06, + "loss": 0.8058, + "num_tokens": 48957489967.0, + "step": 11713 + }, + { + "epoch": 1.3920380273321449, + "grad_norm": 0.29818223685890655, + "learning_rate": 6.022121116351836e-06, + "loss": 0.8067, + "num_tokens": 48961658035.0, + "step": 11714 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 0.3004190845028536, + "learning_rate": 6.0206782797691266e-06, + "loss": 0.853, + "num_tokens": 48965846390.0, + "step": 11715 + }, + { + "epoch": 1.392275698158051, + "grad_norm": 0.3512932988494707, + "learning_rate": 6.019235627587538e-06, + "loss": 0.8521, + "num_tokens": 48970026324.0, + "step": 11716 + }, + { + "epoch": 1.3923945335710042, + "grad_norm": 0.27179076686573883, + "learning_rate": 6.017793159860497e-06, + "loss": 0.8282, + "num_tokens": 48974214993.0, + "step": 11717 + }, + { + "epoch": 1.3925133689839573, + "grad_norm": 0.33231818007781383, + "learning_rate": 6.016350876641421e-06, + "loss": 0.7926, + "num_tokens": 48978404435.0, + "step": 11718 + }, + { + "epoch": 1.3926322043969104, + "grad_norm": 0.2797770116772767, + "learning_rate": 6.014908777983723e-06, + "loss": 0.8388, + "num_tokens": 48982593195.0, + "step": 11719 + }, + { + "epoch": 1.3927510398098633, + "grad_norm": 0.30674429903945755, + "learning_rate": 6.0134668639408044e-06, + "loss": 0.8341, + "num_tokens": 48986751810.0, + "step": 11720 + }, + { + "epoch": 1.3928698752228164, + "grad_norm": 0.2863404419173556, + "learning_rate": 6.012025134566069e-06, + "loss": 0.7989, + "num_tokens": 48990940921.0, + "step": 11721 + }, + { + "epoch": 1.3929887106357695, + "grad_norm": 0.3414633839282239, + "learning_rate": 6.010583589912904e-06, + "loss": 0.8223, + "num_tokens": 48995130614.0, + "step": 11722 + }, + { + "epoch": 1.3931075460487226, + "grad_norm": 0.3170416386402946, + "learning_rate": 6.009142230034705e-06, + "loss": 0.7934, + "num_tokens": 48999320468.0, + "step": 11723 + }, + { + "epoch": 1.3932263814616757, + "grad_norm": 0.3249314807446968, + "learning_rate": 6.00770105498484e-06, + "loss": 0.7885, + "num_tokens": 49003509226.0, + "step": 11724 + }, + { + "epoch": 1.3933452168746285, + "grad_norm": 0.28460694195828845, + "learning_rate": 6.006260064816685e-06, + "loss": 0.7953, + "num_tokens": 49007631367.0, + "step": 11725 + }, + { + "epoch": 1.3934640522875816, + "grad_norm": 0.3211454856145514, + "learning_rate": 6.004819259583604e-06, + "loss": 0.8281, + "num_tokens": 49011821776.0, + "step": 11726 + }, + { + "epoch": 1.3935828877005347, + "grad_norm": 0.2930618875561988, + "learning_rate": 6.003378639338954e-06, + "loss": 0.8539, + "num_tokens": 49016010827.0, + "step": 11727 + }, + { + "epoch": 1.3937017231134878, + "grad_norm": 0.29205376795227045, + "learning_rate": 6.001938204136087e-06, + "loss": 0.8013, + "num_tokens": 49020200081.0, + "step": 11728 + }, + { + "epoch": 1.393820558526441, + "grad_norm": 0.28331205141653115, + "learning_rate": 6.000497954028348e-06, + "loss": 0.8114, + "num_tokens": 49024388593.0, + "step": 11729 + }, + { + "epoch": 1.393939393939394, + "grad_norm": 0.3094204669090038, + "learning_rate": 5.999057889069073e-06, + "loss": 0.7903, + "num_tokens": 49028545341.0, + "step": 11730 + }, + { + "epoch": 1.3940582293523471, + "grad_norm": 0.2593966032584163, + "learning_rate": 5.997618009311594e-06, + "loss": 0.8277, + "num_tokens": 49032734953.0, + "step": 11731 + }, + { + "epoch": 1.3941770647653, + "grad_norm": 0.28292143354725435, + "learning_rate": 5.996178314809233e-06, + "loss": 0.7966, + "num_tokens": 49036909676.0, + "step": 11732 + }, + { + "epoch": 1.3942959001782531, + "grad_norm": 0.2673039393131759, + "learning_rate": 5.994738805615306e-06, + "loss": 0.8184, + "num_tokens": 49041099585.0, + "step": 11733 + }, + { + "epoch": 1.3944147355912062, + "grad_norm": 0.3056825536962617, + "learning_rate": 5.993299481783126e-06, + "loss": 0.782, + "num_tokens": 49045288392.0, + "step": 11734 + }, + { + "epoch": 1.3945335710041593, + "grad_norm": 0.2676028273416132, + "learning_rate": 5.991860343365994e-06, + "loss": 0.8246, + "num_tokens": 49049450306.0, + "step": 11735 + }, + { + "epoch": 1.3946524064171122, + "grad_norm": 0.30717438551418025, + "learning_rate": 5.9904213904172045e-06, + "loss": 0.8127, + "num_tokens": 49053639556.0, + "step": 11736 + }, + { + "epoch": 1.3947712418300653, + "grad_norm": 0.257698143163627, + "learning_rate": 5.9889826229900515e-06, + "loss": 0.7887, + "num_tokens": 49057826712.0, + "step": 11737 + }, + { + "epoch": 1.3948900772430184, + "grad_norm": 0.3225226584043411, + "learning_rate": 5.987544041137814e-06, + "loss": 0.8371, + "num_tokens": 49061995357.0, + "step": 11738 + }, + { + "epoch": 1.3950089126559715, + "grad_norm": 0.26641937472734606, + "learning_rate": 5.98610564491377e-06, + "loss": 0.827, + "num_tokens": 49066169014.0, + "step": 11739 + }, + { + "epoch": 1.3951277480689246, + "grad_norm": 0.3439017261843928, + "learning_rate": 5.984667434371183e-06, + "loss": 0.8404, + "num_tokens": 49070359607.0, + "step": 11740 + }, + { + "epoch": 1.3952465834818777, + "grad_norm": 0.27218325533685617, + "learning_rate": 5.983229409563316e-06, + "loss": 0.7872, + "num_tokens": 49074549502.0, + "step": 11741 + }, + { + "epoch": 1.3953654188948308, + "grad_norm": 0.3651835511481437, + "learning_rate": 5.981791570543428e-06, + "loss": 0.8145, + "num_tokens": 49078708869.0, + "step": 11742 + }, + { + "epoch": 1.3954842543077837, + "grad_norm": 0.2701684723721785, + "learning_rate": 5.980353917364767e-06, + "loss": 0.8352, + "num_tokens": 49082899154.0, + "step": 11743 + }, + { + "epoch": 1.3956030897207368, + "grad_norm": 0.3062317322525319, + "learning_rate": 5.978916450080574e-06, + "loss": 0.842, + "num_tokens": 49087088781.0, + "step": 11744 + }, + { + "epoch": 1.3957219251336899, + "grad_norm": 0.2691485082392287, + "learning_rate": 5.97747916874408e-06, + "loss": 0.8174, + "num_tokens": 49091260970.0, + "step": 11745 + }, + { + "epoch": 1.395840760546643, + "grad_norm": 0.3086549544106341, + "learning_rate": 5.976042073408513e-06, + "loss": 0.7852, + "num_tokens": 49095449923.0, + "step": 11746 + }, + { + "epoch": 1.3959595959595958, + "grad_norm": 0.2989348939170193, + "learning_rate": 5.97460516412709e-06, + "loss": 0.7965, + "num_tokens": 49099639553.0, + "step": 11747 + }, + { + "epoch": 1.396078431372549, + "grad_norm": 0.27748405552556477, + "learning_rate": 5.973168440953035e-06, + "loss": 0.8, + "num_tokens": 49103828491.0, + "step": 11748 + }, + { + "epoch": 1.396197266785502, + "grad_norm": 0.2994025370714784, + "learning_rate": 5.971731903939549e-06, + "loss": 0.8141, + "num_tokens": 49108016181.0, + "step": 11749 + }, + { + "epoch": 1.3963161021984551, + "grad_norm": 0.26530312176174353, + "learning_rate": 5.9702955531398296e-06, + "loss": 0.8086, + "num_tokens": 49112207280.0, + "step": 11750 + }, + { + "epoch": 1.3964349376114082, + "grad_norm": 0.27230734765351283, + "learning_rate": 5.968859388607072e-06, + "loss": 0.7984, + "num_tokens": 49116395976.0, + "step": 11751 + }, + { + "epoch": 1.3965537730243613, + "grad_norm": 0.2503754458601, + "learning_rate": 5.967423410394461e-06, + "loss": 0.776, + "num_tokens": 49120580126.0, + "step": 11752 + }, + { + "epoch": 1.3966726084373144, + "grad_norm": 0.2782574777437034, + "learning_rate": 5.965987618555173e-06, + "loss": 0.7982, + "num_tokens": 49124745241.0, + "step": 11753 + }, + { + "epoch": 1.3967914438502673, + "grad_norm": 0.24827925808578838, + "learning_rate": 5.964552013142393e-06, + "loss": 0.8509, + "num_tokens": 49128914443.0, + "step": 11754 + }, + { + "epoch": 1.3969102792632204, + "grad_norm": 0.250359724585381, + "learning_rate": 5.963116594209273e-06, + "loss": 0.7962, + "num_tokens": 49133098378.0, + "step": 11755 + }, + { + "epoch": 1.3970291146761735, + "grad_norm": 0.2338368006997253, + "learning_rate": 5.961681361808973e-06, + "loss": 0.7816, + "num_tokens": 49137288709.0, + "step": 11756 + }, + { + "epoch": 1.3971479500891266, + "grad_norm": 0.28069515820164065, + "learning_rate": 5.96024631599465e-06, + "loss": 0.8378, + "num_tokens": 49141460333.0, + "step": 11757 + }, + { + "epoch": 1.3972667855020795, + "grad_norm": 0.29520151898117813, + "learning_rate": 5.958811456819446e-06, + "loss": 0.8548, + "num_tokens": 49145580970.0, + "step": 11758 + }, + { + "epoch": 1.3973856209150326, + "grad_norm": 0.2675197654910467, + "learning_rate": 5.957376784336497e-06, + "loss": 0.81, + "num_tokens": 49149749766.0, + "step": 11759 + }, + { + "epoch": 1.3975044563279857, + "grad_norm": 0.28700013801967234, + "learning_rate": 5.955942298598934e-06, + "loss": 0.8076, + "num_tokens": 49153938442.0, + "step": 11760 + }, + { + "epoch": 1.3976232917409388, + "grad_norm": 0.26139937739913655, + "learning_rate": 5.954507999659885e-06, + "loss": 0.8082, + "num_tokens": 49158098384.0, + "step": 11761 + }, + { + "epoch": 1.397742127153892, + "grad_norm": 0.2768557321788818, + "learning_rate": 5.953073887572463e-06, + "loss": 0.8487, + "num_tokens": 49162280882.0, + "step": 11762 + }, + { + "epoch": 1.397860962566845, + "grad_norm": 0.2600185125300496, + "learning_rate": 5.951639962389777e-06, + "loss": 0.7763, + "num_tokens": 49166453209.0, + "step": 11763 + }, + { + "epoch": 1.397979797979798, + "grad_norm": 0.26474110250001515, + "learning_rate": 5.950206224164934e-06, + "loss": 0.802, + "num_tokens": 49170642480.0, + "step": 11764 + }, + { + "epoch": 1.398098633392751, + "grad_norm": 0.26670393393802794, + "learning_rate": 5.9487726729510305e-06, + "loss": 0.8075, + "num_tokens": 49174831931.0, + "step": 11765 + }, + { + "epoch": 1.398217468805704, + "grad_norm": 0.2574044882724575, + "learning_rate": 5.947339308801148e-06, + "loss": 0.7837, + "num_tokens": 49179018120.0, + "step": 11766 + }, + { + "epoch": 1.3983363042186572, + "grad_norm": 0.2722194635571118, + "learning_rate": 5.945906131768378e-06, + "loss": 0.8, + "num_tokens": 49183206449.0, + "step": 11767 + }, + { + "epoch": 1.3984551396316103, + "grad_norm": 0.27218223036384465, + "learning_rate": 5.944473141905791e-06, + "loss": 0.8143, + "num_tokens": 49187371050.0, + "step": 11768 + }, + { + "epoch": 1.3985739750445632, + "grad_norm": 0.2689812210024139, + "learning_rate": 5.943040339266458e-06, + "loss": 0.8158, + "num_tokens": 49191540976.0, + "step": 11769 + }, + { + "epoch": 1.3986928104575163, + "grad_norm": 0.2720947258199616, + "learning_rate": 5.941607723903442e-06, + "loss": 0.8341, + "num_tokens": 49195657970.0, + "step": 11770 + }, + { + "epoch": 1.3988116458704694, + "grad_norm": 0.2840523535649371, + "learning_rate": 5.940175295869791e-06, + "loss": 0.7666, + "num_tokens": 49199848293.0, + "step": 11771 + }, + { + "epoch": 1.3989304812834225, + "grad_norm": 0.2708657451476003, + "learning_rate": 5.9387430552185536e-06, + "loss": 0.8451, + "num_tokens": 49204004557.0, + "step": 11772 + }, + { + "epoch": 1.3990493166963756, + "grad_norm": 0.29399880898259895, + "learning_rate": 5.937311002002777e-06, + "loss": 0.8494, + "num_tokens": 49208174522.0, + "step": 11773 + }, + { + "epoch": 1.3991681521093287, + "grad_norm": 0.25367379982387, + "learning_rate": 5.935879136275489e-06, + "loss": 0.7873, + "num_tokens": 49212362866.0, + "step": 11774 + }, + { + "epoch": 1.3992869875222818, + "grad_norm": 0.28932136732011704, + "learning_rate": 5.934447458089725e-06, + "loss": 0.8459, + "num_tokens": 49216537506.0, + "step": 11775 + }, + { + "epoch": 1.3994058229352346, + "grad_norm": 0.2892425425776694, + "learning_rate": 5.933015967498495e-06, + "loss": 0.8118, + "num_tokens": 49220726045.0, + "step": 11776 + }, + { + "epoch": 1.3995246583481877, + "grad_norm": 0.2910609139699313, + "learning_rate": 5.931584664554813e-06, + "loss": 0.7897, + "num_tokens": 49224915165.0, + "step": 11777 + }, + { + "epoch": 1.3996434937611408, + "grad_norm": 0.24782698741460912, + "learning_rate": 5.93015354931169e-06, + "loss": 0.789, + "num_tokens": 49229092466.0, + "step": 11778 + }, + { + "epoch": 1.399762329174094, + "grad_norm": 0.2693596818944332, + "learning_rate": 5.928722621822119e-06, + "loss": 0.8, + "num_tokens": 49233281074.0, + "step": 11779 + }, + { + "epoch": 1.3998811645870468, + "grad_norm": 0.2762613671546381, + "learning_rate": 5.927291882139102e-06, + "loss": 0.8486, + "num_tokens": 49237469054.0, + "step": 11780 + }, + { + "epoch": 1.4, + "grad_norm": 0.26737571675781224, + "learning_rate": 5.925861330315614e-06, + "loss": 0.8166, + "num_tokens": 49241656305.0, + "step": 11781 + }, + { + "epoch": 1.400118835412953, + "grad_norm": 0.2821983574035442, + "learning_rate": 5.924430966404638e-06, + "loss": 0.8389, + "num_tokens": 49245844897.0, + "step": 11782 + }, + { + "epoch": 1.4002376708259061, + "grad_norm": 0.26448677728961123, + "learning_rate": 5.923000790459145e-06, + "loss": 0.8254, + "num_tokens": 49250008964.0, + "step": 11783 + }, + { + "epoch": 1.4003565062388592, + "grad_norm": 0.2659505240365331, + "learning_rate": 5.9215708025320976e-06, + "loss": 0.8107, + "num_tokens": 49254188439.0, + "step": 11784 + }, + { + "epoch": 1.4004753416518123, + "grad_norm": 0.2610902292320558, + "learning_rate": 5.920141002676453e-06, + "loss": 0.8125, + "num_tokens": 49258377031.0, + "step": 11785 + }, + { + "epoch": 1.4005941770647654, + "grad_norm": 0.2760158710335504, + "learning_rate": 5.918711390945162e-06, + "loss": 0.803, + "num_tokens": 49262564968.0, + "step": 11786 + }, + { + "epoch": 1.4007130124777183, + "grad_norm": 0.2720179569633272, + "learning_rate": 5.91728196739117e-06, + "loss": 0.8189, + "num_tokens": 49266707224.0, + "step": 11787 + }, + { + "epoch": 1.4008318478906714, + "grad_norm": 0.2595437389136638, + "learning_rate": 5.9158527320674116e-06, + "loss": 0.7821, + "num_tokens": 49270895347.0, + "step": 11788 + }, + { + "epoch": 1.4009506833036245, + "grad_norm": 0.2781790828638302, + "learning_rate": 5.914423685026818e-06, + "loss": 0.817, + "num_tokens": 49275055688.0, + "step": 11789 + }, + { + "epoch": 1.4010695187165776, + "grad_norm": 0.2582339598623187, + "learning_rate": 5.912994826322307e-06, + "loss": 0.7786, + "num_tokens": 49279218214.0, + "step": 11790 + }, + { + "epoch": 1.4011883541295305, + "grad_norm": 0.3433660792429859, + "learning_rate": 5.911566156006798e-06, + "loss": 0.7775, + "num_tokens": 49283406966.0, + "step": 11791 + }, + { + "epoch": 1.4013071895424836, + "grad_norm": 0.28378826841721505, + "learning_rate": 5.910137674133201e-06, + "loss": 0.8229, + "num_tokens": 49287579189.0, + "step": 11792 + }, + { + "epoch": 1.4014260249554367, + "grad_norm": 0.2811914767610397, + "learning_rate": 5.908709380754411e-06, + "loss": 0.8024, + "num_tokens": 49291726553.0, + "step": 11793 + }, + { + "epoch": 1.4015448603683898, + "grad_norm": 0.27622803070242774, + "learning_rate": 5.907281275923331e-06, + "loss": 0.8128, + "num_tokens": 49295844365.0, + "step": 11794 + }, + { + "epoch": 1.4016636957813429, + "grad_norm": 0.38032676885761735, + "learning_rate": 5.905853359692841e-06, + "loss": 0.8055, + "num_tokens": 49300033702.0, + "step": 11795 + }, + { + "epoch": 1.401782531194296, + "grad_norm": 0.30237094912212353, + "learning_rate": 5.904425632115829e-06, + "loss": 0.7975, + "num_tokens": 49304224059.0, + "step": 11796 + }, + { + "epoch": 1.401901366607249, + "grad_norm": 0.3000979618726644, + "learning_rate": 5.9029980932451625e-06, + "loss": 0.8185, + "num_tokens": 49308413755.0, + "step": 11797 + }, + { + "epoch": 1.402020202020202, + "grad_norm": 0.271616243110926, + "learning_rate": 5.901570743133706e-06, + "loss": 0.8046, + "num_tokens": 49312601749.0, + "step": 11798 + }, + { + "epoch": 1.402139037433155, + "grad_norm": 0.2904586189567427, + "learning_rate": 5.900143581834326e-06, + "loss": 0.8415, + "num_tokens": 49316791617.0, + "step": 11799 + }, + { + "epoch": 1.4022578728461081, + "grad_norm": 0.2615068809551013, + "learning_rate": 5.898716609399872e-06, + "loss": 0.8402, + "num_tokens": 49320980186.0, + "step": 11800 + }, + { + "epoch": 1.4023767082590612, + "grad_norm": 0.29942591554464776, + "learning_rate": 5.897289825883192e-06, + "loss": 0.8222, + "num_tokens": 49325137779.0, + "step": 11801 + }, + { + "epoch": 1.4024955436720141, + "grad_norm": 0.2588448971823155, + "learning_rate": 5.895863231337122e-06, + "loss": 0.8514, + "num_tokens": 49329315002.0, + "step": 11802 + }, + { + "epoch": 1.4026143790849672, + "grad_norm": 0.30047761597094735, + "learning_rate": 5.8944368258144915e-06, + "loss": 0.8159, + "num_tokens": 49333477465.0, + "step": 11803 + }, + { + "epoch": 1.4027332144979203, + "grad_norm": 0.25802436214729796, + "learning_rate": 5.8930106093681264e-06, + "loss": 0.7905, + "num_tokens": 49337666432.0, + "step": 11804 + }, + { + "epoch": 1.4028520499108734, + "grad_norm": 0.32535404733947715, + "learning_rate": 5.891584582050846e-06, + "loss": 0.7934, + "num_tokens": 49341843222.0, + "step": 11805 + }, + { + "epoch": 1.4029708853238265, + "grad_norm": 0.27205336615190023, + "learning_rate": 5.890158743915466e-06, + "loss": 0.8263, + "num_tokens": 49346002264.0, + "step": 11806 + }, + { + "epoch": 1.4030897207367796, + "grad_norm": 0.33601377442067965, + "learning_rate": 5.888733095014781e-06, + "loss": 0.8242, + "num_tokens": 49350170922.0, + "step": 11807 + }, + { + "epoch": 1.4032085561497327, + "grad_norm": 0.260638770869024, + "learning_rate": 5.887307635401591e-06, + "loss": 0.8257, + "num_tokens": 49354309847.0, + "step": 11808 + }, + { + "epoch": 1.4033273915626856, + "grad_norm": 0.34657595805066316, + "learning_rate": 5.885882365128686e-06, + "loss": 0.8289, + "num_tokens": 49358499199.0, + "step": 11809 + }, + { + "epoch": 1.4034462269756387, + "grad_norm": 0.28687357121209955, + "learning_rate": 5.884457284248848e-06, + "loss": 0.7627, + "num_tokens": 49362687192.0, + "step": 11810 + }, + { + "epoch": 1.4035650623885918, + "grad_norm": 0.31779618162084516, + "learning_rate": 5.883032392814853e-06, + "loss": 0.8238, + "num_tokens": 49366861260.0, + "step": 11811 + }, + { + "epoch": 1.403683897801545, + "grad_norm": 0.3010098784808861, + "learning_rate": 5.881607690879469e-06, + "loss": 0.7897, + "num_tokens": 49371049721.0, + "step": 11812 + }, + { + "epoch": 1.403802733214498, + "grad_norm": 0.3170324267129232, + "learning_rate": 5.880183178495457e-06, + "loss": 0.8169, + "num_tokens": 49375238575.0, + "step": 11813 + }, + { + "epoch": 1.4039215686274509, + "grad_norm": 0.293899156665796, + "learning_rate": 5.878758855715574e-06, + "loss": 0.7911, + "num_tokens": 49379427833.0, + "step": 11814 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.29310940450815365, + "learning_rate": 5.877334722592565e-06, + "loss": 0.8134, + "num_tokens": 49383615522.0, + "step": 11815 + }, + { + "epoch": 1.404159239453357, + "grad_norm": 0.28705384401879774, + "learning_rate": 5.875910779179172e-06, + "loss": 0.8333, + "num_tokens": 49387802933.0, + "step": 11816 + }, + { + "epoch": 1.4042780748663102, + "grad_norm": 0.286734354121205, + "learning_rate": 5.874487025528126e-06, + "loss": 0.7981, + "num_tokens": 49391972249.0, + "step": 11817 + }, + { + "epoch": 1.4043969102792633, + "grad_norm": 0.2968127600187919, + "learning_rate": 5.873063461692159e-06, + "loss": 0.7957, + "num_tokens": 49396134663.0, + "step": 11818 + }, + { + "epoch": 1.4045157456922164, + "grad_norm": 0.28674388447025967, + "learning_rate": 5.871640087723983e-06, + "loss": 0.8259, + "num_tokens": 49400322960.0, + "step": 11819 + }, + { + "epoch": 1.4046345811051695, + "grad_norm": 0.2917831512688583, + "learning_rate": 5.870216903676315e-06, + "loss": 0.7827, + "num_tokens": 49404486047.0, + "step": 11820 + }, + { + "epoch": 1.4047534165181224, + "grad_norm": 0.3066092990351643, + "learning_rate": 5.8687939096018595e-06, + "loss": 0.8059, + "num_tokens": 49408658054.0, + "step": 11821 + }, + { + "epoch": 1.4048722519310755, + "grad_norm": 0.2933934981802002, + "learning_rate": 5.8673711055533146e-06, + "loss": 0.8122, + "num_tokens": 49412846977.0, + "step": 11822 + }, + { + "epoch": 1.4049910873440286, + "grad_norm": 0.3069143514099466, + "learning_rate": 5.865948491583371e-06, + "loss": 0.7817, + "num_tokens": 49417035089.0, + "step": 11823 + }, + { + "epoch": 1.4051099227569817, + "grad_norm": 0.301761251395894, + "learning_rate": 5.864526067744713e-06, + "loss": 0.8254, + "num_tokens": 49421224864.0, + "step": 11824 + }, + { + "epoch": 1.4052287581699345, + "grad_norm": 0.30891729391194084, + "learning_rate": 5.86310383409002e-06, + "loss": 0.8152, + "num_tokens": 49425386002.0, + "step": 11825 + }, + { + "epoch": 1.4053475935828876, + "grad_norm": 0.3016955664949851, + "learning_rate": 5.861681790671959e-06, + "loss": 0.8007, + "num_tokens": 49429575537.0, + "step": 11826 + }, + { + "epoch": 1.4054664289958407, + "grad_norm": 0.2927700790234477, + "learning_rate": 5.860259937543195e-06, + "loss": 0.8352, + "num_tokens": 49433765925.0, + "step": 11827 + }, + { + "epoch": 1.4055852644087938, + "grad_norm": 0.31002734502270884, + "learning_rate": 5.858838274756386e-06, + "loss": 0.7712, + "num_tokens": 49437955122.0, + "step": 11828 + }, + { + "epoch": 1.405704099821747, + "grad_norm": 0.29737057441443465, + "learning_rate": 5.857416802364176e-06, + "loss": 0.8225, + "num_tokens": 49442129243.0, + "step": 11829 + }, + { + "epoch": 1.4058229352347, + "grad_norm": 0.2777696960352382, + "learning_rate": 5.855995520419204e-06, + "loss": 0.8512, + "num_tokens": 49446318868.0, + "step": 11830 + }, + { + "epoch": 1.4059417706476531, + "grad_norm": 0.2929568448781889, + "learning_rate": 5.854574428974116e-06, + "loss": 0.8189, + "num_tokens": 49450508753.0, + "step": 11831 + }, + { + "epoch": 1.406060606060606, + "grad_norm": 0.2983777038716978, + "learning_rate": 5.853153528081532e-06, + "loss": 0.818, + "num_tokens": 49454696680.0, + "step": 11832 + }, + { + "epoch": 1.406179441473559, + "grad_norm": 0.2848368758805276, + "learning_rate": 5.85173281779408e-06, + "loss": 0.8252, + "num_tokens": 49458886323.0, + "step": 11833 + }, + { + "epoch": 1.4062982768865122, + "grad_norm": 0.3195809714474725, + "learning_rate": 5.8503122981643626e-06, + "loss": 0.7812, + "num_tokens": 49463075661.0, + "step": 11834 + }, + { + "epoch": 1.4064171122994653, + "grad_norm": 0.298986347429809, + "learning_rate": 5.848891969244994e-06, + "loss": 0.785, + "num_tokens": 49467264881.0, + "step": 11835 + }, + { + "epoch": 1.4065359477124182, + "grad_norm": 0.315470088394168, + "learning_rate": 5.8474718310885695e-06, + "loss": 0.8205, + "num_tokens": 49471454731.0, + "step": 11836 + }, + { + "epoch": 1.4066547831253713, + "grad_norm": 0.26952068993011036, + "learning_rate": 5.846051883747685e-06, + "loss": 0.7965, + "num_tokens": 49475587625.0, + "step": 11837 + }, + { + "epoch": 1.4067736185383244, + "grad_norm": 0.29198700357459323, + "learning_rate": 5.844632127274932e-06, + "loss": 0.7883, + "num_tokens": 49479776198.0, + "step": 11838 + }, + { + "epoch": 1.4068924539512775, + "grad_norm": 0.27045116206585457, + "learning_rate": 5.843212561722875e-06, + "loss": 0.8102, + "num_tokens": 49483952589.0, + "step": 11839 + }, + { + "epoch": 1.4070112893642306, + "grad_norm": 0.27446611596196707, + "learning_rate": 5.841793187144094e-06, + "loss": 0.8381, + "num_tokens": 49488136203.0, + "step": 11840 + }, + { + "epoch": 1.4071301247771837, + "grad_norm": 0.2761148505399152, + "learning_rate": 5.840374003591153e-06, + "loss": 0.7689, + "num_tokens": 49492258066.0, + "step": 11841 + }, + { + "epoch": 1.4072489601901368, + "grad_norm": 0.27494716738938374, + "learning_rate": 5.8389550111166024e-06, + "loss": 0.8142, + "num_tokens": 49496446509.0, + "step": 11842 + }, + { + "epoch": 1.4073677956030897, + "grad_norm": 0.2868894779930015, + "learning_rate": 5.837536209773005e-06, + "loss": 0.8129, + "num_tokens": 49500555955.0, + "step": 11843 + }, + { + "epoch": 1.4074866310160428, + "grad_norm": 0.24971081348247798, + "learning_rate": 5.836117599612893e-06, + "loss": 0.8099, + "num_tokens": 49504745494.0, + "step": 11844 + }, + { + "epoch": 1.4076054664289959, + "grad_norm": 0.26953497675514776, + "learning_rate": 5.834699180688806e-06, + "loss": 0.8239, + "num_tokens": 49508934919.0, + "step": 11845 + }, + { + "epoch": 1.407724301841949, + "grad_norm": 0.2551622855152163, + "learning_rate": 5.833280953053272e-06, + "loss": 0.8343, + "num_tokens": 49513123952.0, + "step": 11846 + }, + { + "epoch": 1.4078431372549018, + "grad_norm": 0.2802789638424753, + "learning_rate": 5.831862916758813e-06, + "loss": 0.8086, + "num_tokens": 49517295610.0, + "step": 11847 + }, + { + "epoch": 1.407961972667855, + "grad_norm": 0.23415184427533853, + "learning_rate": 5.830445071857947e-06, + "loss": 0.8197, + "num_tokens": 49521485665.0, + "step": 11848 + }, + { + "epoch": 1.408080808080808, + "grad_norm": 0.28438945247212005, + "learning_rate": 5.829027418403175e-06, + "loss": 0.8027, + "num_tokens": 49525672588.0, + "step": 11849 + }, + { + "epoch": 1.4081996434937611, + "grad_norm": 0.2699768656259779, + "learning_rate": 5.827609956447002e-06, + "loss": 0.8486, + "num_tokens": 49529843517.0, + "step": 11850 + }, + { + "epoch": 1.4083184789067142, + "grad_norm": 0.26334521329664073, + "learning_rate": 5.826192686041922e-06, + "loss": 0.8039, + "num_tokens": 49534002537.0, + "step": 11851 + }, + { + "epoch": 1.4084373143196673, + "grad_norm": 0.2852541336626987, + "learning_rate": 5.824775607240421e-06, + "loss": 0.8038, + "num_tokens": 49538190390.0, + "step": 11852 + }, + { + "epoch": 1.4085561497326204, + "grad_norm": 0.280104530929124, + "learning_rate": 5.823358720094974e-06, + "loss": 0.831, + "num_tokens": 49542379614.0, + "step": 11853 + }, + { + "epoch": 1.4086749851455733, + "grad_norm": 0.2760697165058491, + "learning_rate": 5.821942024658061e-06, + "loss": 0.8226, + "num_tokens": 49546570298.0, + "step": 11854 + }, + { + "epoch": 1.4087938205585264, + "grad_norm": 0.29094614001699465, + "learning_rate": 5.820525520982136e-06, + "loss": 0.7852, + "num_tokens": 49550761052.0, + "step": 11855 + }, + { + "epoch": 1.4089126559714795, + "grad_norm": 0.26301568442250095, + "learning_rate": 5.819109209119667e-06, + "loss": 0.8217, + "num_tokens": 49554917959.0, + "step": 11856 + }, + { + "epoch": 1.4090314913844326, + "grad_norm": 0.2762783189195698, + "learning_rate": 5.8176930891231e-06, + "loss": 0.8328, + "num_tokens": 49559076894.0, + "step": 11857 + }, + { + "epoch": 1.4091503267973855, + "grad_norm": 0.2693326167933935, + "learning_rate": 5.8162771610448785e-06, + "loss": 0.7846, + "num_tokens": 49563234592.0, + "step": 11858 + }, + { + "epoch": 1.4092691622103386, + "grad_norm": 0.26189345607545855, + "learning_rate": 5.814861424937446e-06, + "loss": 0.7679, + "num_tokens": 49567424080.0, + "step": 11859 + }, + { + "epoch": 1.4093879976232917, + "grad_norm": 0.26832429322206114, + "learning_rate": 5.813445880853221e-06, + "loss": 0.7872, + "num_tokens": 49571582752.0, + "step": 11860 + }, + { + "epoch": 1.4095068330362448, + "grad_norm": 0.27682496517688926, + "learning_rate": 5.812030528844632e-06, + "loss": 0.83, + "num_tokens": 49575772190.0, + "step": 11861 + }, + { + "epoch": 1.409625668449198, + "grad_norm": 0.2912741627742181, + "learning_rate": 5.810615368964091e-06, + "loss": 0.7802, + "num_tokens": 49579962827.0, + "step": 11862 + }, + { + "epoch": 1.409744503862151, + "grad_norm": 0.2820632385596575, + "learning_rate": 5.80920040126401e-06, + "loss": 0.8342, + "num_tokens": 49584150626.0, + "step": 11863 + }, + { + "epoch": 1.409863339275104, + "grad_norm": 0.2774359877190489, + "learning_rate": 5.807785625796793e-06, + "loss": 0.818, + "num_tokens": 49588338897.0, + "step": 11864 + }, + { + "epoch": 1.409982174688057, + "grad_norm": 0.2773916981228258, + "learning_rate": 5.806371042614827e-06, + "loss": 0.8068, + "num_tokens": 49592483393.0, + "step": 11865 + }, + { + "epoch": 1.41010101010101, + "grad_norm": 0.2723599370323617, + "learning_rate": 5.804956651770502e-06, + "loss": 0.794, + "num_tokens": 49596619874.0, + "step": 11866 + }, + { + "epoch": 1.4102198455139632, + "grad_norm": 0.28236607659181473, + "learning_rate": 5.803542453316194e-06, + "loss": 0.7865, + "num_tokens": 49600809187.0, + "step": 11867 + }, + { + "epoch": 1.4103386809269163, + "grad_norm": 0.2870071038113753, + "learning_rate": 5.802128447304278e-06, + "loss": 0.8281, + "num_tokens": 49604997550.0, + "step": 11868 + }, + { + "epoch": 1.4104575163398692, + "grad_norm": 0.28338419751969324, + "learning_rate": 5.800714633787126e-06, + "loss": 0.8395, + "num_tokens": 49609184976.0, + "step": 11869 + }, + { + "epoch": 1.4105763517528223, + "grad_norm": 0.2770508053668571, + "learning_rate": 5.799301012817086e-06, + "loss": 0.841, + "num_tokens": 49613373567.0, + "step": 11870 + }, + { + "epoch": 1.4106951871657754, + "grad_norm": 0.29029577355691893, + "learning_rate": 5.797887584446514e-06, + "loss": 0.8349, + "num_tokens": 49617540595.0, + "step": 11871 + }, + { + "epoch": 1.4108140225787285, + "grad_norm": 0.2933694658762321, + "learning_rate": 5.796474348727754e-06, + "loss": 0.8313, + "num_tokens": 49621729623.0, + "step": 11872 + }, + { + "epoch": 1.4109328579916816, + "grad_norm": 0.2730968294604975, + "learning_rate": 5.795061305713142e-06, + "loss": 0.8359, + "num_tokens": 49625913058.0, + "step": 11873 + }, + { + "epoch": 1.4110516934046347, + "grad_norm": 0.298055043507451, + "learning_rate": 5.793648455455009e-06, + "loss": 0.8306, + "num_tokens": 49630083983.0, + "step": 11874 + }, + { + "epoch": 1.4111705288175878, + "grad_norm": 0.27015931597169734, + "learning_rate": 5.792235798005677e-06, + "loss": 0.7945, + "num_tokens": 49634242716.0, + "step": 11875 + }, + { + "epoch": 1.4112893642305406, + "grad_norm": 0.3004015239477381, + "learning_rate": 5.790823333417459e-06, + "loss": 0.8099, + "num_tokens": 49638431114.0, + "step": 11876 + }, + { + "epoch": 1.4114081996434937, + "grad_norm": 0.2630432176246493, + "learning_rate": 5.789411061742667e-06, + "loss": 0.8176, + "num_tokens": 49642619077.0, + "step": 11877 + }, + { + "epoch": 1.4115270350564468, + "grad_norm": 0.2952658869949405, + "learning_rate": 5.787998983033601e-06, + "loss": 0.8033, + "num_tokens": 49646808232.0, + "step": 11878 + }, + { + "epoch": 1.4116458704694, + "grad_norm": 0.26203552288812004, + "learning_rate": 5.786587097342553e-06, + "loss": 0.811, + "num_tokens": 49650970947.0, + "step": 11879 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.29116228285950624, + "learning_rate": 5.785175404721816e-06, + "loss": 0.822, + "num_tokens": 49655159292.0, + "step": 11880 + }, + { + "epoch": 1.411883541295306, + "grad_norm": 0.2550199322165437, + "learning_rate": 5.78376390522366e-06, + "loss": 0.806, + "num_tokens": 49659349278.0, + "step": 11881 + }, + { + "epoch": 1.412002376708259, + "grad_norm": 0.29972666212866933, + "learning_rate": 5.782352598900367e-06, + "loss": 0.7947, + "num_tokens": 49663510914.0, + "step": 11882 + }, + { + "epoch": 1.412121212121212, + "grad_norm": 0.2866354640200327, + "learning_rate": 5.780941485804198e-06, + "loss": 0.8313, + "num_tokens": 49667701410.0, + "step": 11883 + }, + { + "epoch": 1.4122400475341652, + "grad_norm": 0.2897391604459599, + "learning_rate": 5.77953056598741e-06, + "loss": 0.8149, + "num_tokens": 49671889236.0, + "step": 11884 + }, + { + "epoch": 1.4123588829471183, + "grad_norm": 0.29164103944538494, + "learning_rate": 5.778119839502261e-06, + "loss": 0.7836, + "num_tokens": 49676079289.0, + "step": 11885 + }, + { + "epoch": 1.4124777183600714, + "grad_norm": 0.29736226427434237, + "learning_rate": 5.776709306400986e-06, + "loss": 0.7734, + "num_tokens": 49680268513.0, + "step": 11886 + }, + { + "epoch": 1.4125965537730243, + "grad_norm": 0.24839078815256038, + "learning_rate": 5.775298966735823e-06, + "loss": 0.8129, + "num_tokens": 49684457317.0, + "step": 11887 + }, + { + "epoch": 1.4127153891859774, + "grad_norm": 0.3014115615974318, + "learning_rate": 5.7738888205590074e-06, + "loss": 0.8135, + "num_tokens": 49688629032.0, + "step": 11888 + }, + { + "epoch": 1.4128342245989305, + "grad_norm": 0.26670609133610296, + "learning_rate": 5.772478867922759e-06, + "loss": 0.8479, + "num_tokens": 49692818598.0, + "step": 11889 + }, + { + "epoch": 1.4129530600118836, + "grad_norm": 0.28757375996533124, + "learning_rate": 5.771069108879295e-06, + "loss": 0.8384, + "num_tokens": 49697007745.0, + "step": 11890 + }, + { + "epoch": 1.4130718954248365, + "grad_norm": 0.27753573151567795, + "learning_rate": 5.7696595434808186e-06, + "loss": 0.8371, + "num_tokens": 49701195684.0, + "step": 11891 + }, + { + "epoch": 1.4131907308377896, + "grad_norm": 0.3125351013674754, + "learning_rate": 5.768250171779535e-06, + "loss": 0.8087, + "num_tokens": 49705385683.0, + "step": 11892 + }, + { + "epoch": 1.4133095662507427, + "grad_norm": 0.2874135454869625, + "learning_rate": 5.766840993827631e-06, + "loss": 0.8515, + "num_tokens": 49709549924.0, + "step": 11893 + }, + { + "epoch": 1.4134284016636958, + "grad_norm": 0.28960833086553806, + "learning_rate": 5.765432009677303e-06, + "loss": 0.8064, + "num_tokens": 49713710364.0, + "step": 11894 + }, + { + "epoch": 1.4135472370766489, + "grad_norm": 0.291938801003675, + "learning_rate": 5.76402321938073e-06, + "loss": 0.8556, + "num_tokens": 49717898949.0, + "step": 11895 + }, + { + "epoch": 1.413666072489602, + "grad_norm": 0.2702239992648739, + "learning_rate": 5.762614622990076e-06, + "loss": 0.7927, + "num_tokens": 49722089045.0, + "step": 11896 + }, + { + "epoch": 1.413784907902555, + "grad_norm": 0.25615730799166997, + "learning_rate": 5.761206220557511e-06, + "loss": 0.8234, + "num_tokens": 49726237715.0, + "step": 11897 + }, + { + "epoch": 1.413903743315508, + "grad_norm": 0.2925894941281934, + "learning_rate": 5.759798012135193e-06, + "loss": 0.7822, + "num_tokens": 49730404094.0, + "step": 11898 + }, + { + "epoch": 1.414022578728461, + "grad_norm": 0.2655325356207429, + "learning_rate": 5.758389997775271e-06, + "loss": 0.7932, + "num_tokens": 49734592506.0, + "step": 11899 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.2937400006778045, + "learning_rate": 5.75698217752989e-06, + "loss": 0.8109, + "num_tokens": 49738752758.0, + "step": 11900 + }, + { + "epoch": 1.4142602495543672, + "grad_norm": 0.30231580112599477, + "learning_rate": 5.755574551451187e-06, + "loss": 0.7821, + "num_tokens": 49742914247.0, + "step": 11901 + }, + { + "epoch": 1.4143790849673203, + "grad_norm": 0.3183055538068264, + "learning_rate": 5.75416711959129e-06, + "loss": 0.8024, + "num_tokens": 49747077395.0, + "step": 11902 + }, + { + "epoch": 1.4144979203802732, + "grad_norm": 0.32916225637526486, + "learning_rate": 5.75275988200232e-06, + "loss": 0.8251, + "num_tokens": 49751238233.0, + "step": 11903 + }, + { + "epoch": 1.4146167557932263, + "grad_norm": 0.31967879446374886, + "learning_rate": 5.7513528387363926e-06, + "loss": 0.8237, + "num_tokens": 49755426156.0, + "step": 11904 + }, + { + "epoch": 1.4147355912061794, + "grad_norm": 0.3319920776638315, + "learning_rate": 5.749945989845618e-06, + "loss": 0.8219, + "num_tokens": 49759599101.0, + "step": 11905 + }, + { + "epoch": 1.4148544266191325, + "grad_norm": 0.30812741602747507, + "learning_rate": 5.7485393353820925e-06, + "loss": 0.8261, + "num_tokens": 49763787143.0, + "step": 11906 + }, + { + "epoch": 1.4149732620320856, + "grad_norm": 0.30346770110681237, + "learning_rate": 5.747132875397912e-06, + "loss": 0.8209, + "num_tokens": 49767974912.0, + "step": 11907 + }, + { + "epoch": 1.4150920974450387, + "grad_norm": 0.3010883276987131, + "learning_rate": 5.74572660994516e-06, + "loss": 0.7803, + "num_tokens": 49772153221.0, + "step": 11908 + }, + { + "epoch": 1.4152109328579916, + "grad_norm": 0.29277127559833765, + "learning_rate": 5.744320539075918e-06, + "loss": 0.7658, + "num_tokens": 49776342399.0, + "step": 11909 + }, + { + "epoch": 1.4153297682709447, + "grad_norm": 0.33824863778726105, + "learning_rate": 5.742914662842256e-06, + "loss": 0.7781, + "num_tokens": 49780496167.0, + "step": 11910 + }, + { + "epoch": 1.4154486036838978, + "grad_norm": 0.30823615658202835, + "learning_rate": 5.7415089812962424e-06, + "loss": 0.8118, + "num_tokens": 49784685156.0, + "step": 11911 + }, + { + "epoch": 1.415567439096851, + "grad_norm": 0.2972384648093712, + "learning_rate": 5.7401034944899235e-06, + "loss": 0.7588, + "num_tokens": 49788873942.0, + "step": 11912 + }, + { + "epoch": 1.415686274509804, + "grad_norm": 0.2827069224469969, + "learning_rate": 5.738698202475359e-06, + "loss": 0.7726, + "num_tokens": 49793064600.0, + "step": 11913 + }, + { + "epoch": 1.4158051099227569, + "grad_norm": 0.30719652775056655, + "learning_rate": 5.737293105304589e-06, + "loss": 0.8244, + "num_tokens": 49797209921.0, + "step": 11914 + }, + { + "epoch": 1.41592394533571, + "grad_norm": 0.2903384619256346, + "learning_rate": 5.7358882030296505e-06, + "loss": 0.8158, + "num_tokens": 49801390193.0, + "step": 11915 + }, + { + "epoch": 1.416042780748663, + "grad_norm": 0.24904676256165098, + "learning_rate": 5.734483495702572e-06, + "loss": 0.7963, + "num_tokens": 49805551210.0, + "step": 11916 + }, + { + "epoch": 1.4161616161616162, + "grad_norm": 0.2974708029391485, + "learning_rate": 5.733078983375369e-06, + "loss": 0.8218, + "num_tokens": 49809741300.0, + "step": 11917 + }, + { + "epoch": 1.4162804515745693, + "grad_norm": 0.24926768542762356, + "learning_rate": 5.7316746661000624e-06, + "loss": 0.8511, + "num_tokens": 49813929380.0, + "step": 11918 + }, + { + "epoch": 1.4163992869875224, + "grad_norm": 0.2635445951574296, + "learning_rate": 5.730270543928648e-06, + "loss": 0.819, + "num_tokens": 49818119401.0, + "step": 11919 + }, + { + "epoch": 1.4165181224004755, + "grad_norm": 0.26481262022475194, + "learning_rate": 5.728866616913139e-06, + "loss": 0.8162, + "num_tokens": 49822301030.0, + "step": 11920 + }, + { + "epoch": 1.4166369578134284, + "grad_norm": 0.25004357116890463, + "learning_rate": 5.727462885105524e-06, + "loss": 0.8401, + "num_tokens": 49826488931.0, + "step": 11921 + }, + { + "epoch": 1.4167557932263815, + "grad_norm": 0.3008421962285821, + "learning_rate": 5.726059348557782e-06, + "loss": 0.7835, + "num_tokens": 49830677585.0, + "step": 11922 + }, + { + "epoch": 1.4168746286393346, + "grad_norm": 0.2475846117278981, + "learning_rate": 5.7246560073218935e-06, + "loss": 0.8014, + "num_tokens": 49834867975.0, + "step": 11923 + }, + { + "epoch": 1.4169934640522877, + "grad_norm": 0.30885240453134755, + "learning_rate": 5.723252861449829e-06, + "loss": 0.8154, + "num_tokens": 49839058656.0, + "step": 11924 + }, + { + "epoch": 1.4171122994652405, + "grad_norm": 0.2641239762058467, + "learning_rate": 5.7218499109935485e-06, + "loss": 0.808, + "num_tokens": 49843247509.0, + "step": 11925 + }, + { + "epoch": 1.4172311348781936, + "grad_norm": 0.3228812042647006, + "learning_rate": 5.72044715600502e-06, + "loss": 0.791, + "num_tokens": 49847413129.0, + "step": 11926 + }, + { + "epoch": 1.4173499702911467, + "grad_norm": 0.2579391710353971, + "learning_rate": 5.7190445965361805e-06, + "loss": 0.8243, + "num_tokens": 49851578075.0, + "step": 11927 + }, + { + "epoch": 1.4174688057040998, + "grad_norm": 0.2924016197201863, + "learning_rate": 5.717642232638974e-06, + "loss": 0.7932, + "num_tokens": 49855767519.0, + "step": 11928 + }, + { + "epoch": 1.417587641117053, + "grad_norm": 0.2694276912016569, + "learning_rate": 5.716240064365335e-06, + "loss": 0.8239, + "num_tokens": 49859956576.0, + "step": 11929 + }, + { + "epoch": 1.417706476530006, + "grad_norm": 0.24682671096702882, + "learning_rate": 5.714838091767192e-06, + "loss": 0.7879, + "num_tokens": 49864098773.0, + "step": 11930 + }, + { + "epoch": 1.4178253119429591, + "grad_norm": 0.2798556663612802, + "learning_rate": 5.7134363148964645e-06, + "loss": 0.8069, + "num_tokens": 49868288171.0, + "step": 11931 + }, + { + "epoch": 1.417944147355912, + "grad_norm": 0.2387100011995445, + "learning_rate": 5.712034733805063e-06, + "loss": 0.7552, + "num_tokens": 49872477187.0, + "step": 11932 + }, + { + "epoch": 1.418062982768865, + "grad_norm": 0.2819741428287323, + "learning_rate": 5.7106333485448966e-06, + "loss": 0.7956, + "num_tokens": 49876658549.0, + "step": 11933 + }, + { + "epoch": 1.4181818181818182, + "grad_norm": 0.2603715569899546, + "learning_rate": 5.709232159167859e-06, + "loss": 0.799, + "num_tokens": 49880822296.0, + "step": 11934 + }, + { + "epoch": 1.4183006535947713, + "grad_norm": 0.28658910606024257, + "learning_rate": 5.707831165725845e-06, + "loss": 0.8014, + "num_tokens": 49885011441.0, + "step": 11935 + }, + { + "epoch": 1.4184194890077242, + "grad_norm": 0.2667972138777098, + "learning_rate": 5.706430368270735e-06, + "loss": 0.8094, + "num_tokens": 49889186564.0, + "step": 11936 + }, + { + "epoch": 1.4185383244206773, + "grad_norm": 0.2713619155337043, + "learning_rate": 5.7050297668544066e-06, + "loss": 0.8121, + "num_tokens": 49893353944.0, + "step": 11937 + }, + { + "epoch": 1.4186571598336304, + "grad_norm": 0.2831089695259387, + "learning_rate": 5.703629361528729e-06, + "loss": 0.8028, + "num_tokens": 49897529131.0, + "step": 11938 + }, + { + "epoch": 1.4187759952465835, + "grad_norm": 0.25764974588553985, + "learning_rate": 5.702229152345562e-06, + "loss": 0.813, + "num_tokens": 49901719524.0, + "step": 11939 + }, + { + "epoch": 1.4188948306595366, + "grad_norm": 0.27338235469692185, + "learning_rate": 5.700829139356763e-06, + "loss": 0.8094, + "num_tokens": 49905908767.0, + "step": 11940 + }, + { + "epoch": 1.4190136660724897, + "grad_norm": 0.27968564043585764, + "learning_rate": 5.699429322614178e-06, + "loss": 0.8139, + "num_tokens": 49910098993.0, + "step": 11941 + }, + { + "epoch": 1.4191325014854428, + "grad_norm": 0.2898736368914461, + "learning_rate": 5.698029702169647e-06, + "loss": 0.8054, + "num_tokens": 49914287759.0, + "step": 11942 + }, + { + "epoch": 1.4192513368983957, + "grad_norm": 0.2605045096309884, + "learning_rate": 5.696630278075006e-06, + "loss": 0.8073, + "num_tokens": 49918475647.0, + "step": 11943 + }, + { + "epoch": 1.4193701723113488, + "grad_norm": 0.31330387477789795, + "learning_rate": 5.695231050382069e-06, + "loss": 0.8163, + "num_tokens": 49922665600.0, + "step": 11944 + }, + { + "epoch": 1.4194890077243019, + "grad_norm": 0.2921796456169154, + "learning_rate": 5.693832019142666e-06, + "loss": 0.8142, + "num_tokens": 49926841438.0, + "step": 11945 + }, + { + "epoch": 1.419607843137255, + "grad_norm": 0.27149533132323944, + "learning_rate": 5.6924331844086045e-06, + "loss": 0.7849, + "num_tokens": 49931030331.0, + "step": 11946 + }, + { + "epoch": 1.4197266785502078, + "grad_norm": 0.29530578877815034, + "learning_rate": 5.691034546231686e-06, + "loss": 0.7686, + "num_tokens": 49935218133.0, + "step": 11947 + }, + { + "epoch": 1.419845513963161, + "grad_norm": 0.2819876020490841, + "learning_rate": 5.689636104663714e-06, + "loss": 0.8099, + "num_tokens": 49939366456.0, + "step": 11948 + }, + { + "epoch": 1.419964349376114, + "grad_norm": 0.26140956003470395, + "learning_rate": 5.688237859756467e-06, + "loss": 0.8463, + "num_tokens": 49943532025.0, + "step": 11949 + }, + { + "epoch": 1.4200831847890671, + "grad_norm": 0.28448326294494036, + "learning_rate": 5.686839811561731e-06, + "loss": 0.8263, + "num_tokens": 49947721674.0, + "step": 11950 + }, + { + "epoch": 1.4202020202020202, + "grad_norm": 0.2877721260175861, + "learning_rate": 5.685441960131279e-06, + "loss": 0.8179, + "num_tokens": 49951911217.0, + "step": 11951 + }, + { + "epoch": 1.4203208556149733, + "grad_norm": 0.2668467319595308, + "learning_rate": 5.684044305516883e-06, + "loss": 0.8124, + "num_tokens": 49956099064.0, + "step": 11952 + }, + { + "epoch": 1.4204396910279264, + "grad_norm": 0.2959006869649749, + "learning_rate": 5.682646847770303e-06, + "loss": 0.7896, + "num_tokens": 49960288851.0, + "step": 11953 + }, + { + "epoch": 1.4205585264408793, + "grad_norm": 0.27695704605625243, + "learning_rate": 5.681249586943285e-06, + "loss": 0.8219, + "num_tokens": 49964479951.0, + "step": 11954 + }, + { + "epoch": 1.4206773618538324, + "grad_norm": 0.30788100330294055, + "learning_rate": 5.679852523087579e-06, + "loss": 0.7869, + "num_tokens": 49968669575.0, + "step": 11955 + }, + { + "epoch": 1.4207961972667855, + "grad_norm": 0.2798004503923958, + "learning_rate": 5.6784556562549205e-06, + "loss": 0.8033, + "num_tokens": 49972857784.0, + "step": 11956 + }, + { + "epoch": 1.4209150326797386, + "grad_norm": 0.3482031462340128, + "learning_rate": 5.677058986497039e-06, + "loss": 0.8144, + "num_tokens": 49977046094.0, + "step": 11957 + }, + { + "epoch": 1.4210338680926915, + "grad_norm": 0.28835154473172175, + "learning_rate": 5.675662513865667e-06, + "loss": 0.8447, + "num_tokens": 49981235033.0, + "step": 11958 + }, + { + "epoch": 1.4211527035056446, + "grad_norm": 0.3585750051564846, + "learning_rate": 5.674266238412512e-06, + "loss": 0.8448, + "num_tokens": 49985421715.0, + "step": 11959 + }, + { + "epoch": 1.4212715389185977, + "grad_norm": 0.2766519500195734, + "learning_rate": 5.6728701601892844e-06, + "loss": 0.8418, + "num_tokens": 49989609019.0, + "step": 11960 + }, + { + "epoch": 1.4213903743315508, + "grad_norm": 0.34554507142156776, + "learning_rate": 5.671474279247686e-06, + "loss": 0.8003, + "num_tokens": 49993797962.0, + "step": 11961 + }, + { + "epoch": 1.421509209744504, + "grad_norm": 0.2849741325689031, + "learning_rate": 5.67007859563941e-06, + "loss": 0.8009, + "num_tokens": 49997980840.0, + "step": 11962 + }, + { + "epoch": 1.421628045157457, + "grad_norm": 0.2826605219843583, + "learning_rate": 5.6686831094161465e-06, + "loss": 0.8175, + "num_tokens": 50002140822.0, + "step": 11963 + }, + { + "epoch": 1.42174688057041, + "grad_norm": 0.2763600682127502, + "learning_rate": 5.667287820629572e-06, + "loss": 0.8058, + "num_tokens": 50006330171.0, + "step": 11964 + }, + { + "epoch": 1.421865715983363, + "grad_norm": 0.29055648434036613, + "learning_rate": 5.665892729331359e-06, + "loss": 0.8329, + "num_tokens": 50010519829.0, + "step": 11965 + }, + { + "epoch": 1.421984551396316, + "grad_norm": 0.2652636544809883, + "learning_rate": 5.664497835573175e-06, + "loss": 0.7992, + "num_tokens": 50014708500.0, + "step": 11966 + }, + { + "epoch": 1.4221033868092692, + "grad_norm": 0.27784922609839, + "learning_rate": 5.663103139406676e-06, + "loss": 0.802, + "num_tokens": 50018897977.0, + "step": 11967 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 0.261360339268705, + "learning_rate": 5.66170864088351e-06, + "loss": 0.8103, + "num_tokens": 50023040478.0, + "step": 11968 + }, + { + "epoch": 1.4223410576351752, + "grad_norm": 0.2723496962711894, + "learning_rate": 5.660314340055327e-06, + "loss": 0.8187, + "num_tokens": 50027230271.0, + "step": 11969 + }, + { + "epoch": 1.4224598930481283, + "grad_norm": 0.2812302789245839, + "learning_rate": 5.658920236973752e-06, + "loss": 0.8322, + "num_tokens": 50031375229.0, + "step": 11970 + }, + { + "epoch": 1.4225787284610814, + "grad_norm": 0.24687106285226895, + "learning_rate": 5.657526331690422e-06, + "loss": 0.8311, + "num_tokens": 50035563523.0, + "step": 11971 + }, + { + "epoch": 1.4226975638740345, + "grad_norm": 0.2591238681259109, + "learning_rate": 5.656132624256955e-06, + "loss": 0.7889, + "num_tokens": 50039681376.0, + "step": 11972 + }, + { + "epoch": 1.4228163992869876, + "grad_norm": 0.2550289626825554, + "learning_rate": 5.654739114724965e-06, + "loss": 0.808, + "num_tokens": 50043871127.0, + "step": 11973 + }, + { + "epoch": 1.4229352346999407, + "grad_norm": 0.24826798421226282, + "learning_rate": 5.65334580314606e-06, + "loss": 0.8201, + "num_tokens": 50048060774.0, + "step": 11974 + }, + { + "epoch": 1.4230540701128938, + "grad_norm": 0.26374075088138493, + "learning_rate": 5.651952689571837e-06, + "loss": 0.8211, + "num_tokens": 50052250240.0, + "step": 11975 + }, + { + "epoch": 1.4231729055258466, + "grad_norm": 0.27801284957367695, + "learning_rate": 5.650559774053883e-06, + "loss": 0.816, + "num_tokens": 50056439865.0, + "step": 11976 + }, + { + "epoch": 1.4232917409387997, + "grad_norm": 0.241697770015763, + "learning_rate": 5.649167056643792e-06, + "loss": 0.8236, + "num_tokens": 50060629578.0, + "step": 11977 + }, + { + "epoch": 1.4234105763517528, + "grad_norm": 0.27459071735896784, + "learning_rate": 5.647774537393134e-06, + "loss": 0.7947, + "num_tokens": 50064818988.0, + "step": 11978 + }, + { + "epoch": 1.423529411764706, + "grad_norm": 0.2568680139673256, + "learning_rate": 5.646382216353487e-06, + "loss": 0.7989, + "num_tokens": 50069006203.0, + "step": 11979 + }, + { + "epoch": 1.4236482471776588, + "grad_norm": 0.29864695498070043, + "learning_rate": 5.644990093576403e-06, + "loss": 0.7901, + "num_tokens": 50073194877.0, + "step": 11980 + }, + { + "epoch": 1.423767082590612, + "grad_norm": 0.2657369930203411, + "learning_rate": 5.643598169113442e-06, + "loss": 0.7952, + "num_tokens": 50077383206.0, + "step": 11981 + }, + { + "epoch": 1.423885918003565, + "grad_norm": 0.2913313333694096, + "learning_rate": 5.642206443016148e-06, + "loss": 0.7738, + "num_tokens": 50081556535.0, + "step": 11982 + }, + { + "epoch": 1.424004753416518, + "grad_norm": 0.2881138772473084, + "learning_rate": 5.640814915336068e-06, + "loss": 0.8549, + "num_tokens": 50085743260.0, + "step": 11983 + }, + { + "epoch": 1.4241235888294712, + "grad_norm": 0.2916660476766645, + "learning_rate": 5.6394235861247345e-06, + "loss": 0.8159, + "num_tokens": 50089931181.0, + "step": 11984 + }, + { + "epoch": 1.4242424242424243, + "grad_norm": 0.3026128646661144, + "learning_rate": 5.638032455433666e-06, + "loss": 0.8222, + "num_tokens": 50094080122.0, + "step": 11985 + }, + { + "epoch": 1.4243612596553774, + "grad_norm": 0.2919544462144838, + "learning_rate": 5.636641523314387e-06, + "loss": 0.8512, + "num_tokens": 50098269787.0, + "step": 11986 + }, + { + "epoch": 1.4244800950683303, + "grad_norm": 0.33321040471674906, + "learning_rate": 5.635250789818405e-06, + "loss": 0.7944, + "num_tokens": 50102431961.0, + "step": 11987 + }, + { + "epoch": 1.4245989304812834, + "grad_norm": 0.26782868586780595, + "learning_rate": 5.633860254997226e-06, + "loss": 0.7964, + "num_tokens": 50106620616.0, + "step": 11988 + }, + { + "epoch": 1.4247177658942365, + "grad_norm": 0.2971577674283108, + "learning_rate": 5.6324699189023436e-06, + "loss": 0.8004, + "num_tokens": 50110779784.0, + "step": 11989 + }, + { + "epoch": 1.4248366013071896, + "grad_norm": 0.25747501309664944, + "learning_rate": 5.631079781585249e-06, + "loss": 0.8349, + "num_tokens": 50114967466.0, + "step": 11990 + }, + { + "epoch": 1.4249554367201425, + "grad_norm": 0.3088838714289648, + "learning_rate": 5.629689843097421e-06, + "loss": 0.7477, + "num_tokens": 50119156354.0, + "step": 11991 + }, + { + "epoch": 1.4250742721330956, + "grad_norm": 0.2576874809666062, + "learning_rate": 5.628300103490338e-06, + "loss": 0.794, + "num_tokens": 50123346461.0, + "step": 11992 + }, + { + "epoch": 1.4251931075460487, + "grad_norm": 0.2803523916516066, + "learning_rate": 5.626910562815464e-06, + "loss": 0.7748, + "num_tokens": 50127537081.0, + "step": 11993 + }, + { + "epoch": 1.4253119429590018, + "grad_norm": 0.2556220454446815, + "learning_rate": 5.625521221124257e-06, + "loss": 0.8333, + "num_tokens": 50131727636.0, + "step": 11994 + }, + { + "epoch": 1.4254307783719549, + "grad_norm": 0.26214206165509085, + "learning_rate": 5.624132078468172e-06, + "loss": 0.8112, + "num_tokens": 50135916581.0, + "step": 11995 + }, + { + "epoch": 1.425549613784908, + "grad_norm": 0.25053722116959926, + "learning_rate": 5.622743134898652e-06, + "loss": 0.7994, + "num_tokens": 50140105211.0, + "step": 11996 + }, + { + "epoch": 1.425668449197861, + "grad_norm": 0.29520466316222166, + "learning_rate": 5.621354390467135e-06, + "loss": 0.7868, + "num_tokens": 50144292866.0, + "step": 11997 + }, + { + "epoch": 1.425787284610814, + "grad_norm": 0.2761910055574452, + "learning_rate": 5.619965845225051e-06, + "loss": 0.821, + "num_tokens": 50148481207.0, + "step": 11998 + }, + { + "epoch": 1.425906120023767, + "grad_norm": 0.2874465473038238, + "learning_rate": 5.618577499223822e-06, + "loss": 0.8111, + "num_tokens": 50152670135.0, + "step": 11999 + }, + { + "epoch": 1.4260249554367201, + "grad_norm": 0.2679524134431459, + "learning_rate": 5.6171893525148644e-06, + "loss": 0.8309, + "num_tokens": 50156860273.0, + "step": 12000 + }, + { + "epoch": 1.4261437908496732, + "grad_norm": 0.26454075864033066, + "learning_rate": 5.6158014051495805e-06, + "loss": 0.8214, + "num_tokens": 50161049498.0, + "step": 12001 + }, + { + "epoch": 1.4262626262626263, + "grad_norm": 0.2591633905028757, + "learning_rate": 5.6144136571793786e-06, + "loss": 0.816, + "num_tokens": 50165239631.0, + "step": 12002 + }, + { + "epoch": 1.4263814616755792, + "grad_norm": 0.2657169875636454, + "learning_rate": 5.6130261086556456e-06, + "loss": 0.8376, + "num_tokens": 50169420627.0, + "step": 12003 + }, + { + "epoch": 1.4265002970885323, + "grad_norm": 0.28756392949026766, + "learning_rate": 5.611638759629772e-06, + "loss": 0.82, + "num_tokens": 50173584789.0, + "step": 12004 + }, + { + "epoch": 1.4266191325014854, + "grad_norm": 0.2554501205277594, + "learning_rate": 5.610251610153136e-06, + "loss": 0.7939, + "num_tokens": 50177773737.0, + "step": 12005 + }, + { + "epoch": 1.4267379679144385, + "grad_norm": 0.287830774946855, + "learning_rate": 5.6088646602771025e-06, + "loss": 0.825, + "num_tokens": 50181960794.0, + "step": 12006 + }, + { + "epoch": 1.4268568033273916, + "grad_norm": 0.2503216898391043, + "learning_rate": 5.6074779100530385e-06, + "loss": 0.7748, + "num_tokens": 50186142241.0, + "step": 12007 + }, + { + "epoch": 1.4269756387403447, + "grad_norm": 0.30319548844993616, + "learning_rate": 5.606091359532296e-06, + "loss": 0.8326, + "num_tokens": 50190332577.0, + "step": 12008 + }, + { + "epoch": 1.4270944741532978, + "grad_norm": 0.25626061497591013, + "learning_rate": 5.604705008766234e-06, + "loss": 0.8332, + "num_tokens": 50194522630.0, + "step": 12009 + }, + { + "epoch": 1.4272133095662507, + "grad_norm": 0.33258656580928403, + "learning_rate": 5.603318857806187e-06, + "loss": 0.8111, + "num_tokens": 50198680309.0, + "step": 12010 + }, + { + "epoch": 1.4273321449792038, + "grad_norm": 0.2775057594056944, + "learning_rate": 5.601932906703489e-06, + "loss": 0.797, + "num_tokens": 50202851269.0, + "step": 12011 + }, + { + "epoch": 1.427450980392157, + "grad_norm": 0.33948235936508947, + "learning_rate": 5.600547155509464e-06, + "loss": 0.7724, + "num_tokens": 50207041218.0, + "step": 12012 + }, + { + "epoch": 1.42756981580511, + "grad_norm": 0.2894535724051627, + "learning_rate": 5.599161604275436e-06, + "loss": 0.8536, + "num_tokens": 50211188472.0, + "step": 12013 + }, + { + "epoch": 1.4276886512180629, + "grad_norm": 0.32846175419062734, + "learning_rate": 5.597776253052711e-06, + "loss": 0.8238, + "num_tokens": 50215365943.0, + "step": 12014 + }, + { + "epoch": 1.427807486631016, + "grad_norm": 0.2790971511399373, + "learning_rate": 5.596391101892602e-06, + "loss": 0.8169, + "num_tokens": 50219527077.0, + "step": 12015 + }, + { + "epoch": 1.427926322043969, + "grad_norm": 0.30446326334900775, + "learning_rate": 5.595006150846398e-06, + "loss": 0.8084, + "num_tokens": 50223675786.0, + "step": 12016 + }, + { + "epoch": 1.4280451574569222, + "grad_norm": 0.2757319446695984, + "learning_rate": 5.59362139996539e-06, + "loss": 0.8366, + "num_tokens": 50227834144.0, + "step": 12017 + }, + { + "epoch": 1.4281639928698753, + "grad_norm": 0.30312097976890984, + "learning_rate": 5.5922368493008625e-06, + "loss": 0.8238, + "num_tokens": 50232022532.0, + "step": 12018 + }, + { + "epoch": 1.4282828282828284, + "grad_norm": 0.2860654921175424, + "learning_rate": 5.5908524989040895e-06, + "loss": 0.7638, + "num_tokens": 50236187434.0, + "step": 12019 + }, + { + "epoch": 1.4284016636957815, + "grad_norm": 0.27221883130679286, + "learning_rate": 5.589468348826334e-06, + "loss": 0.8275, + "num_tokens": 50240335778.0, + "step": 12020 + }, + { + "epoch": 1.4285204991087344, + "grad_norm": 0.27193360636195774, + "learning_rate": 5.5880843991188605e-06, + "loss": 0.8145, + "num_tokens": 50244524823.0, + "step": 12021 + }, + { + "epoch": 1.4286393345216875, + "grad_norm": 0.28142740565640184, + "learning_rate": 5.586700649832921e-06, + "loss": 0.7803, + "num_tokens": 50248697680.0, + "step": 12022 + }, + { + "epoch": 1.4287581699346406, + "grad_norm": 0.27810063966977605, + "learning_rate": 5.5853171010197585e-06, + "loss": 0.84, + "num_tokens": 50252886569.0, + "step": 12023 + }, + { + "epoch": 1.4288770053475937, + "grad_norm": 0.29674492688398246, + "learning_rate": 5.5839337527306104e-06, + "loss": 0.8025, + "num_tokens": 50257076173.0, + "step": 12024 + }, + { + "epoch": 1.4289958407605465, + "grad_norm": 0.2815427193889941, + "learning_rate": 5.582550605016708e-06, + "loss": 0.8033, + "num_tokens": 50261265117.0, + "step": 12025 + }, + { + "epoch": 1.4291146761734996, + "grad_norm": 0.2682131944411783, + "learning_rate": 5.581167657929277e-06, + "loss": 0.8087, + "num_tokens": 50265436352.0, + "step": 12026 + }, + { + "epoch": 1.4292335115864527, + "grad_norm": 0.27181428235768196, + "learning_rate": 5.579784911519522e-06, + "loss": 0.8444, + "num_tokens": 50269626166.0, + "step": 12027 + }, + { + "epoch": 1.4293523469994058, + "grad_norm": 0.27242878246811364, + "learning_rate": 5.578402365838662e-06, + "loss": 0.8319, + "num_tokens": 50273817095.0, + "step": 12028 + }, + { + "epoch": 1.429471182412359, + "grad_norm": 0.2708326887715196, + "learning_rate": 5.577020020937894e-06, + "loss": 0.7746, + "num_tokens": 50278004959.0, + "step": 12029 + }, + { + "epoch": 1.429590017825312, + "grad_norm": 0.2585531841502501, + "learning_rate": 5.57563787686841e-06, + "loss": 0.8266, + "num_tokens": 50282171038.0, + "step": 12030 + }, + { + "epoch": 1.4297088532382651, + "grad_norm": 0.27254065558613216, + "learning_rate": 5.574255933681396e-06, + "loss": 0.8356, + "num_tokens": 50286347951.0, + "step": 12031 + }, + { + "epoch": 1.429827688651218, + "grad_norm": 0.28550681056586386, + "learning_rate": 5.572874191428028e-06, + "loss": 0.7813, + "num_tokens": 50290537603.0, + "step": 12032 + }, + { + "epoch": 1.429946524064171, + "grad_norm": 0.2502955553563272, + "learning_rate": 5.571492650159475e-06, + "loss": 0.7669, + "num_tokens": 50294687523.0, + "step": 12033 + }, + { + "epoch": 1.4300653594771242, + "grad_norm": 0.27465511682547056, + "learning_rate": 5.570111309926908e-06, + "loss": 0.8403, + "num_tokens": 50298873257.0, + "step": 12034 + }, + { + "epoch": 1.4301841948900773, + "grad_norm": 0.2581830930427694, + "learning_rate": 5.5687301707814765e-06, + "loss": 0.7802, + "num_tokens": 50303064777.0, + "step": 12035 + }, + { + "epoch": 1.4303030303030302, + "grad_norm": 0.2789966454641836, + "learning_rate": 5.567349232774335e-06, + "loss": 0.8146, + "num_tokens": 50307241605.0, + "step": 12036 + }, + { + "epoch": 1.4304218657159833, + "grad_norm": 0.26173973940350304, + "learning_rate": 5.565968495956615e-06, + "loss": 0.8391, + "num_tokens": 50311421316.0, + "step": 12037 + }, + { + "epoch": 1.4305407011289364, + "grad_norm": 0.2703525923347045, + "learning_rate": 5.564587960379455e-06, + "loss": 0.825, + "num_tokens": 50315568122.0, + "step": 12038 + }, + { + "epoch": 1.4306595365418895, + "grad_norm": 0.27134155104680946, + "learning_rate": 5.563207626093981e-06, + "loss": 0.7798, + "num_tokens": 50319757376.0, + "step": 12039 + }, + { + "epoch": 1.4307783719548426, + "grad_norm": 0.28115935441602347, + "learning_rate": 5.5618274931513076e-06, + "loss": 0.8265, + "num_tokens": 50323945594.0, + "step": 12040 + }, + { + "epoch": 1.4308972073677957, + "grad_norm": 0.2648869955553577, + "learning_rate": 5.560447561602553e-06, + "loss": 0.768, + "num_tokens": 50328135293.0, + "step": 12041 + }, + { + "epoch": 1.4310160427807488, + "grad_norm": 0.25705063584294036, + "learning_rate": 5.559067831498817e-06, + "loss": 0.8348, + "num_tokens": 50332325349.0, + "step": 12042 + }, + { + "epoch": 1.4311348781937017, + "grad_norm": 0.2583212308542355, + "learning_rate": 5.557688302891193e-06, + "loss": 0.8544, + "num_tokens": 50336514192.0, + "step": 12043 + }, + { + "epoch": 1.4312537136066548, + "grad_norm": 0.2553901653896888, + "learning_rate": 5.556308975830773e-06, + "loss": 0.8033, + "num_tokens": 50340703693.0, + "step": 12044 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 0.2694507264939378, + "learning_rate": 5.554929850368638e-06, + "loss": 0.8112, + "num_tokens": 50344893893.0, + "step": 12045 + }, + { + "epoch": 1.431491384432561, + "grad_norm": 0.26994866243347565, + "learning_rate": 5.55355092655586e-06, + "loss": 0.8364, + "num_tokens": 50349046815.0, + "step": 12046 + }, + { + "epoch": 1.4316102198455138, + "grad_norm": 0.25707540831732645, + "learning_rate": 5.552172204443504e-06, + "loss": 0.8184, + "num_tokens": 50353211328.0, + "step": 12047 + }, + { + "epoch": 1.431729055258467, + "grad_norm": 0.24456460188058105, + "learning_rate": 5.550793684082634e-06, + "loss": 0.8085, + "num_tokens": 50357400406.0, + "step": 12048 + }, + { + "epoch": 1.43184789067142, + "grad_norm": 0.25150509777451596, + "learning_rate": 5.549415365524298e-06, + "loss": 0.8141, + "num_tokens": 50361561324.0, + "step": 12049 + }, + { + "epoch": 1.4319667260843731, + "grad_norm": 0.24849705609202904, + "learning_rate": 5.548037248819538e-06, + "loss": 0.8053, + "num_tokens": 50365750236.0, + "step": 12050 + }, + { + "epoch": 1.4320855614973262, + "grad_norm": 0.2600123344356142, + "learning_rate": 5.546659334019393e-06, + "loss": 0.8108, + "num_tokens": 50369938152.0, + "step": 12051 + }, + { + "epoch": 1.4322043969102793, + "grad_norm": 0.24741012041807542, + "learning_rate": 5.5452816211748905e-06, + "loss": 0.7982, + "num_tokens": 50374126639.0, + "step": 12052 + }, + { + "epoch": 1.4323232323232324, + "grad_norm": 0.2458156724921303, + "learning_rate": 5.5439041103370525e-06, + "loss": 0.8219, + "num_tokens": 50378314703.0, + "step": 12053 + }, + { + "epoch": 1.4324420677361853, + "grad_norm": 0.2447811359763242, + "learning_rate": 5.542526801556894e-06, + "loss": 0.8262, + "num_tokens": 50382502846.0, + "step": 12054 + }, + { + "epoch": 1.4325609031491384, + "grad_norm": 0.2674824924238598, + "learning_rate": 5.541149694885419e-06, + "loss": 0.8315, + "num_tokens": 50386670869.0, + "step": 12055 + }, + { + "epoch": 1.4326797385620915, + "grad_norm": 0.25186954674932227, + "learning_rate": 5.5397727903736276e-06, + "loss": 0.7875, + "num_tokens": 50390857727.0, + "step": 12056 + }, + { + "epoch": 1.4327985739750446, + "grad_norm": 0.2785308885427549, + "learning_rate": 5.5383960880725115e-06, + "loss": 0.8272, + "num_tokens": 50395046970.0, + "step": 12057 + }, + { + "epoch": 1.4329174093879975, + "grad_norm": 0.2641422060612831, + "learning_rate": 5.537019588033059e-06, + "loss": 0.7976, + "num_tokens": 50399235492.0, + "step": 12058 + }, + { + "epoch": 1.4330362448009506, + "grad_norm": 0.26152627905068726, + "learning_rate": 5.535643290306234e-06, + "loss": 0.8375, + "num_tokens": 50403424366.0, + "step": 12059 + }, + { + "epoch": 1.4331550802139037, + "grad_norm": 0.2602678767385144, + "learning_rate": 5.534267194943017e-06, + "loss": 0.8034, + "num_tokens": 50407603680.0, + "step": 12060 + }, + { + "epoch": 1.4332739156268568, + "grad_norm": 0.24235872525466795, + "learning_rate": 5.5328913019943655e-06, + "loss": 0.7965, + "num_tokens": 50411793293.0, + "step": 12061 + }, + { + "epoch": 1.43339275103981, + "grad_norm": 0.25874337597620284, + "learning_rate": 5.531515611511235e-06, + "loss": 0.7879, + "num_tokens": 50415982637.0, + "step": 12062 + }, + { + "epoch": 1.433511586452763, + "grad_norm": 0.24834901527890885, + "learning_rate": 5.530140123544572e-06, + "loss": 0.7915, + "num_tokens": 50420157804.0, + "step": 12063 + }, + { + "epoch": 1.433630421865716, + "grad_norm": 0.28174673557220703, + "learning_rate": 5.528764838145313e-06, + "loss": 0.8127, + "num_tokens": 50424347352.0, + "step": 12064 + }, + { + "epoch": 1.433749257278669, + "grad_norm": 0.25997926124108833, + "learning_rate": 5.527389755364386e-06, + "loss": 0.8404, + "num_tokens": 50428513212.0, + "step": 12065 + }, + { + "epoch": 1.433868092691622, + "grad_norm": 0.26246485847058637, + "learning_rate": 5.526014875252723e-06, + "loss": 0.8732, + "num_tokens": 50432702225.0, + "step": 12066 + }, + { + "epoch": 1.4339869281045752, + "grad_norm": 0.2601967626095311, + "learning_rate": 5.524640197861237e-06, + "loss": 0.7877, + "num_tokens": 50436891842.0, + "step": 12067 + }, + { + "epoch": 1.4341057635175283, + "grad_norm": 0.2564902386656819, + "learning_rate": 5.523265723240842e-06, + "loss": 0.8155, + "num_tokens": 50441080851.0, + "step": 12068 + }, + { + "epoch": 1.4342245989304812, + "grad_norm": 0.2704516077646113, + "learning_rate": 5.521891451442429e-06, + "loss": 0.7954, + "num_tokens": 50445240108.0, + "step": 12069 + }, + { + "epoch": 1.4343434343434343, + "grad_norm": 0.271390985154773, + "learning_rate": 5.5205173825169015e-06, + "loss": 0.8107, + "num_tokens": 50449426949.0, + "step": 12070 + }, + { + "epoch": 1.4344622697563874, + "grad_norm": 0.2700876811766304, + "learning_rate": 5.519143516515135e-06, + "loss": 0.7973, + "num_tokens": 50453615749.0, + "step": 12071 + }, + { + "epoch": 1.4345811051693405, + "grad_norm": 0.26067380661190226, + "learning_rate": 5.5177698534880195e-06, + "loss": 0.8026, + "num_tokens": 50457767693.0, + "step": 12072 + }, + { + "epoch": 1.4346999405822936, + "grad_norm": 0.2645181750663852, + "learning_rate": 5.516396393486425e-06, + "loss": 0.7721, + "num_tokens": 50461951761.0, + "step": 12073 + }, + { + "epoch": 1.4348187759952467, + "grad_norm": 0.26122761919385773, + "learning_rate": 5.515023136561209e-06, + "loss": 0.8204, + "num_tokens": 50466107475.0, + "step": 12074 + }, + { + "epoch": 1.4349376114081998, + "grad_norm": 0.2865592826325182, + "learning_rate": 5.513650082763234e-06, + "loss": 0.8214, + "num_tokens": 50470296627.0, + "step": 12075 + }, + { + "epoch": 1.4350564468211526, + "grad_norm": 0.2999658110757485, + "learning_rate": 5.512277232143343e-06, + "loss": 0.8065, + "num_tokens": 50474486382.0, + "step": 12076 + }, + { + "epoch": 1.4351752822341057, + "grad_norm": 0.2639665046442839, + "learning_rate": 5.510904584752383e-06, + "loss": 0.7915, + "num_tokens": 50478675674.0, + "step": 12077 + }, + { + "epoch": 1.4352941176470588, + "grad_norm": 0.30530481437365875, + "learning_rate": 5.509532140641182e-06, + "loss": 0.7987, + "num_tokens": 50482860846.0, + "step": 12078 + }, + { + "epoch": 1.435412953060012, + "grad_norm": 0.2638207796498986, + "learning_rate": 5.508159899860572e-06, + "loss": 0.811, + "num_tokens": 50487050568.0, + "step": 12079 + }, + { + "epoch": 1.4355317884729648, + "grad_norm": 0.275992768850692, + "learning_rate": 5.506787862461369e-06, + "loss": 0.8119, + "num_tokens": 50491220824.0, + "step": 12080 + }, + { + "epoch": 1.435650623885918, + "grad_norm": 0.2668207228061273, + "learning_rate": 5.505416028494386e-06, + "loss": 0.8183, + "num_tokens": 50495409909.0, + "step": 12081 + }, + { + "epoch": 1.435769459298871, + "grad_norm": 0.25944613365110025, + "learning_rate": 5.5040443980104215e-06, + "loss": 0.8102, + "num_tokens": 50499555504.0, + "step": 12082 + }, + { + "epoch": 1.435888294711824, + "grad_norm": 0.29448588969510464, + "learning_rate": 5.502672971060279e-06, + "loss": 0.8535, + "num_tokens": 50503743759.0, + "step": 12083 + }, + { + "epoch": 1.4360071301247772, + "grad_norm": 0.2466402569635061, + "learning_rate": 5.5013017476947405e-06, + "loss": 0.7989, + "num_tokens": 50507932325.0, + "step": 12084 + }, + { + "epoch": 1.4361259655377303, + "grad_norm": 0.26198025359564, + "learning_rate": 5.499930727964591e-06, + "loss": 0.8373, + "num_tokens": 50512116945.0, + "step": 12085 + }, + { + "epoch": 1.4362448009506834, + "grad_norm": 0.2642174606942234, + "learning_rate": 5.498559911920602e-06, + "loss": 0.7751, + "num_tokens": 50516292379.0, + "step": 12086 + }, + { + "epoch": 1.4363636363636363, + "grad_norm": 0.26398842377517273, + "learning_rate": 5.4971892996135414e-06, + "loss": 0.8332, + "num_tokens": 50520468407.0, + "step": 12087 + }, + { + "epoch": 1.4364824717765894, + "grad_norm": 0.29003193980168773, + "learning_rate": 5.495818891094167e-06, + "loss": 0.8157, + "num_tokens": 50524656898.0, + "step": 12088 + }, + { + "epoch": 1.4366013071895425, + "grad_norm": 0.2503321853023531, + "learning_rate": 5.494448686413231e-06, + "loss": 0.8102, + "num_tokens": 50528831943.0, + "step": 12089 + }, + { + "epoch": 1.4367201426024956, + "grad_norm": 0.2820224738103627, + "learning_rate": 5.493078685621469e-06, + "loss": 0.8149, + "num_tokens": 50533022566.0, + "step": 12090 + }, + { + "epoch": 1.4368389780154487, + "grad_norm": 0.24405161635762562, + "learning_rate": 5.491708888769625e-06, + "loss": 0.8335, + "num_tokens": 50537210581.0, + "step": 12091 + }, + { + "epoch": 1.4369578134284016, + "grad_norm": 0.2876256748256383, + "learning_rate": 5.490339295908426e-06, + "loss": 0.8252, + "num_tokens": 50541398316.0, + "step": 12092 + }, + { + "epoch": 1.4370766488413547, + "grad_norm": 0.273736133418108, + "learning_rate": 5.488969907088591e-06, + "loss": 0.8526, + "num_tokens": 50545586854.0, + "step": 12093 + }, + { + "epoch": 1.4371954842543078, + "grad_norm": 0.28959697929913136, + "learning_rate": 5.4876007223608365e-06, + "loss": 0.8103, + "num_tokens": 50549775048.0, + "step": 12094 + }, + { + "epoch": 1.4373143196672609, + "grad_norm": 0.2690720845503835, + "learning_rate": 5.486231741775862e-06, + "loss": 0.7876, + "num_tokens": 50553910031.0, + "step": 12095 + }, + { + "epoch": 1.437433155080214, + "grad_norm": 0.2660770158581649, + "learning_rate": 5.4848629653843675e-06, + "loss": 0.7888, + "num_tokens": 50558070436.0, + "step": 12096 + }, + { + "epoch": 1.437551990493167, + "grad_norm": 0.26599218306176475, + "learning_rate": 5.4834943932370434e-06, + "loss": 0.7967, + "num_tokens": 50562241955.0, + "step": 12097 + }, + { + "epoch": 1.43767082590612, + "grad_norm": 0.27313880378995314, + "learning_rate": 5.482126025384576e-06, + "loss": 0.7861, + "num_tokens": 50566395674.0, + "step": 12098 + }, + { + "epoch": 1.437789661319073, + "grad_norm": 0.2832059045970045, + "learning_rate": 5.480757861877641e-06, + "loss": 0.8212, + "num_tokens": 50570583319.0, + "step": 12099 + }, + { + "epoch": 1.4379084967320261, + "grad_norm": 0.25464385666467054, + "learning_rate": 5.479389902766901e-06, + "loss": 0.8157, + "num_tokens": 50574767356.0, + "step": 12100 + }, + { + "epoch": 1.4380273321449792, + "grad_norm": 0.2803197819456296, + "learning_rate": 5.478022148103017e-06, + "loss": 0.8091, + "num_tokens": 50578918221.0, + "step": 12101 + }, + { + "epoch": 1.4381461675579323, + "grad_norm": 0.27030940432870887, + "learning_rate": 5.476654597936646e-06, + "loss": 0.7941, + "num_tokens": 50583108213.0, + "step": 12102 + }, + { + "epoch": 1.4382650029708852, + "grad_norm": 0.2565095989983707, + "learning_rate": 5.475287252318424e-06, + "loss": 0.8274, + "num_tokens": 50587297576.0, + "step": 12103 + }, + { + "epoch": 1.4383838383838383, + "grad_norm": 0.26023631306388445, + "learning_rate": 5.473920111299003e-06, + "loss": 0.7857, + "num_tokens": 50591486633.0, + "step": 12104 + }, + { + "epoch": 1.4385026737967914, + "grad_norm": 0.27426254637241654, + "learning_rate": 5.472553174929001e-06, + "loss": 0.7862, + "num_tokens": 50595635571.0, + "step": 12105 + }, + { + "epoch": 1.4386215092097445, + "grad_norm": 0.246351777889114, + "learning_rate": 5.4711864432590424e-06, + "loss": 0.7955, + "num_tokens": 50599824927.0, + "step": 12106 + }, + { + "epoch": 1.4387403446226976, + "grad_norm": 0.28231478108434566, + "learning_rate": 5.469819916339743e-06, + "loss": 0.7852, + "num_tokens": 50604012844.0, + "step": 12107 + }, + { + "epoch": 1.4388591800356507, + "grad_norm": 0.25286860523456794, + "learning_rate": 5.4684535942217094e-06, + "loss": 0.8135, + "num_tokens": 50608146733.0, + "step": 12108 + }, + { + "epoch": 1.4389780154486038, + "grad_norm": 0.25744074804598066, + "learning_rate": 5.467087476955545e-06, + "loss": 0.7801, + "num_tokens": 50612335942.0, + "step": 12109 + }, + { + "epoch": 1.4390968508615567, + "grad_norm": 0.2711551570484796, + "learning_rate": 5.465721564591837e-06, + "loss": 0.7899, + "num_tokens": 50616525060.0, + "step": 12110 + }, + { + "epoch": 1.4392156862745098, + "grad_norm": 0.2557279960420647, + "learning_rate": 5.46435585718117e-06, + "loss": 0.8238, + "num_tokens": 50620691007.0, + "step": 12111 + }, + { + "epoch": 1.439334521687463, + "grad_norm": 0.2647836014246487, + "learning_rate": 5.462990354774123e-06, + "loss": 0.8213, + "num_tokens": 50624881297.0, + "step": 12112 + }, + { + "epoch": 1.439453357100416, + "grad_norm": 0.27693070195622055, + "learning_rate": 5.461625057421265e-06, + "loss": 0.7675, + "num_tokens": 50629071130.0, + "step": 12113 + }, + { + "epoch": 1.4395721925133689, + "grad_norm": 0.2701437148163717, + "learning_rate": 5.460259965173156e-06, + "loss": 0.7958, + "num_tokens": 50633254219.0, + "step": 12114 + }, + { + "epoch": 1.439691027926322, + "grad_norm": 0.2757900931902084, + "learning_rate": 5.458895078080354e-06, + "loss": 0.8174, + "num_tokens": 50637443640.0, + "step": 12115 + }, + { + "epoch": 1.439809863339275, + "grad_norm": 0.26138123161941773, + "learning_rate": 5.457530396193396e-06, + "loss": 0.8029, + "num_tokens": 50641600117.0, + "step": 12116 + }, + { + "epoch": 1.4399286987522282, + "grad_norm": 0.2715894271618816, + "learning_rate": 5.456165919562831e-06, + "loss": 0.8175, + "num_tokens": 50645759522.0, + "step": 12117 + }, + { + "epoch": 1.4400475341651813, + "grad_norm": 0.2888467121187436, + "learning_rate": 5.454801648239185e-06, + "loss": 0.8156, + "num_tokens": 50649947377.0, + "step": 12118 + }, + { + "epoch": 1.4401663695781344, + "grad_norm": 0.26048698039025275, + "learning_rate": 5.453437582272982e-06, + "loss": 0.796, + "num_tokens": 50654136468.0, + "step": 12119 + }, + { + "epoch": 1.4402852049910875, + "grad_norm": 0.2923397976039656, + "learning_rate": 5.452073721714743e-06, + "loss": 0.8066, + "num_tokens": 50658326262.0, + "step": 12120 + }, + { + "epoch": 1.4404040404040404, + "grad_norm": 0.2680075606453221, + "learning_rate": 5.4507100666149705e-06, + "loss": 0.831, + "num_tokens": 50662504881.0, + "step": 12121 + }, + { + "epoch": 1.4405228758169935, + "grad_norm": 0.26061943662881837, + "learning_rate": 5.4493466170241604e-06, + "loss": 0.8161, + "num_tokens": 50666694320.0, + "step": 12122 + }, + { + "epoch": 1.4406417112299466, + "grad_norm": 0.2720274811279901, + "learning_rate": 5.447983372992818e-06, + "loss": 0.8563, + "num_tokens": 50670853571.0, + "step": 12123 + }, + { + "epoch": 1.4407605466428997, + "grad_norm": 0.2782358582247711, + "learning_rate": 5.446620334571421e-06, + "loss": 0.8297, + "num_tokens": 50675042511.0, + "step": 12124 + }, + { + "epoch": 1.4408793820558525, + "grad_norm": 0.280861458651684, + "learning_rate": 5.445257501810455e-06, + "loss": 0.8223, + "num_tokens": 50679215884.0, + "step": 12125 + }, + { + "epoch": 1.4409982174688056, + "grad_norm": 0.2543954583341825, + "learning_rate": 5.4438948747603784e-06, + "loss": 0.779, + "num_tokens": 50683377642.0, + "step": 12126 + }, + { + "epoch": 1.4411170528817587, + "grad_norm": 0.31069825961302233, + "learning_rate": 5.4425324534716625e-06, + "loss": 0.8176, + "num_tokens": 50687567264.0, + "step": 12127 + }, + { + "epoch": 1.4412358882947118, + "grad_norm": 0.25634203388660287, + "learning_rate": 5.44117023799476e-06, + "loss": 0.8469, + "num_tokens": 50691756700.0, + "step": 12128 + }, + { + "epoch": 1.441354723707665, + "grad_norm": 0.294848655830643, + "learning_rate": 5.439808228380114e-06, + "loss": 0.8164, + "num_tokens": 50695938108.0, + "step": 12129 + }, + { + "epoch": 1.441473559120618, + "grad_norm": 0.2692777056153208, + "learning_rate": 5.438446424678176e-06, + "loss": 0.795, + "num_tokens": 50700129084.0, + "step": 12130 + }, + { + "epoch": 1.4415923945335711, + "grad_norm": 0.2664121613333196, + "learning_rate": 5.4370848269393695e-06, + "loss": 0.8101, + "num_tokens": 50704317269.0, + "step": 12131 + }, + { + "epoch": 1.441711229946524, + "grad_norm": 0.2666178283942274, + "learning_rate": 5.435723435214117e-06, + "loss": 0.8226, + "num_tokens": 50708504368.0, + "step": 12132 + }, + { + "epoch": 1.441830065359477, + "grad_norm": 0.28496230540329615, + "learning_rate": 5.434362249552843e-06, + "loss": 0.7959, + "num_tokens": 50712693588.0, + "step": 12133 + }, + { + "epoch": 1.4419489007724302, + "grad_norm": 0.26342379448043135, + "learning_rate": 5.433001270005949e-06, + "loss": 0.8121, + "num_tokens": 50716879746.0, + "step": 12134 + }, + { + "epoch": 1.4420677361853833, + "grad_norm": 0.27019546085650087, + "learning_rate": 5.4316404966238425e-06, + "loss": 0.8087, + "num_tokens": 50721069649.0, + "step": 12135 + }, + { + "epoch": 1.4421865715983362, + "grad_norm": 0.27300599724565877, + "learning_rate": 5.430279929456914e-06, + "loss": 0.866, + "num_tokens": 50725258930.0, + "step": 12136 + }, + { + "epoch": 1.4423054070112893, + "grad_norm": 0.31500778401260954, + "learning_rate": 5.428919568555553e-06, + "loss": 0.8067, + "num_tokens": 50729426581.0, + "step": 12137 + }, + { + "epoch": 1.4424242424242424, + "grad_norm": 0.2645731357811794, + "learning_rate": 5.427559413970136e-06, + "loss": 0.8406, + "num_tokens": 50733615437.0, + "step": 12138 + }, + { + "epoch": 1.4425430778371955, + "grad_norm": 0.30530386577654606, + "learning_rate": 5.426199465751034e-06, + "loss": 0.7875, + "num_tokens": 50737804378.0, + "step": 12139 + }, + { + "epoch": 1.4426619132501486, + "grad_norm": 0.2585911725687099, + "learning_rate": 5.424839723948613e-06, + "loss": 0.7989, + "num_tokens": 50741991249.0, + "step": 12140 + }, + { + "epoch": 1.4427807486631017, + "grad_norm": 0.33135740204806924, + "learning_rate": 5.423480188613226e-06, + "loss": 0.8262, + "num_tokens": 50746178570.0, + "step": 12141 + }, + { + "epoch": 1.4428995840760548, + "grad_norm": 0.2503539731715719, + "learning_rate": 5.422120859795222e-06, + "loss": 0.8276, + "num_tokens": 50750368365.0, + "step": 12142 + }, + { + "epoch": 1.4430184194890077, + "grad_norm": 0.31265721678279373, + "learning_rate": 5.420761737544943e-06, + "loss": 0.8678, + "num_tokens": 50754556631.0, + "step": 12143 + }, + { + "epoch": 1.4431372549019608, + "grad_norm": 0.24668792910418438, + "learning_rate": 5.41940282191272e-06, + "loss": 0.7937, + "num_tokens": 50758746187.0, + "step": 12144 + }, + { + "epoch": 1.4432560903149139, + "grad_norm": 0.27565811491603326, + "learning_rate": 5.418044112948878e-06, + "loss": 0.8138, + "num_tokens": 50762936117.0, + "step": 12145 + }, + { + "epoch": 1.443374925727867, + "grad_norm": 0.24040373151907868, + "learning_rate": 5.41668561070374e-06, + "loss": 0.7789, + "num_tokens": 50767125291.0, + "step": 12146 + }, + { + "epoch": 1.4434937611408198, + "grad_norm": 0.28811616140025775, + "learning_rate": 5.415327315227608e-06, + "loss": 0.7696, + "num_tokens": 50771273198.0, + "step": 12147 + }, + { + "epoch": 1.443612596553773, + "grad_norm": 0.24630191891906097, + "learning_rate": 5.413969226570784e-06, + "loss": 0.771, + "num_tokens": 50775455652.0, + "step": 12148 + }, + { + "epoch": 1.443731431966726, + "grad_norm": 0.26687929149441925, + "learning_rate": 5.412611344783569e-06, + "loss": 0.7789, + "num_tokens": 50779644235.0, + "step": 12149 + }, + { + "epoch": 1.4438502673796791, + "grad_norm": 0.26781119065882253, + "learning_rate": 5.41125366991625e-06, + "loss": 0.8025, + "num_tokens": 50783834216.0, + "step": 12150 + }, + { + "epoch": 1.4439691027926322, + "grad_norm": 0.26375266664757124, + "learning_rate": 5.409896202019105e-06, + "loss": 0.7678, + "num_tokens": 50788023856.0, + "step": 12151 + }, + { + "epoch": 1.4440879382055853, + "grad_norm": 0.2694669827723079, + "learning_rate": 5.4085389411424e-06, + "loss": 0.8345, + "num_tokens": 50792212745.0, + "step": 12152 + }, + { + "epoch": 1.4442067736185384, + "grad_norm": 0.24416227307706143, + "learning_rate": 5.407181887336405e-06, + "loss": 0.7962, + "num_tokens": 50796373253.0, + "step": 12153 + }, + { + "epoch": 1.4443256090314913, + "grad_norm": 0.26418316690740035, + "learning_rate": 5.40582504065137e-06, + "loss": 0.868, + "num_tokens": 50800559658.0, + "step": 12154 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.254425202320473, + "learning_rate": 5.404468401137553e-06, + "loss": 0.8244, + "num_tokens": 50804748907.0, + "step": 12155 + }, + { + "epoch": 1.4445632798573975, + "grad_norm": 0.2548702818092919, + "learning_rate": 5.403111968845192e-06, + "loss": 0.7684, + "num_tokens": 50808914465.0, + "step": 12156 + }, + { + "epoch": 1.4446821152703506, + "grad_norm": 0.2869643407795001, + "learning_rate": 5.4017557438245135e-06, + "loss": 0.7928, + "num_tokens": 50813094764.0, + "step": 12157 + }, + { + "epoch": 1.4448009506833035, + "grad_norm": 0.2668066711589939, + "learning_rate": 5.400399726125751e-06, + "loss": 0.7775, + "num_tokens": 50817268963.0, + "step": 12158 + }, + { + "epoch": 1.4449197860962566, + "grad_norm": 0.25076965158185105, + "learning_rate": 5.399043915799117e-06, + "loss": 0.7999, + "num_tokens": 50821458997.0, + "step": 12159 + }, + { + "epoch": 1.4450386215092097, + "grad_norm": 0.26839451729934566, + "learning_rate": 5.3976883128948225e-06, + "loss": 0.8209, + "num_tokens": 50825619943.0, + "step": 12160 + }, + { + "epoch": 1.4451574569221628, + "grad_norm": 0.2699218891806099, + "learning_rate": 5.396332917463077e-06, + "loss": 0.8229, + "num_tokens": 50829779603.0, + "step": 12161 + }, + { + "epoch": 1.445276292335116, + "grad_norm": 0.260522637421517, + "learning_rate": 5.394977729554067e-06, + "loss": 0.835, + "num_tokens": 50833967832.0, + "step": 12162 + }, + { + "epoch": 1.445395127748069, + "grad_norm": 0.27289314456742986, + "learning_rate": 5.393622749217984e-06, + "loss": 0.8084, + "num_tokens": 50838156813.0, + "step": 12163 + }, + { + "epoch": 1.445513963161022, + "grad_norm": 0.26152094032232, + "learning_rate": 5.392267976505004e-06, + "loss": 0.8286, + "num_tokens": 50842345707.0, + "step": 12164 + }, + { + "epoch": 1.445632798573975, + "grad_norm": 0.2475782378731224, + "learning_rate": 5.390913411465304e-06, + "loss": 0.7964, + "num_tokens": 50846536108.0, + "step": 12165 + }, + { + "epoch": 1.445751633986928, + "grad_norm": 0.24328013035613721, + "learning_rate": 5.389559054149043e-06, + "loss": 0.8233, + "num_tokens": 50850727106.0, + "step": 12166 + }, + { + "epoch": 1.4458704693998812, + "grad_norm": 0.2643162524551968, + "learning_rate": 5.38820490460638e-06, + "loss": 0.805, + "num_tokens": 50854918474.0, + "step": 12167 + }, + { + "epoch": 1.4459893048128343, + "grad_norm": 0.24742988575388616, + "learning_rate": 5.3868509628874625e-06, + "loss": 0.8103, + "num_tokens": 50859108911.0, + "step": 12168 + }, + { + "epoch": 1.4461081402257872, + "grad_norm": 0.23822830535772407, + "learning_rate": 5.385497229042433e-06, + "loss": 0.7936, + "num_tokens": 50863297205.0, + "step": 12169 + }, + { + "epoch": 1.4462269756387403, + "grad_norm": 0.2634050186163949, + "learning_rate": 5.3841437031214235e-06, + "loss": 0.8204, + "num_tokens": 50867463011.0, + "step": 12170 + }, + { + "epoch": 1.4463458110516934, + "grad_norm": 0.25511633276277373, + "learning_rate": 5.382790385174561e-06, + "loss": 0.7861, + "num_tokens": 50871604470.0, + "step": 12171 + }, + { + "epoch": 1.4464646464646465, + "grad_norm": 0.251096575916212, + "learning_rate": 5.381437275251964e-06, + "loss": 0.8124, + "num_tokens": 50875758251.0, + "step": 12172 + }, + { + "epoch": 1.4465834818775996, + "grad_norm": 0.269328996361206, + "learning_rate": 5.38008437340374e-06, + "loss": 0.8139, + "num_tokens": 50879947347.0, + "step": 12173 + }, + { + "epoch": 1.4467023172905527, + "grad_norm": 0.26689022141906504, + "learning_rate": 5.378731679679994e-06, + "loss": 0.8461, + "num_tokens": 50884102177.0, + "step": 12174 + }, + { + "epoch": 1.4468211527035058, + "grad_norm": 0.24405380092423115, + "learning_rate": 5.377379194130818e-06, + "loss": 0.7742, + "num_tokens": 50888273774.0, + "step": 12175 + }, + { + "epoch": 1.4469399881164586, + "grad_norm": 0.2672250135189956, + "learning_rate": 5.376026916806303e-06, + "loss": 0.7988, + "num_tokens": 50892432572.0, + "step": 12176 + }, + { + "epoch": 1.4470588235294117, + "grad_norm": 0.2834775820710852, + "learning_rate": 5.3746748477565295e-06, + "loss": 0.8459, + "num_tokens": 50896621800.0, + "step": 12177 + }, + { + "epoch": 1.4471776589423648, + "grad_norm": 0.26393721250767904, + "learning_rate": 5.373322987031561e-06, + "loss": 0.8054, + "num_tokens": 50900777827.0, + "step": 12178 + }, + { + "epoch": 1.447296494355318, + "grad_norm": 0.2729710385144335, + "learning_rate": 5.371971334681464e-06, + "loss": 0.8329, + "num_tokens": 50904933625.0, + "step": 12179 + }, + { + "epoch": 1.4474153297682708, + "grad_norm": 0.26659007660019907, + "learning_rate": 5.370619890756301e-06, + "loss": 0.8168, + "num_tokens": 50909119370.0, + "step": 12180 + }, + { + "epoch": 1.447534165181224, + "grad_norm": 0.2649879267982672, + "learning_rate": 5.369268655306117e-06, + "loss": 0.7863, + "num_tokens": 50913308315.0, + "step": 12181 + }, + { + "epoch": 1.447653000594177, + "grad_norm": 0.2760512745744179, + "learning_rate": 5.367917628380956e-06, + "loss": 0.7884, + "num_tokens": 50917496602.0, + "step": 12182 + }, + { + "epoch": 1.44777183600713, + "grad_norm": 0.26108655648242185, + "learning_rate": 5.366566810030844e-06, + "loss": 0.804, + "num_tokens": 50921685768.0, + "step": 12183 + }, + { + "epoch": 1.4478906714200832, + "grad_norm": 0.2997996452916814, + "learning_rate": 5.36521620030581e-06, + "loss": 0.8429, + "num_tokens": 50925855871.0, + "step": 12184 + }, + { + "epoch": 1.4480095068330363, + "grad_norm": 0.2773134191402478, + "learning_rate": 5.3638657992558714e-06, + "loss": 0.8666, + "num_tokens": 50930027750.0, + "step": 12185 + }, + { + "epoch": 1.4481283422459894, + "grad_norm": 0.2859362807329837, + "learning_rate": 5.362515606931038e-06, + "loss": 0.8159, + "num_tokens": 50934216691.0, + "step": 12186 + }, + { + "epoch": 1.4482471776589423, + "grad_norm": 0.2715711311979965, + "learning_rate": 5.361165623381315e-06, + "loss": 0.7965, + "num_tokens": 50938407168.0, + "step": 12187 + }, + { + "epoch": 1.4483660130718954, + "grad_norm": 0.30022418496708575, + "learning_rate": 5.359815848656694e-06, + "loss": 0.7997, + "num_tokens": 50942596708.0, + "step": 12188 + }, + { + "epoch": 1.4484848484848485, + "grad_norm": 0.25833972263495536, + "learning_rate": 5.358466282807161e-06, + "loss": 0.7956, + "num_tokens": 50946785770.0, + "step": 12189 + }, + { + "epoch": 1.4486036838978016, + "grad_norm": 0.2632101125308082, + "learning_rate": 5.357116925882695e-06, + "loss": 0.8002, + "num_tokens": 50950973733.0, + "step": 12190 + }, + { + "epoch": 1.4487225193107547, + "grad_norm": 0.2510546836140584, + "learning_rate": 5.355767777933269e-06, + "loss": 0.789, + "num_tokens": 50955163060.0, + "step": 12191 + }, + { + "epoch": 1.4488413547237076, + "grad_norm": 0.2766865166775743, + "learning_rate": 5.354418839008846e-06, + "loss": 0.7876, + "num_tokens": 50959339964.0, + "step": 12192 + }, + { + "epoch": 1.4489601901366607, + "grad_norm": 0.2543945332363889, + "learning_rate": 5.3530701091593815e-06, + "loss": 0.8166, + "num_tokens": 50963526164.0, + "step": 12193 + }, + { + "epoch": 1.4490790255496138, + "grad_norm": 0.27124407816896934, + "learning_rate": 5.351721588434824e-06, + "loss": 0.7712, + "num_tokens": 50967681623.0, + "step": 12194 + }, + { + "epoch": 1.4491978609625669, + "grad_norm": 0.2624160542845972, + "learning_rate": 5.3503732768851135e-06, + "loss": 0.8231, + "num_tokens": 50971871452.0, + "step": 12195 + }, + { + "epoch": 1.44931669637552, + "grad_norm": 0.24800174550777643, + "learning_rate": 5.349025174560183e-06, + "loss": 0.8151, + "num_tokens": 50976059920.0, + "step": 12196 + }, + { + "epoch": 1.449435531788473, + "grad_norm": 0.2981659187670616, + "learning_rate": 5.347677281509954e-06, + "loss": 0.8185, + "num_tokens": 50980248667.0, + "step": 12197 + }, + { + "epoch": 1.449554367201426, + "grad_norm": 0.2463400920640623, + "learning_rate": 5.346329597784349e-06, + "loss": 0.7941, + "num_tokens": 50984407172.0, + "step": 12198 + }, + { + "epoch": 1.449673202614379, + "grad_norm": 0.28046779169451874, + "learning_rate": 5.3449821234332736e-06, + "loss": 0.7939, + "num_tokens": 50988596051.0, + "step": 12199 + }, + { + "epoch": 1.4497920380273321, + "grad_norm": 0.2551937599237019, + "learning_rate": 5.34363485850663e-06, + "loss": 0.7915, + "num_tokens": 50992786307.0, + "step": 12200 + }, + { + "epoch": 1.4499108734402852, + "grad_norm": 0.2604966926600904, + "learning_rate": 5.342287803054311e-06, + "loss": 0.8118, + "num_tokens": 50996975580.0, + "step": 12201 + }, + { + "epoch": 1.4500297088532383, + "grad_norm": 0.24562492498706745, + "learning_rate": 5.3409409571262064e-06, + "loss": 0.8098, + "num_tokens": 51001162172.0, + "step": 12202 + }, + { + "epoch": 1.4501485442661912, + "grad_norm": 0.24814910653583913, + "learning_rate": 5.33959432077219e-06, + "loss": 0.7869, + "num_tokens": 51005349165.0, + "step": 12203 + }, + { + "epoch": 1.4502673796791443, + "grad_norm": 0.24638671984231045, + "learning_rate": 5.338247894042138e-06, + "loss": 0.8114, + "num_tokens": 51009512131.0, + "step": 12204 + }, + { + "epoch": 1.4503862150920974, + "grad_norm": 0.2466000693767029, + "learning_rate": 5.3369016769859026e-06, + "loss": 0.8472, + "num_tokens": 51013685815.0, + "step": 12205 + }, + { + "epoch": 1.4505050505050505, + "grad_norm": 0.25304194557119336, + "learning_rate": 5.33555566965335e-06, + "loss": 0.836, + "num_tokens": 51017853952.0, + "step": 12206 + }, + { + "epoch": 1.4506238859180036, + "grad_norm": 0.2609377360550516, + "learning_rate": 5.334209872094321e-06, + "loss": 0.8114, + "num_tokens": 51021998310.0, + "step": 12207 + }, + { + "epoch": 1.4507427213309567, + "grad_norm": 0.25206419626168824, + "learning_rate": 5.3328642843586565e-06, + "loss": 0.7916, + "num_tokens": 51026187914.0, + "step": 12208 + }, + { + "epoch": 1.4508615567439098, + "grad_norm": 0.27270395398313974, + "learning_rate": 5.331518906496194e-06, + "loss": 0.7644, + "num_tokens": 51030376757.0, + "step": 12209 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 0.2817845710829915, + "learning_rate": 5.330173738556748e-06, + "loss": 0.8406, + "num_tokens": 51034565769.0, + "step": 12210 + }, + { + "epoch": 1.4510992275698158, + "grad_norm": 0.2518731877163842, + "learning_rate": 5.328828780590135e-06, + "loss": 0.7875, + "num_tokens": 51038755420.0, + "step": 12211 + }, + { + "epoch": 1.451218062982769, + "grad_norm": 0.2708548278833114, + "learning_rate": 5.327484032646171e-06, + "loss": 0.8046, + "num_tokens": 51042943953.0, + "step": 12212 + }, + { + "epoch": 1.451336898395722, + "grad_norm": 0.2698716434337383, + "learning_rate": 5.3261394947746515e-06, + "loss": 0.8252, + "num_tokens": 51047109407.0, + "step": 12213 + }, + { + "epoch": 1.4514557338086749, + "grad_norm": 0.2771977810492864, + "learning_rate": 5.324795167025374e-06, + "loss": 0.8024, + "num_tokens": 51051273372.0, + "step": 12214 + }, + { + "epoch": 1.451574569221628, + "grad_norm": 0.278123478374008, + "learning_rate": 5.3234510494481166e-06, + "loss": 0.8231, + "num_tokens": 51055444499.0, + "step": 12215 + }, + { + "epoch": 1.451693404634581, + "grad_norm": 0.2895004038817753, + "learning_rate": 5.322107142092659e-06, + "loss": 0.8096, + "num_tokens": 51059634110.0, + "step": 12216 + }, + { + "epoch": 1.4518122400475342, + "grad_norm": 0.2745423541117063, + "learning_rate": 5.320763445008772e-06, + "loss": 0.8136, + "num_tokens": 51063822120.0, + "step": 12217 + }, + { + "epoch": 1.4519310754604873, + "grad_norm": 0.28726334107346396, + "learning_rate": 5.319419958246214e-06, + "loss": 0.8133, + "num_tokens": 51068010396.0, + "step": 12218 + }, + { + "epoch": 1.4520499108734404, + "grad_norm": 0.2650204475392127, + "learning_rate": 5.3180766818547456e-06, + "loss": 0.7623, + "num_tokens": 51072199229.0, + "step": 12219 + }, + { + "epoch": 1.4521687462863935, + "grad_norm": 0.26537886513077275, + "learning_rate": 5.3167336158841075e-06, + "loss": 0.859, + "num_tokens": 51076354632.0, + "step": 12220 + }, + { + "epoch": 1.4522875816993464, + "grad_norm": 0.2628726252827571, + "learning_rate": 5.315390760384038e-06, + "loss": 0.8521, + "num_tokens": 51080530759.0, + "step": 12221 + }, + { + "epoch": 1.4524064171122995, + "grad_norm": 0.2844619495269138, + "learning_rate": 5.3140481154042715e-06, + "loss": 0.8508, + "num_tokens": 51084662505.0, + "step": 12222 + }, + { + "epoch": 1.4525252525252526, + "grad_norm": 0.26764810096212766, + "learning_rate": 5.312705680994525e-06, + "loss": 0.8399, + "num_tokens": 51088851461.0, + "step": 12223 + }, + { + "epoch": 1.4526440879382057, + "grad_norm": 0.2675422896221078, + "learning_rate": 5.311363457204517e-06, + "loss": 0.8224, + "num_tokens": 51093019711.0, + "step": 12224 + }, + { + "epoch": 1.4527629233511585, + "grad_norm": 0.30193222775136724, + "learning_rate": 5.310021444083955e-06, + "loss": 0.8173, + "num_tokens": 51097210157.0, + "step": 12225 + }, + { + "epoch": 1.4528817587641116, + "grad_norm": 0.2657915776073709, + "learning_rate": 5.308679641682536e-06, + "loss": 0.8513, + "num_tokens": 51101400911.0, + "step": 12226 + }, + { + "epoch": 1.4530005941770647, + "grad_norm": 0.2857876768495877, + "learning_rate": 5.307338050049953e-06, + "loss": 0.7945, + "num_tokens": 51105567705.0, + "step": 12227 + }, + { + "epoch": 1.4531194295900178, + "grad_norm": 0.27039704256859215, + "learning_rate": 5.305996669235889e-06, + "loss": 0.8097, + "num_tokens": 51109756613.0, + "step": 12228 + }, + { + "epoch": 1.453238265002971, + "grad_norm": 0.2986180065398808, + "learning_rate": 5.304655499290021e-06, + "loss": 0.7625, + "num_tokens": 51113937548.0, + "step": 12229 + }, + { + "epoch": 1.453357100415924, + "grad_norm": 0.2785219482160312, + "learning_rate": 5.3033145402620164e-06, + "loss": 0.8049, + "num_tokens": 51118126386.0, + "step": 12230 + }, + { + "epoch": 1.4534759358288771, + "grad_norm": 0.28894600442819374, + "learning_rate": 5.301973792201534e-06, + "loss": 0.8188, + "num_tokens": 51122314452.0, + "step": 12231 + }, + { + "epoch": 1.45359477124183, + "grad_norm": 0.2669661384213706, + "learning_rate": 5.300633255158227e-06, + "loss": 0.8315, + "num_tokens": 51126503763.0, + "step": 12232 + }, + { + "epoch": 1.453713606654783, + "grad_norm": 0.27790729056895236, + "learning_rate": 5.299292929181743e-06, + "loss": 0.8323, + "num_tokens": 51130673277.0, + "step": 12233 + }, + { + "epoch": 1.4538324420677362, + "grad_norm": 0.24437333688209018, + "learning_rate": 5.297952814321715e-06, + "loss": 0.78, + "num_tokens": 51134863235.0, + "step": 12234 + }, + { + "epoch": 1.4539512774806893, + "grad_norm": 0.2710571591522676, + "learning_rate": 5.2966129106277775e-06, + "loss": 0.7863, + "num_tokens": 51139053598.0, + "step": 12235 + }, + { + "epoch": 1.4540701128936422, + "grad_norm": 0.26569623590254254, + "learning_rate": 5.295273218149544e-06, + "loss": 0.8491, + "num_tokens": 51143212571.0, + "step": 12236 + }, + { + "epoch": 1.4541889483065953, + "grad_norm": 0.25214163602863326, + "learning_rate": 5.293933736936628e-06, + "loss": 0.7793, + "num_tokens": 51147402173.0, + "step": 12237 + }, + { + "epoch": 1.4543077837195484, + "grad_norm": 0.2761254346079013, + "learning_rate": 5.292594467038641e-06, + "loss": 0.8249, + "num_tokens": 51151567687.0, + "step": 12238 + }, + { + "epoch": 1.4544266191325015, + "grad_norm": 0.25321664734768234, + "learning_rate": 5.291255408505179e-06, + "loss": 0.794, + "num_tokens": 51155757027.0, + "step": 12239 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.27142436424416905, + "learning_rate": 5.289916561385833e-06, + "loss": 0.792, + "num_tokens": 51159944678.0, + "step": 12240 + }, + { + "epoch": 1.4546642899584077, + "grad_norm": 0.2755832412049007, + "learning_rate": 5.2885779257301804e-06, + "loss": 0.8057, + "num_tokens": 51164133776.0, + "step": 12241 + }, + { + "epoch": 1.4547831253713608, + "grad_norm": 0.2653431705115399, + "learning_rate": 5.2872395015877985e-06, + "loss": 0.798, + "num_tokens": 51168307376.0, + "step": 12242 + }, + { + "epoch": 1.4549019607843137, + "grad_norm": 0.2538147835288805, + "learning_rate": 5.28590128900825e-06, + "loss": 0.7858, + "num_tokens": 51172475789.0, + "step": 12243 + }, + { + "epoch": 1.4550207961972668, + "grad_norm": 0.26631008319367866, + "learning_rate": 5.284563288041099e-06, + "loss": 0.8238, + "num_tokens": 51176664548.0, + "step": 12244 + }, + { + "epoch": 1.4551396316102199, + "grad_norm": 0.25437539490361644, + "learning_rate": 5.283225498735898e-06, + "loss": 0.8067, + "num_tokens": 51180853516.0, + "step": 12245 + }, + { + "epoch": 1.455258467023173, + "grad_norm": 0.2733309965505787, + "learning_rate": 5.281887921142181e-06, + "loss": 0.8024, + "num_tokens": 51185042710.0, + "step": 12246 + }, + { + "epoch": 1.4553773024361258, + "grad_norm": 0.27425717856600035, + "learning_rate": 5.280550555309489e-06, + "loss": 0.8131, + "num_tokens": 51189215353.0, + "step": 12247 + }, + { + "epoch": 1.455496137849079, + "grad_norm": 0.2637951708522803, + "learning_rate": 5.279213401287347e-06, + "loss": 0.8457, + "num_tokens": 51193404796.0, + "step": 12248 + }, + { + "epoch": 1.455614973262032, + "grad_norm": 0.29093784280682655, + "learning_rate": 5.277876459125273e-06, + "loss": 0.8241, + "num_tokens": 51197592871.0, + "step": 12249 + }, + { + "epoch": 1.4557338086749851, + "grad_norm": 0.24901143703311684, + "learning_rate": 5.276539728872785e-06, + "loss": 0.8669, + "num_tokens": 51201781888.0, + "step": 12250 + }, + { + "epoch": 1.4558526440879382, + "grad_norm": 0.27364648644999556, + "learning_rate": 5.27520321057938e-06, + "loss": 0.7809, + "num_tokens": 51205972547.0, + "step": 12251 + }, + { + "epoch": 1.4559714795008913, + "grad_norm": 0.2559326433743828, + "learning_rate": 5.273866904294558e-06, + "loss": 0.8061, + "num_tokens": 51210162412.0, + "step": 12252 + }, + { + "epoch": 1.4560903149138444, + "grad_norm": 0.2725588168555315, + "learning_rate": 5.272530810067802e-06, + "loss": 0.8202, + "num_tokens": 51214353173.0, + "step": 12253 + }, + { + "epoch": 1.4562091503267973, + "grad_norm": 0.32450328938290224, + "learning_rate": 5.271194927948598e-06, + "loss": 0.7802, + "num_tokens": 51218542789.0, + "step": 12254 + }, + { + "epoch": 1.4563279857397504, + "grad_norm": 0.2380737679674007, + "learning_rate": 5.269859257986414e-06, + "loss": 0.7983, + "num_tokens": 51222700384.0, + "step": 12255 + }, + { + "epoch": 1.4564468211527035, + "grad_norm": 0.30246865085506414, + "learning_rate": 5.268523800230716e-06, + "loss": 0.8355, + "num_tokens": 51226888969.0, + "step": 12256 + }, + { + "epoch": 1.4565656565656566, + "grad_norm": 0.2550724482407282, + "learning_rate": 5.26718855473096e-06, + "loss": 0.8018, + "num_tokens": 51231078399.0, + "step": 12257 + }, + { + "epoch": 1.4566844919786095, + "grad_norm": 0.2865104829347618, + "learning_rate": 5.265853521536595e-06, + "loss": 0.825, + "num_tokens": 51235266342.0, + "step": 12258 + }, + { + "epoch": 1.4568033273915626, + "grad_norm": 0.252034075404731, + "learning_rate": 5.264518700697062e-06, + "loss": 0.7826, + "num_tokens": 51239427166.0, + "step": 12259 + }, + { + "epoch": 1.4569221628045157, + "grad_norm": 0.29097315568065146, + "learning_rate": 5.263184092261793e-06, + "loss": 0.7927, + "num_tokens": 51243617378.0, + "step": 12260 + }, + { + "epoch": 1.4570409982174688, + "grad_norm": 0.271546334238656, + "learning_rate": 5.261849696280217e-06, + "loss": 0.8344, + "num_tokens": 51247806457.0, + "step": 12261 + }, + { + "epoch": 1.457159833630422, + "grad_norm": 0.2686978598093981, + "learning_rate": 5.260515512801742e-06, + "loss": 0.8073, + "num_tokens": 51251994639.0, + "step": 12262 + }, + { + "epoch": 1.457278669043375, + "grad_norm": 0.2503060760651089, + "learning_rate": 5.259181541875787e-06, + "loss": 0.8159, + "num_tokens": 51256183686.0, + "step": 12263 + }, + { + "epoch": 1.457397504456328, + "grad_norm": 0.26080534362596597, + "learning_rate": 5.257847783551748e-06, + "loss": 0.7939, + "num_tokens": 51260374618.0, + "step": 12264 + }, + { + "epoch": 1.457516339869281, + "grad_norm": 0.2717337204100778, + "learning_rate": 5.2565142378790214e-06, + "loss": 0.8447, + "num_tokens": 51264534870.0, + "step": 12265 + }, + { + "epoch": 1.457635175282234, + "grad_norm": 0.25369483379828583, + "learning_rate": 5.255180904906995e-06, + "loss": 0.805, + "num_tokens": 51268723399.0, + "step": 12266 + }, + { + "epoch": 1.4577540106951872, + "grad_norm": 0.2647506356238312, + "learning_rate": 5.25384778468504e-06, + "loss": 0.7828, + "num_tokens": 51272882780.0, + "step": 12267 + }, + { + "epoch": 1.4578728461081403, + "grad_norm": 0.24701363315525512, + "learning_rate": 5.252514877262529e-06, + "loss": 0.8355, + "num_tokens": 51277071943.0, + "step": 12268 + }, + { + "epoch": 1.4579916815210932, + "grad_norm": 0.27941378117082677, + "learning_rate": 5.251182182688823e-06, + "loss": 0.8216, + "num_tokens": 51281261963.0, + "step": 12269 + }, + { + "epoch": 1.4581105169340463, + "grad_norm": 0.26846778636111956, + "learning_rate": 5.24984970101328e-06, + "loss": 0.7973, + "num_tokens": 51285451320.0, + "step": 12270 + }, + { + "epoch": 1.4582293523469994, + "grad_norm": 0.261453980433397, + "learning_rate": 5.248517432285247e-06, + "loss": 0.8418, + "num_tokens": 51289637381.0, + "step": 12271 + }, + { + "epoch": 1.4583481877599525, + "grad_norm": 0.2592815226274692, + "learning_rate": 5.2471853765540585e-06, + "loss": 0.78, + "num_tokens": 51293826743.0, + "step": 12272 + }, + { + "epoch": 1.4584670231729056, + "grad_norm": 0.25499091956456554, + "learning_rate": 5.2458535338690456e-06, + "loss": 0.8392, + "num_tokens": 51298015073.0, + "step": 12273 + }, + { + "epoch": 1.4585858585858587, + "grad_norm": 0.2469332408644645, + "learning_rate": 5.24452190427953e-06, + "loss": 0.8178, + "num_tokens": 51302203275.0, + "step": 12274 + }, + { + "epoch": 1.4587046939988118, + "grad_norm": 0.25964862110756004, + "learning_rate": 5.243190487834826e-06, + "loss": 0.8277, + "num_tokens": 51306392479.0, + "step": 12275 + }, + { + "epoch": 1.4588235294117646, + "grad_norm": 0.2572430986269402, + "learning_rate": 5.24185928458425e-06, + "loss": 0.8041, + "num_tokens": 51310581689.0, + "step": 12276 + }, + { + "epoch": 1.4589423648247177, + "grad_norm": 0.2738585808592954, + "learning_rate": 5.240528294577087e-06, + "loss": 0.8182, + "num_tokens": 51314770902.0, + "step": 12277 + }, + { + "epoch": 1.4590612002376708, + "grad_norm": 0.27113388054017906, + "learning_rate": 5.239197517862638e-06, + "loss": 0.8128, + "num_tokens": 51318954277.0, + "step": 12278 + }, + { + "epoch": 1.459180035650624, + "grad_norm": 0.26505092040334344, + "learning_rate": 5.237866954490181e-06, + "loss": 0.8132, + "num_tokens": 51323143856.0, + "step": 12279 + }, + { + "epoch": 1.4592988710635768, + "grad_norm": 0.25400707816804496, + "learning_rate": 5.236536604508993e-06, + "loss": 0.821, + "num_tokens": 51327333496.0, + "step": 12280 + }, + { + "epoch": 1.45941770647653, + "grad_norm": 0.27592921787830504, + "learning_rate": 5.235206467968341e-06, + "loss": 0.8076, + "num_tokens": 51331522283.0, + "step": 12281 + }, + { + "epoch": 1.459536541889483, + "grad_norm": 0.2903895856655764, + "learning_rate": 5.233876544917485e-06, + "loss": 0.7891, + "num_tokens": 51335706085.0, + "step": 12282 + }, + { + "epoch": 1.459655377302436, + "grad_norm": 0.27937768063751056, + "learning_rate": 5.232546835405676e-06, + "loss": 0.8292, + "num_tokens": 51339894198.0, + "step": 12283 + }, + { + "epoch": 1.4597742127153892, + "grad_norm": 0.27197907635856783, + "learning_rate": 5.231217339482159e-06, + "loss": 0.8089, + "num_tokens": 51344052388.0, + "step": 12284 + }, + { + "epoch": 1.4598930481283423, + "grad_norm": 0.2534826614202449, + "learning_rate": 5.229888057196168e-06, + "loss": 0.8132, + "num_tokens": 51348191193.0, + "step": 12285 + }, + { + "epoch": 1.4600118835412954, + "grad_norm": 0.2570203769781213, + "learning_rate": 5.228558988596931e-06, + "loss": 0.8181, + "num_tokens": 51352379600.0, + "step": 12286 + }, + { + "epoch": 1.4601307189542483, + "grad_norm": 0.2772351315597316, + "learning_rate": 5.227230133733672e-06, + "loss": 0.805, + "num_tokens": 51356567953.0, + "step": 12287 + }, + { + "epoch": 1.4602495543672014, + "grad_norm": 0.2513834230418863, + "learning_rate": 5.225901492655592e-06, + "loss": 0.8248, + "num_tokens": 51360727370.0, + "step": 12288 + }, + { + "epoch": 1.4603683897801545, + "grad_norm": 0.2805447952752973, + "learning_rate": 5.224573065411906e-06, + "loss": 0.8432, + "num_tokens": 51364917197.0, + "step": 12289 + }, + { + "epoch": 1.4604872251931076, + "grad_norm": 0.2699223493306429, + "learning_rate": 5.223244852051808e-06, + "loss": 0.7878, + "num_tokens": 51369082329.0, + "step": 12290 + }, + { + "epoch": 1.4606060606060607, + "grad_norm": 0.24428259229382387, + "learning_rate": 5.221916852624485e-06, + "loss": 0.8108, + "num_tokens": 51373262230.0, + "step": 12291 + }, + { + "epoch": 1.4607248960190136, + "grad_norm": 0.2619298872742858, + "learning_rate": 5.2205890671791185e-06, + "loss": 0.8162, + "num_tokens": 51377381261.0, + "step": 12292 + }, + { + "epoch": 1.4608437314319667, + "grad_norm": 0.2390889913387031, + "learning_rate": 5.219261495764876e-06, + "loss": 0.8142, + "num_tokens": 51381570199.0, + "step": 12293 + }, + { + "epoch": 1.4609625668449198, + "grad_norm": 0.2508360802183312, + "learning_rate": 5.217934138430924e-06, + "loss": 0.8067, + "num_tokens": 51385754349.0, + "step": 12294 + }, + { + "epoch": 1.4610814022578729, + "grad_norm": 0.2509492677422197, + "learning_rate": 5.216606995226422e-06, + "loss": 0.797, + "num_tokens": 51389943681.0, + "step": 12295 + }, + { + "epoch": 1.461200237670826, + "grad_norm": 0.2612017064893654, + "learning_rate": 5.215280066200519e-06, + "loss": 0.8053, + "num_tokens": 51394131729.0, + "step": 12296 + }, + { + "epoch": 1.461319073083779, + "grad_norm": 0.25438000303890784, + "learning_rate": 5.213953351402355e-06, + "loss": 0.8026, + "num_tokens": 51398300607.0, + "step": 12297 + }, + { + "epoch": 1.4614379084967322, + "grad_norm": 0.24569330936192504, + "learning_rate": 5.21262685088106e-06, + "loss": 0.8332, + "num_tokens": 51402488555.0, + "step": 12298 + }, + { + "epoch": 1.461556743909685, + "grad_norm": 0.24088042047454364, + "learning_rate": 5.211300564685758e-06, + "loss": 0.8029, + "num_tokens": 51406677186.0, + "step": 12299 + }, + { + "epoch": 1.4616755793226381, + "grad_norm": 0.2781425405866376, + "learning_rate": 5.209974492865567e-06, + "loss": 0.8505, + "num_tokens": 51410847769.0, + "step": 12300 + }, + { + "epoch": 1.4617944147355912, + "grad_norm": 0.28716923775322617, + "learning_rate": 5.2086486354696e-06, + "loss": 0.8116, + "num_tokens": 51415018126.0, + "step": 12301 + }, + { + "epoch": 1.4619132501485443, + "grad_norm": 0.24829428533551845, + "learning_rate": 5.207322992546957e-06, + "loss": 0.8155, + "num_tokens": 51419198711.0, + "step": 12302 + }, + { + "epoch": 1.4620320855614972, + "grad_norm": 0.28690950449791514, + "learning_rate": 5.205997564146727e-06, + "loss": 0.8094, + "num_tokens": 51423363094.0, + "step": 12303 + }, + { + "epoch": 1.4621509209744503, + "grad_norm": 0.24504408665687225, + "learning_rate": 5.204672350317997e-06, + "loss": 0.786, + "num_tokens": 51427548598.0, + "step": 12304 + }, + { + "epoch": 1.4622697563874034, + "grad_norm": 0.26798899052266484, + "learning_rate": 5.203347351109844e-06, + "loss": 0.7965, + "num_tokens": 51431736242.0, + "step": 12305 + }, + { + "epoch": 1.4623885918003565, + "grad_norm": 0.24471240437828393, + "learning_rate": 5.202022566571338e-06, + "loss": 0.8091, + "num_tokens": 51435925678.0, + "step": 12306 + }, + { + "epoch": 1.4625074272133096, + "grad_norm": 0.2738705738232723, + "learning_rate": 5.20069799675154e-06, + "loss": 0.8212, + "num_tokens": 51440114137.0, + "step": 12307 + }, + { + "epoch": 1.4626262626262627, + "grad_norm": 0.2483785099339906, + "learning_rate": 5.199373641699504e-06, + "loss": 0.8352, + "num_tokens": 51444299559.0, + "step": 12308 + }, + { + "epoch": 1.4627450980392158, + "grad_norm": 0.257860701850093, + "learning_rate": 5.198049501464271e-06, + "loss": 0.8274, + "num_tokens": 51448488878.0, + "step": 12309 + }, + { + "epoch": 1.4628639334521687, + "grad_norm": 0.24428853288587762, + "learning_rate": 5.196725576094886e-06, + "loss": 0.8252, + "num_tokens": 51452677679.0, + "step": 12310 + }, + { + "epoch": 1.4629827688651218, + "grad_norm": 0.26092810870681343, + "learning_rate": 5.195401865640372e-06, + "loss": 0.7946, + "num_tokens": 51456851513.0, + "step": 12311 + }, + { + "epoch": 1.463101604278075, + "grad_norm": 0.24010749839835993, + "learning_rate": 5.194078370149753e-06, + "loss": 0.774, + "num_tokens": 51461040243.0, + "step": 12312 + }, + { + "epoch": 1.463220439691028, + "grad_norm": 0.24967431175222593, + "learning_rate": 5.1927550896720416e-06, + "loss": 0.7953, + "num_tokens": 51465229114.0, + "step": 12313 + }, + { + "epoch": 1.4633392751039809, + "grad_norm": 0.25604726644761566, + "learning_rate": 5.1914320242562445e-06, + "loss": 0.7832, + "num_tokens": 51469388318.0, + "step": 12314 + }, + { + "epoch": 1.463458110516934, + "grad_norm": 0.2406179928109818, + "learning_rate": 5.190109173951358e-06, + "loss": 0.8208, + "num_tokens": 51473576829.0, + "step": 12315 + }, + { + "epoch": 1.463576945929887, + "grad_norm": 0.24719942226555802, + "learning_rate": 5.188786538806374e-06, + "loss": 0.7837, + "num_tokens": 51477744087.0, + "step": 12316 + }, + { + "epoch": 1.4636957813428402, + "grad_norm": 0.2715151971264872, + "learning_rate": 5.187464118870271e-06, + "loss": 0.8487, + "num_tokens": 51481919654.0, + "step": 12317 + }, + { + "epoch": 1.4638146167557933, + "grad_norm": 0.25468677535606365, + "learning_rate": 5.186141914192027e-06, + "loss": 0.8083, + "num_tokens": 51486088182.0, + "step": 12318 + }, + { + "epoch": 1.4639334521687464, + "grad_norm": 0.2586294288993484, + "learning_rate": 5.1848199248206036e-06, + "loss": 0.7514, + "num_tokens": 51490277939.0, + "step": 12319 + }, + { + "epoch": 1.4640522875816995, + "grad_norm": 0.24931278528484352, + "learning_rate": 5.18349815080496e-06, + "loss": 0.8084, + "num_tokens": 51494462529.0, + "step": 12320 + }, + { + "epoch": 1.4641711229946524, + "grad_norm": 0.2704630440534478, + "learning_rate": 5.182176592194045e-06, + "loss": 0.8103, + "num_tokens": 51498651554.0, + "step": 12321 + }, + { + "epoch": 1.4642899584076055, + "grad_norm": 0.2513602813333469, + "learning_rate": 5.180855249036802e-06, + "loss": 0.8286, + "num_tokens": 51502839620.0, + "step": 12322 + }, + { + "epoch": 1.4644087938205586, + "grad_norm": 0.277911414834259, + "learning_rate": 5.179534121382164e-06, + "loss": 0.8118, + "num_tokens": 51507028902.0, + "step": 12323 + }, + { + "epoch": 1.4645276292335117, + "grad_norm": 0.27178966505158414, + "learning_rate": 5.178213209279059e-06, + "loss": 0.8535, + "num_tokens": 51511218501.0, + "step": 12324 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 0.26015495871888833, + "learning_rate": 5.176892512776402e-06, + "loss": 0.8009, + "num_tokens": 51515408119.0, + "step": 12325 + }, + { + "epoch": 1.4647653000594176, + "grad_norm": 0.2598602163768453, + "learning_rate": 5.1755720319231e-06, + "loss": 0.8014, + "num_tokens": 51519596351.0, + "step": 12326 + }, + { + "epoch": 1.4648841354723707, + "grad_norm": 0.2642408121871551, + "learning_rate": 5.17425176676806e-06, + "loss": 0.8024, + "num_tokens": 51523787101.0, + "step": 12327 + }, + { + "epoch": 1.4650029708853238, + "grad_norm": 0.2807981306962726, + "learning_rate": 5.172931717360176e-06, + "loss": 0.7968, + "num_tokens": 51527972099.0, + "step": 12328 + }, + { + "epoch": 1.465121806298277, + "grad_norm": 0.25781797463910505, + "learning_rate": 5.171611883748334e-06, + "loss": 0.7867, + "num_tokens": 51532135148.0, + "step": 12329 + }, + { + "epoch": 1.46524064171123, + "grad_norm": 0.2920777030406873, + "learning_rate": 5.1702922659814084e-06, + "loss": 0.8267, + "num_tokens": 51536325021.0, + "step": 12330 + }, + { + "epoch": 1.4653594771241831, + "grad_norm": 0.26723300234069014, + "learning_rate": 5.16897286410827e-06, + "loss": 0.7902, + "num_tokens": 51540513781.0, + "step": 12331 + }, + { + "epoch": 1.465478312537136, + "grad_norm": 0.2887043310934904, + "learning_rate": 5.16765367817778e-06, + "loss": 0.8144, + "num_tokens": 51544697159.0, + "step": 12332 + }, + { + "epoch": 1.465597147950089, + "grad_norm": 0.3082111543363137, + "learning_rate": 5.166334708238796e-06, + "loss": 0.835, + "num_tokens": 51548886028.0, + "step": 12333 + }, + { + "epoch": 1.4657159833630422, + "grad_norm": 0.29918368898615194, + "learning_rate": 5.165015954340165e-06, + "loss": 0.8292, + "num_tokens": 51553074589.0, + "step": 12334 + }, + { + "epoch": 1.4658348187759953, + "grad_norm": 0.31756594224573415, + "learning_rate": 5.163697416530717e-06, + "loss": 0.8251, + "num_tokens": 51557235791.0, + "step": 12335 + }, + { + "epoch": 1.4659536541889482, + "grad_norm": 0.31404040462956406, + "learning_rate": 5.162379094859289e-06, + "loss": 0.8259, + "num_tokens": 51561399290.0, + "step": 12336 + }, + { + "epoch": 1.4660724896019013, + "grad_norm": 0.29887567544724775, + "learning_rate": 5.1610609893747e-06, + "loss": 0.8251, + "num_tokens": 51565587532.0, + "step": 12337 + }, + { + "epoch": 1.4661913250148544, + "grad_norm": 0.2791012071150869, + "learning_rate": 5.1597431001257615e-06, + "loss": 0.8313, + "num_tokens": 51569776107.0, + "step": 12338 + }, + { + "epoch": 1.4663101604278075, + "grad_norm": 0.30988802457535464, + "learning_rate": 5.158425427161289e-06, + "loss": 0.7984, + "num_tokens": 51573965182.0, + "step": 12339 + }, + { + "epoch": 1.4664289958407606, + "grad_norm": 0.27213352654528944, + "learning_rate": 5.157107970530069e-06, + "loss": 0.8001, + "num_tokens": 51578148540.0, + "step": 12340 + }, + { + "epoch": 1.4665478312537137, + "grad_norm": 0.27275653905081454, + "learning_rate": 5.155790730280896e-06, + "loss": 0.7868, + "num_tokens": 51582336263.0, + "step": 12341 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.2531388542937173, + "learning_rate": 5.154473706462554e-06, + "loss": 0.8049, + "num_tokens": 51586514398.0, + "step": 12342 + }, + { + "epoch": 1.4667855020796197, + "grad_norm": 0.28430240028187537, + "learning_rate": 5.153156899123814e-06, + "loss": 0.7848, + "num_tokens": 51590655577.0, + "step": 12343 + }, + { + "epoch": 1.4669043374925728, + "grad_norm": 0.2774065307032907, + "learning_rate": 5.1518403083134425e-06, + "loss": 0.7797, + "num_tokens": 51594845004.0, + "step": 12344 + }, + { + "epoch": 1.4670231729055259, + "grad_norm": 0.27399924216242616, + "learning_rate": 5.150523934080199e-06, + "loss": 0.7939, + "num_tokens": 51599033659.0, + "step": 12345 + }, + { + "epoch": 1.467142008318479, + "grad_norm": 0.27082221311103294, + "learning_rate": 5.149207776472829e-06, + "loss": 0.8176, + "num_tokens": 51603222233.0, + "step": 12346 + }, + { + "epoch": 1.4672608437314318, + "grad_norm": 0.2787880137964963, + "learning_rate": 5.147891835540078e-06, + "loss": 0.8148, + "num_tokens": 51607404776.0, + "step": 12347 + }, + { + "epoch": 1.467379679144385, + "grad_norm": 0.2657047649395475, + "learning_rate": 5.1465761113306785e-06, + "loss": 0.7991, + "num_tokens": 51611593367.0, + "step": 12348 + }, + { + "epoch": 1.467498514557338, + "grad_norm": 0.25489767611117914, + "learning_rate": 5.145260603893358e-06, + "loss": 0.8377, + "num_tokens": 51615782498.0, + "step": 12349 + }, + { + "epoch": 1.4676173499702911, + "grad_norm": 0.27439080441906194, + "learning_rate": 5.1439453132768344e-06, + "loss": 0.7849, + "num_tokens": 51619942342.0, + "step": 12350 + }, + { + "epoch": 1.4677361853832442, + "grad_norm": 0.2696045914069341, + "learning_rate": 5.14263023952981e-06, + "loss": 0.8274, + "num_tokens": 51624131056.0, + "step": 12351 + }, + { + "epoch": 1.4678550207961973, + "grad_norm": 0.2583199678376875, + "learning_rate": 5.1413153827009955e-06, + "loss": 0.8302, + "num_tokens": 51628297814.0, + "step": 12352 + }, + { + "epoch": 1.4679738562091504, + "grad_norm": 0.27038550393083594, + "learning_rate": 5.14000074283908e-06, + "loss": 0.7967, + "num_tokens": 51632487905.0, + "step": 12353 + }, + { + "epoch": 1.4680926916221033, + "grad_norm": 0.25970976249317035, + "learning_rate": 5.138686319992752e-06, + "loss": 0.8223, + "num_tokens": 51636652249.0, + "step": 12354 + }, + { + "epoch": 1.4682115270350564, + "grad_norm": 0.2580113191592562, + "learning_rate": 5.137372114210688e-06, + "loss": 0.8271, + "num_tokens": 51640822773.0, + "step": 12355 + }, + { + "epoch": 1.4683303624480095, + "grad_norm": 0.24935925224041916, + "learning_rate": 5.136058125541554e-06, + "loss": 0.8012, + "num_tokens": 51645012268.0, + "step": 12356 + }, + { + "epoch": 1.4684491978609626, + "grad_norm": 0.25119458089758945, + "learning_rate": 5.134744354034016e-06, + "loss": 0.7989, + "num_tokens": 51649168303.0, + "step": 12357 + }, + { + "epoch": 1.4685680332739155, + "grad_norm": 0.2658271617385659, + "learning_rate": 5.133430799736721e-06, + "loss": 0.8159, + "num_tokens": 51653356456.0, + "step": 12358 + }, + { + "epoch": 1.4686868686868686, + "grad_norm": 0.2559111614388266, + "learning_rate": 5.1321174626983215e-06, + "loss": 0.8394, + "num_tokens": 51657544836.0, + "step": 12359 + }, + { + "epoch": 1.4688057040998217, + "grad_norm": 0.24233248495579823, + "learning_rate": 5.130804342967455e-06, + "loss": 0.8284, + "num_tokens": 51661733587.0, + "step": 12360 + }, + { + "epoch": 1.4689245395127748, + "grad_norm": 0.2944753996313036, + "learning_rate": 5.129491440592746e-06, + "loss": 0.8349, + "num_tokens": 51665923528.0, + "step": 12361 + }, + { + "epoch": 1.469043374925728, + "grad_norm": 0.2394887312807637, + "learning_rate": 5.128178755622817e-06, + "loss": 0.8241, + "num_tokens": 51670112578.0, + "step": 12362 + }, + { + "epoch": 1.469162210338681, + "grad_norm": 0.28741114213430047, + "learning_rate": 5.126866288106283e-06, + "loss": 0.8296, + "num_tokens": 51674301050.0, + "step": 12363 + }, + { + "epoch": 1.469281045751634, + "grad_norm": 0.2497487223113889, + "learning_rate": 5.125554038091742e-06, + "loss": 0.7956, + "num_tokens": 51678491877.0, + "step": 12364 + }, + { + "epoch": 1.469399881164587, + "grad_norm": 0.29542211986451433, + "learning_rate": 5.124242005627805e-06, + "loss": 0.8111, + "num_tokens": 51682680573.0, + "step": 12365 + }, + { + "epoch": 1.46951871657754, + "grad_norm": 0.2580569874163492, + "learning_rate": 5.122930190763051e-06, + "loss": 0.8066, + "num_tokens": 51686870369.0, + "step": 12366 + }, + { + "epoch": 1.4696375519904932, + "grad_norm": 0.2633800804411657, + "learning_rate": 5.12161859354606e-06, + "loss": 0.8395, + "num_tokens": 51691056688.0, + "step": 12367 + }, + { + "epoch": 1.4697563874034463, + "grad_norm": 0.2598995561663531, + "learning_rate": 5.120307214025409e-06, + "loss": 0.8072, + "num_tokens": 51695245582.0, + "step": 12368 + }, + { + "epoch": 1.4698752228163992, + "grad_norm": 0.26948208451238925, + "learning_rate": 5.118996052249663e-06, + "loss": 0.7885, + "num_tokens": 51699403857.0, + "step": 12369 + }, + { + "epoch": 1.4699940582293523, + "grad_norm": 0.2547137376619094, + "learning_rate": 5.117685108267376e-06, + "loss": 0.7929, + "num_tokens": 51703594006.0, + "step": 12370 + }, + { + "epoch": 1.4701128936423054, + "grad_norm": 0.2462965778238051, + "learning_rate": 5.1163743821270984e-06, + "loss": 0.7879, + "num_tokens": 51707782840.0, + "step": 12371 + }, + { + "epoch": 1.4702317290552585, + "grad_norm": 0.2694521810213588, + "learning_rate": 5.11506387387737e-06, + "loss": 0.7843, + "num_tokens": 51711973314.0, + "step": 12372 + }, + { + "epoch": 1.4703505644682116, + "grad_norm": 0.25830102814594974, + "learning_rate": 5.113753583566724e-06, + "loss": 0.8029, + "num_tokens": 51716162620.0, + "step": 12373 + }, + { + "epoch": 1.4704693998811647, + "grad_norm": 0.27985469548021963, + "learning_rate": 5.112443511243686e-06, + "loss": 0.8606, + "num_tokens": 51720353164.0, + "step": 12374 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.255050015910513, + "learning_rate": 5.111133656956768e-06, + "loss": 0.8407, + "num_tokens": 51724513116.0, + "step": 12375 + }, + { + "epoch": 1.4707070707070706, + "grad_norm": 0.2722714967498366, + "learning_rate": 5.109824020754486e-06, + "loss": 0.7933, + "num_tokens": 51728702439.0, + "step": 12376 + }, + { + "epoch": 1.4708259061200237, + "grad_norm": 0.253061513102618, + "learning_rate": 5.108514602685331e-06, + "loss": 0.8065, + "num_tokens": 51732861139.0, + "step": 12377 + }, + { + "epoch": 1.4709447415329768, + "grad_norm": 0.27277944314741076, + "learning_rate": 5.107205402797802e-06, + "loss": 0.8451, + "num_tokens": 51737050453.0, + "step": 12378 + }, + { + "epoch": 1.47106357694593, + "grad_norm": 0.2784478533654921, + "learning_rate": 5.10589642114038e-06, + "loss": 0.8454, + "num_tokens": 51741238862.0, + "step": 12379 + }, + { + "epoch": 1.471182412358883, + "grad_norm": 0.25828436315095876, + "learning_rate": 5.1045876577615426e-06, + "loss": 0.8414, + "num_tokens": 51745422442.0, + "step": 12380 + }, + { + "epoch": 1.471301247771836, + "grad_norm": 0.2699979383623424, + "learning_rate": 5.103279112709761e-06, + "loss": 0.8313, + "num_tokens": 51749581266.0, + "step": 12381 + }, + { + "epoch": 1.471420083184789, + "grad_norm": 0.25511610000416723, + "learning_rate": 5.101970786033486e-06, + "loss": 0.7833, + "num_tokens": 51753743036.0, + "step": 12382 + }, + { + "epoch": 1.471538918597742, + "grad_norm": 0.27187452025928827, + "learning_rate": 5.100662677781173e-06, + "loss": 0.7821, + "num_tokens": 51757901695.0, + "step": 12383 + }, + { + "epoch": 1.4716577540106952, + "grad_norm": 0.2767417505522647, + "learning_rate": 5.0993547880012694e-06, + "loss": 0.8148, + "num_tokens": 51762070023.0, + "step": 12384 + }, + { + "epoch": 1.4717765894236483, + "grad_norm": 0.2976127350981873, + "learning_rate": 5.098047116742209e-06, + "loss": 0.8389, + "num_tokens": 51766259372.0, + "step": 12385 + }, + { + "epoch": 1.4718954248366014, + "grad_norm": 0.28712845613864185, + "learning_rate": 5.096739664052423e-06, + "loss": 0.803, + "num_tokens": 51770448612.0, + "step": 12386 + }, + { + "epoch": 1.4720142602495543, + "grad_norm": 0.25781274616489447, + "learning_rate": 5.095432429980321e-06, + "loss": 0.7729, + "num_tokens": 51774637808.0, + "step": 12387 + }, + { + "epoch": 1.4721330956625074, + "grad_norm": 0.28425386721145635, + "learning_rate": 5.094125414574322e-06, + "loss": 0.8108, + "num_tokens": 51778789931.0, + "step": 12388 + }, + { + "epoch": 1.4722519310754605, + "grad_norm": 0.2565392285700318, + "learning_rate": 5.092818617882824e-06, + "loss": 0.8369, + "num_tokens": 51782979643.0, + "step": 12389 + }, + { + "epoch": 1.4723707664884136, + "grad_norm": 0.25450248279594795, + "learning_rate": 5.091512039954227e-06, + "loss": 0.7891, + "num_tokens": 51787167353.0, + "step": 12390 + }, + { + "epoch": 1.4724896019013667, + "grad_norm": 0.2595493892930113, + "learning_rate": 5.090205680836921e-06, + "loss": 0.8318, + "num_tokens": 51791357400.0, + "step": 12391 + }, + { + "epoch": 1.4726084373143196, + "grad_norm": 0.26601539364405696, + "learning_rate": 5.088899540579277e-06, + "loss": 0.8321, + "num_tokens": 51795515571.0, + "step": 12392 + }, + { + "epoch": 1.4727272727272727, + "grad_norm": 0.2612349291971247, + "learning_rate": 5.087593619229667e-06, + "loss": 0.8163, + "num_tokens": 51799676118.0, + "step": 12393 + }, + { + "epoch": 1.4728461081402258, + "grad_norm": 0.2560503999370168, + "learning_rate": 5.086287916836457e-06, + "loss": 0.8058, + "num_tokens": 51803865882.0, + "step": 12394 + }, + { + "epoch": 1.4729649435531789, + "grad_norm": 0.24709776615663281, + "learning_rate": 5.0849824334480005e-06, + "loss": 0.8095, + "num_tokens": 51808054863.0, + "step": 12395 + }, + { + "epoch": 1.473083778966132, + "grad_norm": 0.25477177686044783, + "learning_rate": 5.083677169112643e-06, + "loss": 0.8031, + "num_tokens": 51812211326.0, + "step": 12396 + }, + { + "epoch": 1.473202614379085, + "grad_norm": 0.2441964995725171, + "learning_rate": 5.082372123878723e-06, + "loss": 0.8019, + "num_tokens": 51816400349.0, + "step": 12397 + }, + { + "epoch": 1.4733214497920382, + "grad_norm": 0.27945828060447375, + "learning_rate": 5.081067297794572e-06, + "loss": 0.8467, + "num_tokens": 51820557707.0, + "step": 12398 + }, + { + "epoch": 1.473440285204991, + "grad_norm": 0.2467518706245154, + "learning_rate": 5.079762690908511e-06, + "loss": 0.7825, + "num_tokens": 51824716910.0, + "step": 12399 + }, + { + "epoch": 1.4735591206179441, + "grad_norm": 0.24454586380172924, + "learning_rate": 5.078458303268855e-06, + "loss": 0.7846, + "num_tokens": 51828906800.0, + "step": 12400 + }, + { + "epoch": 1.4736779560308972, + "grad_norm": 0.24730490795104376, + "learning_rate": 5.077154134923907e-06, + "loss": 0.7997, + "num_tokens": 51833096585.0, + "step": 12401 + }, + { + "epoch": 1.4737967914438503, + "grad_norm": 0.25960417666123153, + "learning_rate": 5.07585018592197e-06, + "loss": 0.8447, + "num_tokens": 51837264480.0, + "step": 12402 + }, + { + "epoch": 1.4739156268568032, + "grad_norm": 0.24109583919804164, + "learning_rate": 5.074546456311329e-06, + "loss": 0.8279, + "num_tokens": 51841436286.0, + "step": 12403 + }, + { + "epoch": 1.4740344622697563, + "grad_norm": 0.2579076972535814, + "learning_rate": 5.0732429461402675e-06, + "loss": 0.8053, + "num_tokens": 51845623965.0, + "step": 12404 + }, + { + "epoch": 1.4741532976827094, + "grad_norm": 0.24202176500864245, + "learning_rate": 5.071939655457059e-06, + "loss": 0.7748, + "num_tokens": 51849794556.0, + "step": 12405 + }, + { + "epoch": 1.4742721330956625, + "grad_norm": 0.25109876064307407, + "learning_rate": 5.0706365843099666e-06, + "loss": 0.7899, + "num_tokens": 51853964988.0, + "step": 12406 + }, + { + "epoch": 1.4743909685086156, + "grad_norm": 0.2562109490720699, + "learning_rate": 5.069333732747255e-06, + "loss": 0.7654, + "num_tokens": 51858128175.0, + "step": 12407 + }, + { + "epoch": 1.4745098039215687, + "grad_norm": 0.2607735857554361, + "learning_rate": 5.068031100817159e-06, + "loss": 0.8318, + "num_tokens": 51862314537.0, + "step": 12408 + }, + { + "epoch": 1.4746286393345218, + "grad_norm": 0.24711994344629576, + "learning_rate": 5.066728688567932e-06, + "loss": 0.8435, + "num_tokens": 51866502534.0, + "step": 12409 + }, + { + "epoch": 1.4747474747474747, + "grad_norm": 0.25119953916819876, + "learning_rate": 5.065426496047802e-06, + "loss": 0.7984, + "num_tokens": 51870671844.0, + "step": 12410 + }, + { + "epoch": 1.4748663101604278, + "grad_norm": 0.25919427709299525, + "learning_rate": 5.064124523304994e-06, + "loss": 0.7981, + "num_tokens": 51874859479.0, + "step": 12411 + }, + { + "epoch": 1.474985145573381, + "grad_norm": 0.258458690552079, + "learning_rate": 5.062822770387728e-06, + "loss": 0.8325, + "num_tokens": 51879047312.0, + "step": 12412 + }, + { + "epoch": 1.475103980986334, + "grad_norm": 0.2655975582305351, + "learning_rate": 5.061521237344205e-06, + "loss": 0.7977, + "num_tokens": 51883230351.0, + "step": 12413 + }, + { + "epoch": 1.4752228163992869, + "grad_norm": 0.25138126032503705, + "learning_rate": 5.06021992422263e-06, + "loss": 0.8375, + "num_tokens": 51887420899.0, + "step": 12414 + }, + { + "epoch": 1.47534165181224, + "grad_norm": 0.26393098420138617, + "learning_rate": 5.058918831071191e-06, + "loss": 0.7944, + "num_tokens": 51891611766.0, + "step": 12415 + }, + { + "epoch": 1.475460487225193, + "grad_norm": 0.27087219247162403, + "learning_rate": 5.057617957938077e-06, + "loss": 0.8051, + "num_tokens": 51895801602.0, + "step": 12416 + }, + { + "epoch": 1.4755793226381462, + "grad_norm": 0.26900071618665816, + "learning_rate": 5.056317304871465e-06, + "loss": 0.8012, + "num_tokens": 51899964059.0, + "step": 12417 + }, + { + "epoch": 1.4756981580510993, + "grad_norm": 0.24825403662032708, + "learning_rate": 5.055016871919518e-06, + "loss": 0.7726, + "num_tokens": 51904135005.0, + "step": 12418 + }, + { + "epoch": 1.4758169934640524, + "grad_norm": 0.25645665927027567, + "learning_rate": 5.053716659130396e-06, + "loss": 0.7921, + "num_tokens": 51908323306.0, + "step": 12419 + }, + { + "epoch": 1.4759358288770055, + "grad_norm": 0.29999441201015087, + "learning_rate": 5.052416666552249e-06, + "loss": 0.8027, + "num_tokens": 51912510984.0, + "step": 12420 + }, + { + "epoch": 1.4760546642899584, + "grad_norm": 0.2577696886247104, + "learning_rate": 5.0511168942332215e-06, + "loss": 0.8131, + "num_tokens": 51916646130.0, + "step": 12421 + }, + { + "epoch": 1.4761734997029115, + "grad_norm": 0.27836330455587993, + "learning_rate": 5.049817342221453e-06, + "loss": 0.7879, + "num_tokens": 51920835048.0, + "step": 12422 + }, + { + "epoch": 1.4762923351158646, + "grad_norm": 0.2531474470255279, + "learning_rate": 5.048518010565064e-06, + "loss": 0.8088, + "num_tokens": 51925023724.0, + "step": 12423 + }, + { + "epoch": 1.4764111705288177, + "grad_norm": 0.2749297547281328, + "learning_rate": 5.047218899312176e-06, + "loss": 0.8073, + "num_tokens": 51929213409.0, + "step": 12424 + }, + { + "epoch": 1.4765300059417705, + "grad_norm": 0.25484017235355505, + "learning_rate": 5.045920008510898e-06, + "loss": 0.8625, + "num_tokens": 51933400949.0, + "step": 12425 + }, + { + "epoch": 1.4766488413547236, + "grad_norm": 0.2757916245891272, + "learning_rate": 5.044621338209336e-06, + "loss": 0.8222, + "num_tokens": 51937589576.0, + "step": 12426 + }, + { + "epoch": 1.4767676767676767, + "grad_norm": 0.27617162176277266, + "learning_rate": 5.043322888455579e-06, + "loss": 0.8235, + "num_tokens": 51941778039.0, + "step": 12427 + }, + { + "epoch": 1.4768865121806298, + "grad_norm": 0.2569901026945433, + "learning_rate": 5.042024659297716e-06, + "loss": 0.7812, + "num_tokens": 51945966263.0, + "step": 12428 + }, + { + "epoch": 1.477005347593583, + "grad_norm": 0.25889721219211237, + "learning_rate": 5.0407266507838246e-06, + "loss": 0.8137, + "num_tokens": 51950125275.0, + "step": 12429 + }, + { + "epoch": 1.477124183006536, + "grad_norm": 0.24627726771901018, + "learning_rate": 5.039428862961974e-06, + "loss": 0.8118, + "num_tokens": 51954314145.0, + "step": 12430 + }, + { + "epoch": 1.4772430184194891, + "grad_norm": 0.28658616295272626, + "learning_rate": 5.0381312958802254e-06, + "loss": 0.7856, + "num_tokens": 51958475934.0, + "step": 12431 + }, + { + "epoch": 1.477361853832442, + "grad_norm": 0.2532843876653652, + "learning_rate": 5.036833949586632e-06, + "loss": 0.8407, + "num_tokens": 51962663525.0, + "step": 12432 + }, + { + "epoch": 1.477480689245395, + "grad_norm": 0.2602648895732263, + "learning_rate": 5.035536824129241e-06, + "loss": 0.769, + "num_tokens": 51966852356.0, + "step": 12433 + }, + { + "epoch": 1.4775995246583482, + "grad_norm": 0.26438107121493437, + "learning_rate": 5.034239919556085e-06, + "loss": 0.7898, + "num_tokens": 51971043169.0, + "step": 12434 + }, + { + "epoch": 1.4777183600713013, + "grad_norm": 0.25494978070876334, + "learning_rate": 5.032943235915199e-06, + "loss": 0.8074, + "num_tokens": 51975232563.0, + "step": 12435 + }, + { + "epoch": 1.4778371954842542, + "grad_norm": 0.2658273270026769, + "learning_rate": 5.031646773254597e-06, + "loss": 0.7576, + "num_tokens": 51979394685.0, + "step": 12436 + }, + { + "epoch": 1.4779560308972073, + "grad_norm": 0.23887164825627702, + "learning_rate": 5.030350531622297e-06, + "loss": 0.7883, + "num_tokens": 51983542045.0, + "step": 12437 + }, + { + "epoch": 1.4780748663101604, + "grad_norm": 0.27403357751414625, + "learning_rate": 5.0290545110662995e-06, + "loss": 0.799, + "num_tokens": 51987700276.0, + "step": 12438 + }, + { + "epoch": 1.4781937017231135, + "grad_norm": 0.2672462417810102, + "learning_rate": 5.027758711634605e-06, + "loss": 0.8271, + "num_tokens": 51991890267.0, + "step": 12439 + }, + { + "epoch": 1.4783125371360666, + "grad_norm": 0.2632560559654809, + "learning_rate": 5.0264631333751925e-06, + "loss": 0.7875, + "num_tokens": 51996072341.0, + "step": 12440 + }, + { + "epoch": 1.4784313725490197, + "grad_norm": 0.2584124443266229, + "learning_rate": 5.025167776336048e-06, + "loss": 0.7904, + "num_tokens": 52000259850.0, + "step": 12441 + }, + { + "epoch": 1.4785502079619728, + "grad_norm": 0.25359557089609097, + "learning_rate": 5.023872640565144e-06, + "loss": 0.7717, + "num_tokens": 52004449493.0, + "step": 12442 + }, + { + "epoch": 1.4786690433749257, + "grad_norm": 0.2627063376328632, + "learning_rate": 5.022577726110441e-06, + "loss": 0.8304, + "num_tokens": 52008636567.0, + "step": 12443 + }, + { + "epoch": 1.4787878787878788, + "grad_norm": 0.23926506132931577, + "learning_rate": 5.021283033019899e-06, + "loss": 0.8343, + "num_tokens": 52012823124.0, + "step": 12444 + }, + { + "epoch": 1.4789067142008319, + "grad_norm": 0.24451429140794656, + "learning_rate": 5.0199885613414565e-06, + "loss": 0.7948, + "num_tokens": 52017010001.0, + "step": 12445 + }, + { + "epoch": 1.479025549613785, + "grad_norm": 0.27147457465271907, + "learning_rate": 5.018694311123057e-06, + "loss": 0.7668, + "num_tokens": 52021199565.0, + "step": 12446 + }, + { + "epoch": 1.4791443850267378, + "grad_norm": 0.24656198736317372, + "learning_rate": 5.0174002824126274e-06, + "loss": 0.8336, + "num_tokens": 52025356728.0, + "step": 12447 + }, + { + "epoch": 1.479263220439691, + "grad_norm": 0.24979936745395, + "learning_rate": 5.016106475258096e-06, + "loss": 0.819, + "num_tokens": 52029525402.0, + "step": 12448 + }, + { + "epoch": 1.479382055852644, + "grad_norm": 0.246143523041174, + "learning_rate": 5.014812889707377e-06, + "loss": 0.7621, + "num_tokens": 52033709185.0, + "step": 12449 + }, + { + "epoch": 1.4795008912655971, + "grad_norm": 0.24366116801667156, + "learning_rate": 5.01351952580837e-06, + "loss": 0.8262, + "num_tokens": 52037868291.0, + "step": 12450 + }, + { + "epoch": 1.4796197266785502, + "grad_norm": 0.2459753348255517, + "learning_rate": 5.012226383608976e-06, + "loss": 0.8057, + "num_tokens": 52042031783.0, + "step": 12451 + }, + { + "epoch": 1.4797385620915033, + "grad_norm": 0.23946237954028765, + "learning_rate": 5.010933463157085e-06, + "loss": 0.7758, + "num_tokens": 52046220702.0, + "step": 12452 + }, + { + "epoch": 1.4798573975044564, + "grad_norm": 0.234774171408165, + "learning_rate": 5.009640764500573e-06, + "loss": 0.8322, + "num_tokens": 52050371155.0, + "step": 12453 + }, + { + "epoch": 1.4799762329174093, + "grad_norm": 0.2648669467808719, + "learning_rate": 5.0083482876873235e-06, + "loss": 0.8032, + "num_tokens": 52054561391.0, + "step": 12454 + }, + { + "epoch": 1.4800950683303624, + "grad_norm": 0.2566096304192391, + "learning_rate": 5.007056032765191e-06, + "loss": 0.7833, + "num_tokens": 52058747823.0, + "step": 12455 + }, + { + "epoch": 1.4802139037433155, + "grad_norm": 0.2519332515656604, + "learning_rate": 5.005763999782039e-06, + "loss": 0.7732, + "num_tokens": 52062935956.0, + "step": 12456 + }, + { + "epoch": 1.4803327391562686, + "grad_norm": 0.2556471142963531, + "learning_rate": 5.004472188785712e-06, + "loss": 0.7704, + "num_tokens": 52067122444.0, + "step": 12457 + }, + { + "epoch": 1.4804515745692215, + "grad_norm": 0.2538675305680675, + "learning_rate": 5.0031805998240515e-06, + "loss": 0.8007, + "num_tokens": 52071310130.0, + "step": 12458 + }, + { + "epoch": 1.4805704099821746, + "grad_norm": 0.24918856172479786, + "learning_rate": 5.0018892329448875e-06, + "loss": 0.8254, + "num_tokens": 52075500535.0, + "step": 12459 + }, + { + "epoch": 1.4806892453951277, + "grad_norm": 0.24242811041390158, + "learning_rate": 5.000598088196047e-06, + "loss": 0.7911, + "num_tokens": 52079690657.0, + "step": 12460 + }, + { + "epoch": 1.4808080808080808, + "grad_norm": 0.22923452237754172, + "learning_rate": 4.999307165625343e-06, + "loss": 0.7992, + "num_tokens": 52083847554.0, + "step": 12461 + }, + { + "epoch": 1.480926916221034, + "grad_norm": 0.2621336160145316, + "learning_rate": 4.998016465280584e-06, + "loss": 0.8339, + "num_tokens": 52088012625.0, + "step": 12462 + }, + { + "epoch": 1.481045751633987, + "grad_norm": 0.25252709324698147, + "learning_rate": 4.996725987209567e-06, + "loss": 0.8001, + "num_tokens": 52092201975.0, + "step": 12463 + }, + { + "epoch": 1.48116458704694, + "grad_norm": 0.24944126113535758, + "learning_rate": 4.995435731460085e-06, + "loss": 0.8273, + "num_tokens": 52096391505.0, + "step": 12464 + }, + { + "epoch": 1.481283422459893, + "grad_norm": 0.25316112061744706, + "learning_rate": 4.994145698079923e-06, + "loss": 0.8161, + "num_tokens": 52100560573.0, + "step": 12465 + }, + { + "epoch": 1.481402257872846, + "grad_norm": 0.2585588498146348, + "learning_rate": 4.992855887116847e-06, + "loss": 0.8472, + "num_tokens": 52104740407.0, + "step": 12466 + }, + { + "epoch": 1.4815210932857992, + "grad_norm": 0.25825060537270234, + "learning_rate": 4.991566298618631e-06, + "loss": 0.8435, + "num_tokens": 52108916366.0, + "step": 12467 + }, + { + "epoch": 1.4816399286987523, + "grad_norm": 0.25944574201276555, + "learning_rate": 4.9902769326330284e-06, + "loss": 0.77, + "num_tokens": 52113105638.0, + "step": 12468 + }, + { + "epoch": 1.4817587641117052, + "grad_norm": 0.24787913257985542, + "learning_rate": 4.988987789207791e-06, + "loss": 0.791, + "num_tokens": 52117259024.0, + "step": 12469 + }, + { + "epoch": 1.4818775995246583, + "grad_norm": 0.2637721857944257, + "learning_rate": 4.987698868390663e-06, + "loss": 0.8399, + "num_tokens": 52121448262.0, + "step": 12470 + }, + { + "epoch": 1.4819964349376114, + "grad_norm": 0.23632626732160095, + "learning_rate": 4.98641017022937e-06, + "loss": 0.7607, + "num_tokens": 52125611870.0, + "step": 12471 + }, + { + "epoch": 1.4821152703505645, + "grad_norm": 0.24976239440090242, + "learning_rate": 4.9851216947716394e-06, + "loss": 0.7892, + "num_tokens": 52129801802.0, + "step": 12472 + }, + { + "epoch": 1.4822341057635176, + "grad_norm": 0.2464241357804452, + "learning_rate": 4.983833442065191e-06, + "loss": 0.8293, + "num_tokens": 52133990771.0, + "step": 12473 + }, + { + "epoch": 1.4823529411764707, + "grad_norm": 0.2467940178229404, + "learning_rate": 4.982545412157731e-06, + "loss": 0.807, + "num_tokens": 52138178836.0, + "step": 12474 + }, + { + "epoch": 1.4824717765894238, + "grad_norm": 0.23507422479185103, + "learning_rate": 4.9812576050969644e-06, + "loss": 0.8025, + "num_tokens": 52142368114.0, + "step": 12475 + }, + { + "epoch": 1.4825906120023766, + "grad_norm": 0.25767634741488843, + "learning_rate": 4.979970020930575e-06, + "loss": 0.806, + "num_tokens": 52146529956.0, + "step": 12476 + }, + { + "epoch": 1.4827094474153297, + "grad_norm": 0.2485543009702129, + "learning_rate": 4.9786826597062495e-06, + "loss": 0.805, + "num_tokens": 52150719035.0, + "step": 12477 + }, + { + "epoch": 1.4828282828282828, + "grad_norm": 0.25861896752696656, + "learning_rate": 4.977395521471661e-06, + "loss": 0.8043, + "num_tokens": 52154900251.0, + "step": 12478 + }, + { + "epoch": 1.482947118241236, + "grad_norm": 0.26498104150379326, + "learning_rate": 4.976108606274482e-06, + "loss": 0.837, + "num_tokens": 52159089804.0, + "step": 12479 + }, + { + "epoch": 1.483065953654189, + "grad_norm": 0.2601610635632622, + "learning_rate": 4.974821914162372e-06, + "loss": 0.8316, + "num_tokens": 52163278413.0, + "step": 12480 + }, + { + "epoch": 1.483184789067142, + "grad_norm": 0.24730096804703594, + "learning_rate": 4.973535445182974e-06, + "loss": 0.7865, + "num_tokens": 52167468275.0, + "step": 12481 + }, + { + "epoch": 1.483303624480095, + "grad_norm": 0.2537087585846426, + "learning_rate": 4.9722491993839345e-06, + "loss": 0.803, + "num_tokens": 52171639373.0, + "step": 12482 + }, + { + "epoch": 1.483422459893048, + "grad_norm": 0.2610541643713612, + "learning_rate": 4.970963176812889e-06, + "loss": 0.809, + "num_tokens": 52175807673.0, + "step": 12483 + }, + { + "epoch": 1.4835412953060012, + "grad_norm": 0.2402148303452602, + "learning_rate": 4.96967737751746e-06, + "loss": 0.8194, + "num_tokens": 52179998689.0, + "step": 12484 + }, + { + "epoch": 1.4836601307189543, + "grad_norm": 0.27524920220083987, + "learning_rate": 4.968391801545267e-06, + "loss": 0.8093, + "num_tokens": 52184175429.0, + "step": 12485 + }, + { + "epoch": 1.4837789661319074, + "grad_norm": 0.2430658920134106, + "learning_rate": 4.96710644894392e-06, + "loss": 0.8255, + "num_tokens": 52188363885.0, + "step": 12486 + }, + { + "epoch": 1.4838978015448605, + "grad_norm": 0.27107508890765786, + "learning_rate": 4.965821319761018e-06, + "loss": 0.7797, + "num_tokens": 52192529350.0, + "step": 12487 + }, + { + "epoch": 1.4840166369578134, + "grad_norm": 0.27698804139911043, + "learning_rate": 4.964536414044156e-06, + "loss": 0.8058, + "num_tokens": 52196716592.0, + "step": 12488 + }, + { + "epoch": 1.4841354723707665, + "grad_norm": 0.2663413773958086, + "learning_rate": 4.963251731840916e-06, + "loss": 0.8091, + "num_tokens": 52200876978.0, + "step": 12489 + }, + { + "epoch": 1.4842543077837196, + "grad_norm": 0.26460838240939416, + "learning_rate": 4.961967273198875e-06, + "loss": 0.8415, + "num_tokens": 52205036102.0, + "step": 12490 + }, + { + "epoch": 1.4843731431966727, + "grad_norm": 0.2686435938717817, + "learning_rate": 4.9606830381656e-06, + "loss": 0.8068, + "num_tokens": 52209204391.0, + "step": 12491 + }, + { + "epoch": 1.4844919786096256, + "grad_norm": 0.2345317930429912, + "learning_rate": 4.959399026788653e-06, + "loss": 0.8093, + "num_tokens": 52213334159.0, + "step": 12492 + }, + { + "epoch": 1.4846108140225787, + "grad_norm": 0.25304144951204033, + "learning_rate": 4.9581152391155835e-06, + "loss": 0.8055, + "num_tokens": 52217500408.0, + "step": 12493 + }, + { + "epoch": 1.4847296494355318, + "grad_norm": 0.2558721626012922, + "learning_rate": 4.9568316751939345e-06, + "loss": 0.8157, + "num_tokens": 52221682819.0, + "step": 12494 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 0.2632465547888225, + "learning_rate": 4.9555483350712405e-06, + "loss": 0.7691, + "num_tokens": 52225844817.0, + "step": 12495 + }, + { + "epoch": 1.484967320261438, + "grad_norm": 0.25089115205721924, + "learning_rate": 4.954265218795033e-06, + "loss": 0.791, + "num_tokens": 52230013335.0, + "step": 12496 + }, + { + "epoch": 1.485086155674391, + "grad_norm": 0.25903794669154623, + "learning_rate": 4.952982326412818e-06, + "loss": 0.8314, + "num_tokens": 52234180003.0, + "step": 12497 + }, + { + "epoch": 1.4852049910873442, + "grad_norm": 0.26721592965867325, + "learning_rate": 4.951699657972116e-06, + "loss": 0.8243, + "num_tokens": 52238368692.0, + "step": 12498 + }, + { + "epoch": 1.485323826500297, + "grad_norm": 0.2484481684961108, + "learning_rate": 4.950417213520427e-06, + "loss": 0.8119, + "num_tokens": 52242558480.0, + "step": 12499 + }, + { + "epoch": 1.4854426619132501, + "grad_norm": 0.2757211603536436, + "learning_rate": 4.94913499310524e-06, + "loss": 0.8286, + "num_tokens": 52246748561.0, + "step": 12500 + }, + { + "epoch": 1.4855614973262032, + "grad_norm": 0.2509452426417618, + "learning_rate": 4.947852996774048e-06, + "loss": 0.7982, + "num_tokens": 52250913536.0, + "step": 12501 + }, + { + "epoch": 1.4856803327391563, + "grad_norm": 0.25391874684011106, + "learning_rate": 4.946571224574319e-06, + "loss": 0.8187, + "num_tokens": 52255101390.0, + "step": 12502 + }, + { + "epoch": 1.4857991681521092, + "grad_norm": 0.27044179815662067, + "learning_rate": 4.945289676553524e-06, + "loss": 0.7945, + "num_tokens": 52259272612.0, + "step": 12503 + }, + { + "epoch": 1.4859180035650623, + "grad_norm": 0.25054773484703863, + "learning_rate": 4.944008352759121e-06, + "loss": 0.7823, + "num_tokens": 52263430689.0, + "step": 12504 + }, + { + "epoch": 1.4860368389780154, + "grad_norm": 0.25496201766363297, + "learning_rate": 4.942727253238568e-06, + "loss": 0.8179, + "num_tokens": 52267610867.0, + "step": 12505 + }, + { + "epoch": 1.4861556743909685, + "grad_norm": 0.2549079963784509, + "learning_rate": 4.9414463780393075e-06, + "loss": 0.8284, + "num_tokens": 52271800116.0, + "step": 12506 + }, + { + "epoch": 1.4862745098039216, + "grad_norm": 0.24537945264516384, + "learning_rate": 4.940165727208769e-06, + "loss": 0.8515, + "num_tokens": 52275988548.0, + "step": 12507 + }, + { + "epoch": 1.4863933452168747, + "grad_norm": 0.2487077006262781, + "learning_rate": 4.938885300794383e-06, + "loss": 0.7985, + "num_tokens": 52280150456.0, + "step": 12508 + }, + { + "epoch": 1.4865121806298278, + "grad_norm": 0.2539692289777046, + "learning_rate": 4.9376050988435665e-06, + "loss": 0.7933, + "num_tokens": 52284340286.0, + "step": 12509 + }, + { + "epoch": 1.4866310160427807, + "grad_norm": 0.257259724180951, + "learning_rate": 4.936325121403728e-06, + "loss": 0.7915, + "num_tokens": 52288525562.0, + "step": 12510 + }, + { + "epoch": 1.4867498514557338, + "grad_norm": 0.2643105261830404, + "learning_rate": 4.9350453685222776e-06, + "loss": 0.8154, + "num_tokens": 52292715110.0, + "step": 12511 + }, + { + "epoch": 1.486868686868687, + "grad_norm": 0.2381844698676453, + "learning_rate": 4.933765840246601e-06, + "loss": 0.8015, + "num_tokens": 52296888511.0, + "step": 12512 + }, + { + "epoch": 1.48698752228164, + "grad_norm": 0.27664928537236677, + "learning_rate": 4.9324865366240855e-06, + "loss": 0.7979, + "num_tokens": 52301077927.0, + "step": 12513 + }, + { + "epoch": 1.4871063576945929, + "grad_norm": 0.27393047619577926, + "learning_rate": 4.931207457702108e-06, + "loss": 0.7911, + "num_tokens": 52305239079.0, + "step": 12514 + }, + { + "epoch": 1.487225193107546, + "grad_norm": 0.26973617699817914, + "learning_rate": 4.9299286035280375e-06, + "loss": 0.8011, + "num_tokens": 52309430045.0, + "step": 12515 + }, + { + "epoch": 1.487344028520499, + "grad_norm": 0.25762124757004873, + "learning_rate": 4.9286499741492344e-06, + "loss": 0.8513, + "num_tokens": 52313619786.0, + "step": 12516 + }, + { + "epoch": 1.4874628639334522, + "grad_norm": 0.2791249716235767, + "learning_rate": 4.927371569613051e-06, + "loss": 0.8262, + "num_tokens": 52317770885.0, + "step": 12517 + }, + { + "epoch": 1.4875816993464053, + "grad_norm": 0.24335690853726097, + "learning_rate": 4.9260933899668294e-06, + "loss": 0.8425, + "num_tokens": 52321925467.0, + "step": 12518 + }, + { + "epoch": 1.4877005347593584, + "grad_norm": 0.2450023406386117, + "learning_rate": 4.924815435257907e-06, + "loss": 0.8045, + "num_tokens": 52326090362.0, + "step": 12519 + }, + { + "epoch": 1.4878193701723115, + "grad_norm": 0.26187054461898623, + "learning_rate": 4.923537705533609e-06, + "loss": 0.8175, + "num_tokens": 52330278838.0, + "step": 12520 + }, + { + "epoch": 1.4879382055852644, + "grad_norm": 0.24751611208529267, + "learning_rate": 4.922260200841255e-06, + "loss": 0.8035, + "num_tokens": 52334468138.0, + "step": 12521 + }, + { + "epoch": 1.4880570409982175, + "grad_norm": 0.24345603398709212, + "learning_rate": 4.920982921228159e-06, + "loss": 0.7938, + "num_tokens": 52338656341.0, + "step": 12522 + }, + { + "epoch": 1.4881758764111706, + "grad_norm": 0.26180183729275214, + "learning_rate": 4.919705866741611e-06, + "loss": 0.8348, + "num_tokens": 52342844615.0, + "step": 12523 + }, + { + "epoch": 1.4882947118241237, + "grad_norm": 0.2649737288398333, + "learning_rate": 4.9184290374289176e-06, + "loss": 0.7993, + "num_tokens": 52347011825.0, + "step": 12524 + }, + { + "epoch": 1.4884135472370765, + "grad_norm": 0.24808877628567602, + "learning_rate": 4.917152433337357e-06, + "loss": 0.8142, + "num_tokens": 52351201720.0, + "step": 12525 + }, + { + "epoch": 1.4885323826500296, + "grad_norm": 0.27990796406272195, + "learning_rate": 4.91587605451421e-06, + "loss": 0.8341, + "num_tokens": 52355377410.0, + "step": 12526 + }, + { + "epoch": 1.4886512180629827, + "grad_norm": 0.26668945586529264, + "learning_rate": 4.9145999010067465e-06, + "loss": 0.8145, + "num_tokens": 52359566589.0, + "step": 12527 + }, + { + "epoch": 1.4887700534759358, + "grad_norm": 0.2626641804411455, + "learning_rate": 4.91332397286222e-06, + "loss": 0.824, + "num_tokens": 52363754365.0, + "step": 12528 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 0.2763762543691425, + "learning_rate": 4.912048270127886e-06, + "loss": 0.8264, + "num_tokens": 52367943735.0, + "step": 12529 + }, + { + "epoch": 1.489007724301842, + "grad_norm": 0.2529585605242149, + "learning_rate": 4.9107727928509895e-06, + "loss": 0.7983, + "num_tokens": 52372133817.0, + "step": 12530 + }, + { + "epoch": 1.4891265597147951, + "grad_norm": 0.28989678994796664, + "learning_rate": 4.9094975410787635e-06, + "loss": 0.8051, + "num_tokens": 52376298743.0, + "step": 12531 + }, + { + "epoch": 1.489245395127748, + "grad_norm": 0.2950184389405639, + "learning_rate": 4.908222514858441e-06, + "loss": 0.7696, + "num_tokens": 52380457373.0, + "step": 12532 + }, + { + "epoch": 1.489364230540701, + "grad_norm": 0.24662889882187136, + "learning_rate": 4.9069477142372315e-06, + "loss": 0.7956, + "num_tokens": 52384614131.0, + "step": 12533 + }, + { + "epoch": 1.4894830659536542, + "grad_norm": 0.27868794721098855, + "learning_rate": 4.905673139262349e-06, + "loss": 0.7854, + "num_tokens": 52388791812.0, + "step": 12534 + }, + { + "epoch": 1.4896019013666073, + "grad_norm": 0.26472900600627647, + "learning_rate": 4.904398789980997e-06, + "loss": 0.8426, + "num_tokens": 52392979097.0, + "step": 12535 + }, + { + "epoch": 1.4897207367795602, + "grad_norm": 0.24886574679597956, + "learning_rate": 4.903124666440364e-06, + "loss": 0.7993, + "num_tokens": 52397143189.0, + "step": 12536 + }, + { + "epoch": 1.4898395721925133, + "grad_norm": 0.2583663206215984, + "learning_rate": 4.901850768687644e-06, + "loss": 0.7899, + "num_tokens": 52401333534.0, + "step": 12537 + }, + { + "epoch": 1.4899584076054664, + "grad_norm": 0.2524545839005201, + "learning_rate": 4.9005770967700055e-06, + "loss": 0.8259, + "num_tokens": 52405522024.0, + "step": 12538 + }, + { + "epoch": 1.4900772430184195, + "grad_norm": 0.2653792170049748, + "learning_rate": 4.899303650734619e-06, + "loss": 0.8207, + "num_tokens": 52409711614.0, + "step": 12539 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 0.25330661099723045, + "learning_rate": 4.898030430628647e-06, + "loss": 0.7752, + "num_tokens": 52413902376.0, + "step": 12540 + }, + { + "epoch": 1.4903149138443257, + "grad_norm": 0.25291135767115075, + "learning_rate": 4.896757436499238e-06, + "loss": 0.812, + "num_tokens": 52418065533.0, + "step": 12541 + }, + { + "epoch": 1.4904337492572788, + "grad_norm": 0.2540017945203903, + "learning_rate": 4.895484668393537e-06, + "loss": 0.8249, + "num_tokens": 52422255384.0, + "step": 12542 + }, + { + "epoch": 1.4905525846702317, + "grad_norm": 0.24860627320007841, + "learning_rate": 4.894212126358679e-06, + "loss": 0.8069, + "num_tokens": 52426398755.0, + "step": 12543 + }, + { + "epoch": 1.4906714200831848, + "grad_norm": 0.2534468605923312, + "learning_rate": 4.89293981044179e-06, + "loss": 0.832, + "num_tokens": 52430585679.0, + "step": 12544 + }, + { + "epoch": 1.4907902554961379, + "grad_norm": 0.24747379531967692, + "learning_rate": 4.891667720689988e-06, + "loss": 0.8087, + "num_tokens": 52434775941.0, + "step": 12545 + }, + { + "epoch": 1.490909090909091, + "grad_norm": 0.26104547060413014, + "learning_rate": 4.890395857150384e-06, + "loss": 0.7594, + "num_tokens": 52438963534.0, + "step": 12546 + }, + { + "epoch": 1.4910279263220438, + "grad_norm": 0.2576044680891477, + "learning_rate": 4.8891242198700775e-06, + "loss": 0.8174, + "num_tokens": 52443144859.0, + "step": 12547 + }, + { + "epoch": 1.491146761734997, + "grad_norm": 0.26074273910185536, + "learning_rate": 4.887852808896163e-06, + "loss": 0.7796, + "num_tokens": 52447333065.0, + "step": 12548 + }, + { + "epoch": 1.49126559714795, + "grad_norm": 0.25163844353383513, + "learning_rate": 4.886581624275724e-06, + "loss": 0.8005, + "num_tokens": 52451522808.0, + "step": 12549 + }, + { + "epoch": 1.4913844325609031, + "grad_norm": 0.24351984147765338, + "learning_rate": 4.885310666055837e-06, + "loss": 0.831, + "num_tokens": 52455712066.0, + "step": 12550 + }, + { + "epoch": 1.4915032679738562, + "grad_norm": 0.23090024817477037, + "learning_rate": 4.884039934283572e-06, + "loss": 0.8318, + "num_tokens": 52459893118.0, + "step": 12551 + }, + { + "epoch": 1.4916221033868093, + "grad_norm": 0.2459196453317174, + "learning_rate": 4.882769429005985e-06, + "loss": 0.8274, + "num_tokens": 52464082104.0, + "step": 12552 + }, + { + "epoch": 1.4917409387997624, + "grad_norm": 0.2615634229904023, + "learning_rate": 4.88149915027013e-06, + "loss": 0.8421, + "num_tokens": 52468202895.0, + "step": 12553 + }, + { + "epoch": 1.4918597742127153, + "grad_norm": 0.2627567815374962, + "learning_rate": 4.880229098123052e-06, + "loss": 0.8028, + "num_tokens": 52472389160.0, + "step": 12554 + }, + { + "epoch": 1.4919786096256684, + "grad_norm": 0.24103225276531634, + "learning_rate": 4.8789592726117745e-06, + "loss": 0.7885, + "num_tokens": 52476579107.0, + "step": 12555 + }, + { + "epoch": 1.4920974450386215, + "grad_norm": 0.24745882007184872, + "learning_rate": 4.877689673783335e-06, + "loss": 0.8085, + "num_tokens": 52480751487.0, + "step": 12556 + }, + { + "epoch": 1.4922162804515746, + "grad_norm": 0.2436976111776566, + "learning_rate": 4.876420301684747e-06, + "loss": 0.7747, + "num_tokens": 52484941476.0, + "step": 12557 + }, + { + "epoch": 1.4923351158645275, + "grad_norm": 0.2630625371535999, + "learning_rate": 4.875151156363019e-06, + "loss": 0.8005, + "num_tokens": 52489129726.0, + "step": 12558 + }, + { + "epoch": 1.4924539512774806, + "grad_norm": 0.2465547861483293, + "learning_rate": 4.873882237865155e-06, + "loss": 0.8434, + "num_tokens": 52493316695.0, + "step": 12559 + }, + { + "epoch": 1.4925727866904337, + "grad_norm": 0.26712330113238764, + "learning_rate": 4.872613546238142e-06, + "loss": 0.8194, + "num_tokens": 52497505121.0, + "step": 12560 + }, + { + "epoch": 1.4926916221033868, + "grad_norm": 0.2432551413789617, + "learning_rate": 4.871345081528964e-06, + "loss": 0.7641, + "num_tokens": 52501696395.0, + "step": 12561 + }, + { + "epoch": 1.49281045751634, + "grad_norm": 0.25292349130764463, + "learning_rate": 4.870076843784602e-06, + "loss": 0.8022, + "num_tokens": 52505886096.0, + "step": 12562 + }, + { + "epoch": 1.492929292929293, + "grad_norm": 0.25810162171241874, + "learning_rate": 4.868808833052023e-06, + "loss": 0.8248, + "num_tokens": 52510073750.0, + "step": 12563 + }, + { + "epoch": 1.493048128342246, + "grad_norm": 0.25272331855869484, + "learning_rate": 4.867541049378179e-06, + "loss": 0.8118, + "num_tokens": 52514211608.0, + "step": 12564 + }, + { + "epoch": 1.493166963755199, + "grad_norm": 0.28823054467321074, + "learning_rate": 4.8662734928100255e-06, + "loss": 0.797, + "num_tokens": 52518400357.0, + "step": 12565 + }, + { + "epoch": 1.493285799168152, + "grad_norm": 0.2625308345922747, + "learning_rate": 4.865006163394503e-06, + "loss": 0.8337, + "num_tokens": 52522589407.0, + "step": 12566 + }, + { + "epoch": 1.4934046345811052, + "grad_norm": 0.27212417143261175, + "learning_rate": 4.863739061178542e-06, + "loss": 0.8135, + "num_tokens": 52526777915.0, + "step": 12567 + }, + { + "epoch": 1.4935234699940583, + "grad_norm": 0.2635768443441733, + "learning_rate": 4.862472186209076e-06, + "loss": 0.7834, + "num_tokens": 52530967264.0, + "step": 12568 + }, + { + "epoch": 1.4936423054070114, + "grad_norm": 0.268727308976778, + "learning_rate": 4.861205538533015e-06, + "loss": 0.7829, + "num_tokens": 52535156451.0, + "step": 12569 + }, + { + "epoch": 1.4937611408199643, + "grad_norm": 0.2414721762905942, + "learning_rate": 4.859939118197268e-06, + "loss": 0.7913, + "num_tokens": 52539346261.0, + "step": 12570 + }, + { + "epoch": 1.4938799762329174, + "grad_norm": 0.2596194751107056, + "learning_rate": 4.858672925248734e-06, + "loss": 0.8124, + "num_tokens": 52543524950.0, + "step": 12571 + }, + { + "epoch": 1.4939988116458705, + "grad_norm": 0.2503428727369517, + "learning_rate": 4.857406959734308e-06, + "loss": 0.7733, + "num_tokens": 52547673849.0, + "step": 12572 + }, + { + "epoch": 1.4941176470588236, + "grad_norm": 0.252523822611508, + "learning_rate": 4.85614122170087e-06, + "loss": 0.8165, + "num_tokens": 52551862812.0, + "step": 12573 + }, + { + "epoch": 1.4942364824717767, + "grad_norm": 0.23800298434453862, + "learning_rate": 4.854875711195296e-06, + "loss": 0.8027, + "num_tokens": 52556051826.0, + "step": 12574 + }, + { + "epoch": 1.4943553178847298, + "grad_norm": 0.23168740150235567, + "learning_rate": 4.853610428264451e-06, + "loss": 0.7895, + "num_tokens": 52560241306.0, + "step": 12575 + }, + { + "epoch": 1.4944741532976826, + "grad_norm": 0.2640040450328621, + "learning_rate": 4.852345372955193e-06, + "loss": 0.8121, + "num_tokens": 52564415810.0, + "step": 12576 + }, + { + "epoch": 1.4945929887106357, + "grad_norm": 0.26356111420799166, + "learning_rate": 4.851080545314372e-06, + "loss": 0.8043, + "num_tokens": 52568596833.0, + "step": 12577 + }, + { + "epoch": 1.4947118241235888, + "grad_norm": 0.2654543512831189, + "learning_rate": 4.849815945388828e-06, + "loss": 0.8146, + "num_tokens": 52572785646.0, + "step": 12578 + }, + { + "epoch": 1.494830659536542, + "grad_norm": 0.2610912057400126, + "learning_rate": 4.8485515732253955e-06, + "loss": 0.8173, + "num_tokens": 52576974964.0, + "step": 12579 + }, + { + "epoch": 1.494949494949495, + "grad_norm": 0.24474122437301585, + "learning_rate": 4.847287428870895e-06, + "loss": 0.8277, + "num_tokens": 52581164057.0, + "step": 12580 + }, + { + "epoch": 1.495068330362448, + "grad_norm": 0.26725051947955036, + "learning_rate": 4.8460235123721435e-06, + "loss": 0.7951, + "num_tokens": 52585353202.0, + "step": 12581 + }, + { + "epoch": 1.495187165775401, + "grad_norm": 0.2783302099265832, + "learning_rate": 4.844759823775949e-06, + "loss": 0.8453, + "num_tokens": 52589542040.0, + "step": 12582 + }, + { + "epoch": 1.495306001188354, + "grad_norm": 0.25629928931531204, + "learning_rate": 4.843496363129111e-06, + "loss": 0.8285, + "num_tokens": 52593731614.0, + "step": 12583 + }, + { + "epoch": 1.4954248366013072, + "grad_norm": 0.2744996492491588, + "learning_rate": 4.842233130478416e-06, + "loss": 0.7869, + "num_tokens": 52597917479.0, + "step": 12584 + }, + { + "epoch": 1.4955436720142603, + "grad_norm": 0.2540067068883996, + "learning_rate": 4.840970125870652e-06, + "loss": 0.8047, + "num_tokens": 52602084391.0, + "step": 12585 + }, + { + "epoch": 1.4956625074272134, + "grad_norm": 0.25283410619883484, + "learning_rate": 4.839707349352583e-06, + "loss": 0.8459, + "num_tokens": 52606273358.0, + "step": 12586 + }, + { + "epoch": 1.4957813428401665, + "grad_norm": 0.2488220212068194, + "learning_rate": 4.838444800970982e-06, + "loss": 0.8331, + "num_tokens": 52610463705.0, + "step": 12587 + }, + { + "epoch": 1.4959001782531194, + "grad_norm": 0.26018090968130214, + "learning_rate": 4.8371824807726025e-06, + "loss": 0.7984, + "num_tokens": 52614652522.0, + "step": 12588 + }, + { + "epoch": 1.4960190136660725, + "grad_norm": 0.23474902491800245, + "learning_rate": 4.835920388804193e-06, + "loss": 0.8035, + "num_tokens": 52618841461.0, + "step": 12589 + }, + { + "epoch": 1.4961378490790256, + "grad_norm": 0.245745661293626, + "learning_rate": 4.834658525112495e-06, + "loss": 0.7896, + "num_tokens": 52623031023.0, + "step": 12590 + }, + { + "epoch": 1.4962566844919787, + "grad_norm": 0.250376703790046, + "learning_rate": 4.8333968897442356e-06, + "loss": 0.8286, + "num_tokens": 52627220013.0, + "step": 12591 + }, + { + "epoch": 1.4963755199049316, + "grad_norm": 0.24543056016438788, + "learning_rate": 4.832135482746138e-06, + "loss": 0.8335, + "num_tokens": 52631379419.0, + "step": 12592 + }, + { + "epoch": 1.4964943553178847, + "grad_norm": 0.25087699205257713, + "learning_rate": 4.830874304164915e-06, + "loss": 0.8202, + "num_tokens": 52635545918.0, + "step": 12593 + }, + { + "epoch": 1.4966131907308378, + "grad_norm": 0.24696002711091866, + "learning_rate": 4.829613354047277e-06, + "loss": 0.8082, + "num_tokens": 52639705407.0, + "step": 12594 + }, + { + "epoch": 1.4967320261437909, + "grad_norm": 0.2507269108912443, + "learning_rate": 4.828352632439924e-06, + "loss": 0.7946, + "num_tokens": 52643895054.0, + "step": 12595 + }, + { + "epoch": 1.496850861556744, + "grad_norm": 0.260288134448473, + "learning_rate": 4.8270921393895355e-06, + "loss": 0.7859, + "num_tokens": 52648082122.0, + "step": 12596 + }, + { + "epoch": 1.496969696969697, + "grad_norm": 0.24418989577229658, + "learning_rate": 4.825831874942797e-06, + "loss": 0.8045, + "num_tokens": 52652223096.0, + "step": 12597 + }, + { + "epoch": 1.4970885323826502, + "grad_norm": 0.2608732699934954, + "learning_rate": 4.824571839146378e-06, + "loss": 0.8254, + "num_tokens": 52656412022.0, + "step": 12598 + }, + { + "epoch": 1.497207367795603, + "grad_norm": 0.25018628375524266, + "learning_rate": 4.823312032046942e-06, + "loss": 0.7959, + "num_tokens": 52660600402.0, + "step": 12599 + }, + { + "epoch": 1.4973262032085561, + "grad_norm": 0.2367419238546413, + "learning_rate": 4.82205245369115e-06, + "loss": 0.8752, + "num_tokens": 52664724992.0, + "step": 12600 + }, + { + "epoch": 1.4974450386215092, + "grad_norm": 0.24015628310978748, + "learning_rate": 4.820793104125642e-06, + "loss": 0.7981, + "num_tokens": 52668893753.0, + "step": 12601 + }, + { + "epoch": 1.4975638740344623, + "grad_norm": 0.2422840414132333, + "learning_rate": 4.819533983397056e-06, + "loss": 0.8144, + "num_tokens": 52673072202.0, + "step": 12602 + }, + { + "epoch": 1.4976827094474152, + "grad_norm": 0.25160864639756036, + "learning_rate": 4.818275091552025e-06, + "loss": 0.8129, + "num_tokens": 52677222953.0, + "step": 12603 + }, + { + "epoch": 1.4978015448603683, + "grad_norm": 0.23334751267472126, + "learning_rate": 4.8170164286371675e-06, + "loss": 0.7862, + "num_tokens": 52681412414.0, + "step": 12604 + }, + { + "epoch": 1.4979203802733214, + "grad_norm": 0.26586614908745215, + "learning_rate": 4.815757994699098e-06, + "loss": 0.7768, + "num_tokens": 52685603045.0, + "step": 12605 + }, + { + "epoch": 1.4980392156862745, + "grad_norm": 0.2626967899699859, + "learning_rate": 4.814499789784418e-06, + "loss": 0.8141, + "num_tokens": 52689792598.0, + "step": 12606 + }, + { + "epoch": 1.4981580510992276, + "grad_norm": 0.24388138189196612, + "learning_rate": 4.8132418139397245e-06, + "loss": 0.8074, + "num_tokens": 52693982077.0, + "step": 12607 + }, + { + "epoch": 1.4982768865121807, + "grad_norm": 0.2658249023556024, + "learning_rate": 4.8119840672116055e-06, + "loss": 0.8408, + "num_tokens": 52698170682.0, + "step": 12608 + }, + { + "epoch": 1.4983957219251338, + "grad_norm": 0.24859518346092815, + "learning_rate": 4.8107265496466385e-06, + "loss": 0.8331, + "num_tokens": 52702358880.0, + "step": 12609 + }, + { + "epoch": 1.4985145573380867, + "grad_norm": 0.2919336247111284, + "learning_rate": 4.8094692612913925e-06, + "loss": 0.8043, + "num_tokens": 52706549127.0, + "step": 12610 + }, + { + "epoch": 1.4986333927510398, + "grad_norm": 0.27207994473816993, + "learning_rate": 4.808212202192435e-06, + "loss": 0.8035, + "num_tokens": 52710718403.0, + "step": 12611 + }, + { + "epoch": 1.498752228163993, + "grad_norm": 0.2528129145336645, + "learning_rate": 4.806955372396307e-06, + "loss": 0.7913, + "num_tokens": 52714842079.0, + "step": 12612 + }, + { + "epoch": 1.498871063576946, + "grad_norm": 0.2556166493756003, + "learning_rate": 4.805698771949565e-06, + "loss": 0.8302, + "num_tokens": 52719028455.0, + "step": 12613 + }, + { + "epoch": 1.4989898989898989, + "grad_norm": 0.25671246353277316, + "learning_rate": 4.804442400898741e-06, + "loss": 0.8373, + "num_tokens": 52723218134.0, + "step": 12614 + }, + { + "epoch": 1.499108734402852, + "grad_norm": 0.26702397977515774, + "learning_rate": 4.8031862592903615e-06, + "loss": 0.7793, + "num_tokens": 52727408189.0, + "step": 12615 + }, + { + "epoch": 1.499227569815805, + "grad_norm": 0.24568908785418223, + "learning_rate": 4.8019303471709505e-06, + "loss": 0.7736, + "num_tokens": 52731598126.0, + "step": 12616 + }, + { + "epoch": 1.4993464052287582, + "grad_norm": 0.2602062013201818, + "learning_rate": 4.800674664587012e-06, + "loss": 0.8052, + "num_tokens": 52735770970.0, + "step": 12617 + }, + { + "epoch": 1.4994652406417113, + "grad_norm": 0.2459440723960872, + "learning_rate": 4.799419211585047e-06, + "loss": 0.8263, + "num_tokens": 52739961306.0, + "step": 12618 + }, + { + "epoch": 1.4995840760546644, + "grad_norm": 0.27893784242155273, + "learning_rate": 4.798163988211558e-06, + "loss": 0.8515, + "num_tokens": 52744135633.0, + "step": 12619 + }, + { + "epoch": 1.4997029114676175, + "grad_norm": 0.2535883995423941, + "learning_rate": 4.796908994513025e-06, + "loss": 0.7864, + "num_tokens": 52748312817.0, + "step": 12620 + }, + { + "epoch": 1.4998217468805704, + "grad_norm": 0.30603905018879973, + "learning_rate": 4.795654230535927e-06, + "loss": 0.823, + "num_tokens": 52752503652.0, + "step": 12621 + }, + { + "epoch": 1.4999405822935235, + "grad_norm": 0.261354218914767, + "learning_rate": 4.794399696326728e-06, + "loss": 0.7848, + "num_tokens": 52756670169.0, + "step": 12622 + }, + { + "epoch": 1.5000594177064765, + "grad_norm": 0.28258809181268885, + "learning_rate": 4.7931453919318895e-06, + "loss": 0.803, + "num_tokens": 52760859025.0, + "step": 12623 + }, + { + "epoch": 1.5001782531194294, + "grad_norm": 0.2639269283018839, + "learning_rate": 4.791891317397863e-06, + "loss": 0.7991, + "num_tokens": 52764992969.0, + "step": 12624 + }, + { + "epoch": 1.5002970885323825, + "grad_norm": 0.25602530086620534, + "learning_rate": 4.790637472771088e-06, + "loss": 0.7982, + "num_tokens": 52769183437.0, + "step": 12625 + }, + { + "epoch": 1.5004159239453356, + "grad_norm": 0.2652924386685682, + "learning_rate": 4.789383858098008e-06, + "loss": 0.8036, + "num_tokens": 52773373922.0, + "step": 12626 + }, + { + "epoch": 1.5005347593582887, + "grad_norm": 0.2593012225938065, + "learning_rate": 4.78813047342504e-06, + "loss": 0.792, + "num_tokens": 52777563609.0, + "step": 12627 + }, + { + "epoch": 1.5006535947712418, + "grad_norm": 0.2521404514990862, + "learning_rate": 4.786877318798603e-06, + "loss": 0.8085, + "num_tokens": 52781753724.0, + "step": 12628 + }, + { + "epoch": 1.500772430184195, + "grad_norm": 0.38673436164257885, + "learning_rate": 4.785624394265106e-06, + "loss": 0.821, + "num_tokens": 52785943570.0, + "step": 12629 + }, + { + "epoch": 1.500891265597148, + "grad_norm": 0.2537059459541745, + "learning_rate": 4.784371699870948e-06, + "loss": 0.8216, + "num_tokens": 52790132169.0, + "step": 12630 + }, + { + "epoch": 1.5010101010101011, + "grad_norm": 0.2557073305443576, + "learning_rate": 4.7831192356625225e-06, + "loss": 0.8343, + "num_tokens": 52794276230.0, + "step": 12631 + }, + { + "epoch": 1.5011289364230542, + "grad_norm": 0.26756550369207605, + "learning_rate": 4.781867001686212e-06, + "loss": 0.7906, + "num_tokens": 52798464633.0, + "step": 12632 + }, + { + "epoch": 1.501247771836007, + "grad_norm": 0.23949337939392387, + "learning_rate": 4.780614997988389e-06, + "loss": 0.8056, + "num_tokens": 52802654830.0, + "step": 12633 + }, + { + "epoch": 1.5013666072489602, + "grad_norm": 0.285251208230051, + "learning_rate": 4.779363224615421e-06, + "loss": 0.7961, + "num_tokens": 52806842879.0, + "step": 12634 + }, + { + "epoch": 1.5014854426619133, + "grad_norm": 0.2508638395643308, + "learning_rate": 4.7781116816136664e-06, + "loss": 0.79, + "num_tokens": 52811033195.0, + "step": 12635 + }, + { + "epoch": 1.5016042780748662, + "grad_norm": 0.25661885228899944, + "learning_rate": 4.776860369029472e-06, + "loss": 0.7813, + "num_tokens": 52815197313.0, + "step": 12636 + }, + { + "epoch": 1.5017231134878193, + "grad_norm": 0.24522135960377153, + "learning_rate": 4.775609286909179e-06, + "loss": 0.8114, + "num_tokens": 52819387961.0, + "step": 12637 + }, + { + "epoch": 1.5018419489007724, + "grad_norm": 0.2662490078176998, + "learning_rate": 4.774358435299119e-06, + "loss": 0.834, + "num_tokens": 52823576197.0, + "step": 12638 + }, + { + "epoch": 1.5019607843137255, + "grad_norm": 0.25099626911424333, + "learning_rate": 4.7731078142456165e-06, + "loss": 0.7915, + "num_tokens": 52827765228.0, + "step": 12639 + }, + { + "epoch": 1.5020796197266786, + "grad_norm": 0.24217374631438515, + "learning_rate": 4.771857423794986e-06, + "loss": 0.8517, + "num_tokens": 52831937265.0, + "step": 12640 + }, + { + "epoch": 1.5021984551396317, + "grad_norm": 0.2495026694310728, + "learning_rate": 4.770607263993531e-06, + "loss": 0.7996, + "num_tokens": 52836126661.0, + "step": 12641 + }, + { + "epoch": 1.5023172905525848, + "grad_norm": 0.23793932972326892, + "learning_rate": 4.769357334887556e-06, + "loss": 0.7992, + "num_tokens": 52840292033.0, + "step": 12642 + }, + { + "epoch": 1.5024361259655379, + "grad_norm": 0.25833803756012164, + "learning_rate": 4.768107636523341e-06, + "loss": 0.7719, + "num_tokens": 52844482124.0, + "step": 12643 + }, + { + "epoch": 1.5025549613784908, + "grad_norm": 0.24950456001332363, + "learning_rate": 4.766858168947168e-06, + "loss": 0.7941, + "num_tokens": 52848645071.0, + "step": 12644 + }, + { + "epoch": 1.5026737967914439, + "grad_norm": 0.2472451215159051, + "learning_rate": 4.765608932205314e-06, + "loss": 0.7892, + "num_tokens": 52852829559.0, + "step": 12645 + }, + { + "epoch": 1.502792632204397, + "grad_norm": 0.24756789687957456, + "learning_rate": 4.764359926344041e-06, + "loss": 0.8164, + "num_tokens": 52857018037.0, + "step": 12646 + }, + { + "epoch": 1.5029114676173498, + "grad_norm": 0.26189533708430535, + "learning_rate": 4.763111151409606e-06, + "loss": 0.8067, + "num_tokens": 52861208052.0, + "step": 12647 + }, + { + "epoch": 1.503030303030303, + "grad_norm": 0.26848670295128774, + "learning_rate": 4.7618626074482495e-06, + "loss": 0.8296, + "num_tokens": 52865396756.0, + "step": 12648 + }, + { + "epoch": 1.503149138443256, + "grad_norm": 0.25557121867267835, + "learning_rate": 4.7606142945062125e-06, + "loss": 0.8007, + "num_tokens": 52869584810.0, + "step": 12649 + }, + { + "epoch": 1.5032679738562091, + "grad_norm": 0.25950077199931487, + "learning_rate": 4.759366212629721e-06, + "loss": 0.8208, + "num_tokens": 52873769742.0, + "step": 12650 + }, + { + "epoch": 1.5033868092691622, + "grad_norm": 0.25595791939333895, + "learning_rate": 4.758118361865001e-06, + "loss": 0.8021, + "num_tokens": 52877942116.0, + "step": 12651 + }, + { + "epoch": 1.5035056446821153, + "grad_norm": 0.2764017466917497, + "learning_rate": 4.756870742258266e-06, + "loss": 0.7897, + "num_tokens": 52882110755.0, + "step": 12652 + }, + { + "epoch": 1.5036244800950684, + "grad_norm": 0.25293524151519386, + "learning_rate": 4.755623353855713e-06, + "loss": 0.7926, + "num_tokens": 52886300291.0, + "step": 12653 + }, + { + "epoch": 1.5037433155080215, + "grad_norm": 0.24980744247434022, + "learning_rate": 4.75437619670354e-06, + "loss": 0.8075, + "num_tokens": 52890490146.0, + "step": 12654 + }, + { + "epoch": 1.5038621509209744, + "grad_norm": 0.2655205945669285, + "learning_rate": 4.753129270847934e-06, + "loss": 0.7947, + "num_tokens": 52894678746.0, + "step": 12655 + }, + { + "epoch": 1.5039809863339275, + "grad_norm": 0.257710821333095, + "learning_rate": 4.751882576335067e-06, + "loss": 0.7801, + "num_tokens": 52898848865.0, + "step": 12656 + }, + { + "epoch": 1.5040998217468806, + "grad_norm": 0.23912733151721252, + "learning_rate": 4.750636113211121e-06, + "loss": 0.8322, + "num_tokens": 52903037627.0, + "step": 12657 + }, + { + "epoch": 1.5042186571598335, + "grad_norm": 0.25645228132864895, + "learning_rate": 4.749389881522247e-06, + "loss": 0.7564, + "num_tokens": 52907225906.0, + "step": 12658 + }, + { + "epoch": 1.5043374925727866, + "grad_norm": 0.23669186590519176, + "learning_rate": 4.748143881314599e-06, + "loss": 0.7884, + "num_tokens": 52911415075.0, + "step": 12659 + }, + { + "epoch": 1.5044563279857397, + "grad_norm": 0.26261841986721796, + "learning_rate": 4.74689811263432e-06, + "loss": 0.8303, + "num_tokens": 52915603703.0, + "step": 12660 + }, + { + "epoch": 1.5045751633986928, + "grad_norm": 0.25063388564456757, + "learning_rate": 4.745652575527547e-06, + "loss": 0.8045, + "num_tokens": 52919776429.0, + "step": 12661 + }, + { + "epoch": 1.504693998811646, + "grad_norm": 0.269220397388861, + "learning_rate": 4.744407270040407e-06, + "loss": 0.8184, + "num_tokens": 52923947165.0, + "step": 12662 + }, + { + "epoch": 1.504812834224599, + "grad_norm": 0.2557176864516867, + "learning_rate": 4.743162196219014e-06, + "loss": 0.8649, + "num_tokens": 52928108063.0, + "step": 12663 + }, + { + "epoch": 1.504931669637552, + "grad_norm": 0.2803268108582196, + "learning_rate": 4.741917354109481e-06, + "loss": 0.821, + "num_tokens": 52932295548.0, + "step": 12664 + }, + { + "epoch": 1.5050505050505052, + "grad_norm": 0.26599443629239156, + "learning_rate": 4.740672743757907e-06, + "loss": 0.8207, + "num_tokens": 52936454429.0, + "step": 12665 + }, + { + "epoch": 1.505169340463458, + "grad_norm": 0.2678144710209541, + "learning_rate": 4.739428365210385e-06, + "loss": 0.8129, + "num_tokens": 52940644260.0, + "step": 12666 + }, + { + "epoch": 1.5052881758764112, + "grad_norm": 0.25074179396577573, + "learning_rate": 4.7381842185129995e-06, + "loss": 0.7885, + "num_tokens": 52944832827.0, + "step": 12667 + }, + { + "epoch": 1.5054070112893643, + "grad_norm": 0.28178838302178333, + "learning_rate": 4.736940303711825e-06, + "loss": 0.8066, + "num_tokens": 52949023579.0, + "step": 12668 + }, + { + "epoch": 1.5055258467023171, + "grad_norm": 0.24927385267915775, + "learning_rate": 4.735696620852922e-06, + "loss": 0.8266, + "num_tokens": 52953204485.0, + "step": 12669 + }, + { + "epoch": 1.5056446821152702, + "grad_norm": 0.26567680918252623, + "learning_rate": 4.734453169982356e-06, + "loss": 0.799, + "num_tokens": 52957393010.0, + "step": 12670 + }, + { + "epoch": 1.5057635175282233, + "grad_norm": 0.24041651441642561, + "learning_rate": 4.7332099511461725e-06, + "loss": 0.821, + "num_tokens": 52961582890.0, + "step": 12671 + }, + { + "epoch": 1.5058823529411764, + "grad_norm": 0.2839949847512515, + "learning_rate": 4.731966964390413e-06, + "loss": 0.8355, + "num_tokens": 52965737448.0, + "step": 12672 + }, + { + "epoch": 1.5060011883541295, + "grad_norm": 0.23882614577435185, + "learning_rate": 4.730724209761114e-06, + "loss": 0.8002, + "num_tokens": 52969925802.0, + "step": 12673 + }, + { + "epoch": 1.5061200237670826, + "grad_norm": 0.2777561912948347, + "learning_rate": 4.729481687304292e-06, + "loss": 0.784, + "num_tokens": 52974115338.0, + "step": 12674 + }, + { + "epoch": 1.5062388591800357, + "grad_norm": 0.2736917863258595, + "learning_rate": 4.728239397065962e-06, + "loss": 0.8099, + "num_tokens": 52978303393.0, + "step": 12675 + }, + { + "epoch": 1.5063576945929888, + "grad_norm": 0.27358600383204656, + "learning_rate": 4.7269973390921305e-06, + "loss": 0.8204, + "num_tokens": 52982469227.0, + "step": 12676 + }, + { + "epoch": 1.5064765300059417, + "grad_norm": 0.2695642323665636, + "learning_rate": 4.7257555134288e-06, + "loss": 0.8138, + "num_tokens": 52986624475.0, + "step": 12677 + }, + { + "epoch": 1.5065953654188948, + "grad_norm": 0.26831441629429137, + "learning_rate": 4.72451392012196e-06, + "loss": 0.8304, + "num_tokens": 52990809179.0, + "step": 12678 + }, + { + "epoch": 1.506714200831848, + "grad_norm": 0.2884717428957091, + "learning_rate": 4.7232725592175835e-06, + "loss": 0.8308, + "num_tokens": 52994996574.0, + "step": 12679 + }, + { + "epoch": 1.5068330362448008, + "grad_norm": 0.2690970329631857, + "learning_rate": 4.722031430761647e-06, + "loss": 0.8262, + "num_tokens": 52999153480.0, + "step": 12680 + }, + { + "epoch": 1.506951871657754, + "grad_norm": 0.29698720471106727, + "learning_rate": 4.720790534800113e-06, + "loss": 0.771, + "num_tokens": 53003283819.0, + "step": 12681 + }, + { + "epoch": 1.507070707070707, + "grad_norm": 0.25991949180647017, + "learning_rate": 4.7195498713789325e-06, + "loss": 0.8221, + "num_tokens": 53007473580.0, + "step": 12682 + }, + { + "epoch": 1.50718954248366, + "grad_norm": 0.28642892991630187, + "learning_rate": 4.718309440544062e-06, + "loss": 0.8142, + "num_tokens": 53011662197.0, + "step": 12683 + }, + { + "epoch": 1.5073083778966132, + "grad_norm": 0.25381603990454027, + "learning_rate": 4.717069242341427e-06, + "loss": 0.8365, + "num_tokens": 53015829060.0, + "step": 12684 + }, + { + "epoch": 1.5074272133095663, + "grad_norm": 0.26221747558475883, + "learning_rate": 4.715829276816962e-06, + "loss": 0.8229, + "num_tokens": 53020019747.0, + "step": 12685 + }, + { + "epoch": 1.5075460487225194, + "grad_norm": 0.25878265923518085, + "learning_rate": 4.7145895440165865e-06, + "loss": 0.8053, + "num_tokens": 53024207846.0, + "step": 12686 + }, + { + "epoch": 1.5076648841354725, + "grad_norm": 0.31087686574040424, + "learning_rate": 4.713350043986211e-06, + "loss": 0.8254, + "num_tokens": 53028397787.0, + "step": 12687 + }, + { + "epoch": 1.5077837195484254, + "grad_norm": 0.2691956682143284, + "learning_rate": 4.7121107767717405e-06, + "loss": 0.816, + "num_tokens": 53032585965.0, + "step": 12688 + }, + { + "epoch": 1.5079025549613785, + "grad_norm": 0.2561108244588178, + "learning_rate": 4.7108717424190664e-06, + "loss": 0.7911, + "num_tokens": 53036774551.0, + "step": 12689 + }, + { + "epoch": 1.5080213903743316, + "grad_norm": 0.256510738970594, + "learning_rate": 4.7096329409740754e-06, + "loss": 0.7886, + "num_tokens": 53040954592.0, + "step": 12690 + }, + { + "epoch": 1.5081402257872845, + "grad_norm": 0.23750349126705653, + "learning_rate": 4.708394372482646e-06, + "loss": 0.7698, + "num_tokens": 53045116388.0, + "step": 12691 + }, + { + "epoch": 1.5082590612002376, + "grad_norm": 0.24610121498400525, + "learning_rate": 4.707156036990645e-06, + "loss": 0.8217, + "num_tokens": 53049274258.0, + "step": 12692 + }, + { + "epoch": 1.5083778966131907, + "grad_norm": 0.2500654556900158, + "learning_rate": 4.7059179345439325e-06, + "loss": 0.8276, + "num_tokens": 53053463904.0, + "step": 12693 + }, + { + "epoch": 1.5084967320261438, + "grad_norm": 0.2490191945214936, + "learning_rate": 4.70468006518836e-06, + "loss": 0.7985, + "num_tokens": 53057653627.0, + "step": 12694 + }, + { + "epoch": 1.5086155674390969, + "grad_norm": 0.2506615215770803, + "learning_rate": 4.703442428969769e-06, + "loss": 0.8375, + "num_tokens": 53061825702.0, + "step": 12695 + }, + { + "epoch": 1.50873440285205, + "grad_norm": 0.25455858693505834, + "learning_rate": 4.702205025933993e-06, + "loss": 0.7701, + "num_tokens": 53066015454.0, + "step": 12696 + }, + { + "epoch": 1.508853238265003, + "grad_norm": 0.23876820591772624, + "learning_rate": 4.70096785612686e-06, + "loss": 0.7984, + "num_tokens": 53070204801.0, + "step": 12697 + }, + { + "epoch": 1.5089720736779562, + "grad_norm": 0.24827851058557301, + "learning_rate": 4.699730919594184e-06, + "loss": 0.8122, + "num_tokens": 53074380127.0, + "step": 12698 + }, + { + "epoch": 1.509090909090909, + "grad_norm": 0.23379347954051594, + "learning_rate": 4.698494216381775e-06, + "loss": 0.7987, + "num_tokens": 53078569694.0, + "step": 12699 + }, + { + "epoch": 1.5092097445038621, + "grad_norm": 0.2439431359442814, + "learning_rate": 4.697257746535433e-06, + "loss": 0.8101, + "num_tokens": 53082745681.0, + "step": 12700 + }, + { + "epoch": 1.5093285799168152, + "grad_norm": 0.2611635751300568, + "learning_rate": 4.696021510100941e-06, + "loss": 0.8171, + "num_tokens": 53086936031.0, + "step": 12701 + }, + { + "epoch": 1.5094474153297681, + "grad_norm": 0.2521394093208757, + "learning_rate": 4.694785507124089e-06, + "loss": 0.797, + "num_tokens": 53091125834.0, + "step": 12702 + }, + { + "epoch": 1.5095662507427212, + "grad_norm": 0.2565396975568158, + "learning_rate": 4.6935497376506496e-06, + "loss": 0.8224, + "num_tokens": 53095288357.0, + "step": 12703 + }, + { + "epoch": 1.5096850861556743, + "grad_norm": 0.2320427566889718, + "learning_rate": 4.692314201726386e-06, + "loss": 0.8172, + "num_tokens": 53099450467.0, + "step": 12704 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 0.25543766071148805, + "learning_rate": 4.691078899397056e-06, + "loss": 0.8287, + "num_tokens": 53103626045.0, + "step": 12705 + }, + { + "epoch": 1.5099227569815805, + "grad_norm": 0.23687899879678886, + "learning_rate": 4.689843830708402e-06, + "loss": 0.7973, + "num_tokens": 53107813415.0, + "step": 12706 + }, + { + "epoch": 1.5100415923945336, + "grad_norm": 0.24591557448273219, + "learning_rate": 4.688608995706165e-06, + "loss": 0.8032, + "num_tokens": 53112002823.0, + "step": 12707 + }, + { + "epoch": 1.5101604278074867, + "grad_norm": 0.24431068282799792, + "learning_rate": 4.687374394436077e-06, + "loss": 0.8368, + "num_tokens": 53116191494.0, + "step": 12708 + }, + { + "epoch": 1.5102792632204398, + "grad_norm": 0.2535774937613544, + "learning_rate": 4.68614002694386e-06, + "loss": 0.812, + "num_tokens": 53120379424.0, + "step": 12709 + }, + { + "epoch": 1.510398098633393, + "grad_norm": 0.2628615356559295, + "learning_rate": 4.684905893275229e-06, + "loss": 0.82, + "num_tokens": 53124568551.0, + "step": 12710 + }, + { + "epoch": 1.5105169340463458, + "grad_norm": 0.2543946008258187, + "learning_rate": 4.68367199347588e-06, + "loss": 0.7988, + "num_tokens": 53128744080.0, + "step": 12711 + }, + { + "epoch": 1.510635769459299, + "grad_norm": 0.25403305626811973, + "learning_rate": 4.6824383275915145e-06, + "loss": 0.798, + "num_tokens": 53132933407.0, + "step": 12712 + }, + { + "epoch": 1.5107546048722518, + "grad_norm": 0.26466036703037016, + "learning_rate": 4.681204895667817e-06, + "loss": 0.7969, + "num_tokens": 53137122718.0, + "step": 12713 + }, + { + "epoch": 1.5108734402852049, + "grad_norm": 0.23707192394546772, + "learning_rate": 4.679971697750464e-06, + "loss": 0.8325, + "num_tokens": 53141310467.0, + "step": 12714 + }, + { + "epoch": 1.510992275698158, + "grad_norm": 0.2715175191696148, + "learning_rate": 4.678738733885134e-06, + "loss": 0.8084, + "num_tokens": 53145492777.0, + "step": 12715 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 0.250518115635383, + "learning_rate": 4.677506004117477e-06, + "loss": 0.8234, + "num_tokens": 53149665008.0, + "step": 12716 + }, + { + "epoch": 1.5112299465240642, + "grad_norm": 0.28523686080192817, + "learning_rate": 4.676273508493151e-06, + "loss": 0.8149, + "num_tokens": 53153822860.0, + "step": 12717 + }, + { + "epoch": 1.5113487819370173, + "grad_norm": 0.26879087407181584, + "learning_rate": 4.675041247057797e-06, + "loss": 0.8334, + "num_tokens": 53158005999.0, + "step": 12718 + }, + { + "epoch": 1.5114676173499704, + "grad_norm": 0.2567893038899724, + "learning_rate": 4.673809219857051e-06, + "loss": 0.8133, + "num_tokens": 53162195145.0, + "step": 12719 + }, + { + "epoch": 1.5115864527629235, + "grad_norm": 0.26670021380469483, + "learning_rate": 4.67257742693654e-06, + "loss": 0.8653, + "num_tokens": 53166385628.0, + "step": 12720 + }, + { + "epoch": 1.5117052881758766, + "grad_norm": 0.24322347111215656, + "learning_rate": 4.671345868341879e-06, + "loss": 0.8228, + "num_tokens": 53170559131.0, + "step": 12721 + }, + { + "epoch": 1.5118241235888294, + "grad_norm": 0.2754621920841732, + "learning_rate": 4.670114544118679e-06, + "loss": 0.8338, + "num_tokens": 53174747600.0, + "step": 12722 + }, + { + "epoch": 1.5119429590017825, + "grad_norm": 0.23154870856292864, + "learning_rate": 4.668883454312539e-06, + "loss": 0.8098, + "num_tokens": 53178936909.0, + "step": 12723 + }, + { + "epoch": 1.5120617944147354, + "grad_norm": 0.26912299403768564, + "learning_rate": 4.667652598969051e-06, + "loss": 0.827, + "num_tokens": 53183113795.0, + "step": 12724 + }, + { + "epoch": 1.5121806298276885, + "grad_norm": 0.24805904035842874, + "learning_rate": 4.666421978133796e-06, + "loss": 0.795, + "num_tokens": 53187300372.0, + "step": 12725 + }, + { + "epoch": 1.5122994652406416, + "grad_norm": 0.30221960333033315, + "learning_rate": 4.665191591852352e-06, + "loss": 0.8225, + "num_tokens": 53191478329.0, + "step": 12726 + }, + { + "epoch": 1.5124183006535947, + "grad_norm": 0.2454617348482284, + "learning_rate": 4.6639614401702795e-06, + "loss": 0.8131, + "num_tokens": 53195668260.0, + "step": 12727 + }, + { + "epoch": 1.5125371360665478, + "grad_norm": 0.2968884049791548, + "learning_rate": 4.6627315231331396e-06, + "loss": 0.8216, + "num_tokens": 53199834858.0, + "step": 12728 + }, + { + "epoch": 1.512655971479501, + "grad_norm": 0.24905369931723229, + "learning_rate": 4.661501840786477e-06, + "loss": 0.8206, + "num_tokens": 53204021526.0, + "step": 12729 + }, + { + "epoch": 1.512774806892454, + "grad_norm": 0.3179814047575617, + "learning_rate": 4.660272393175832e-06, + "loss": 0.8185, + "num_tokens": 53208210167.0, + "step": 12730 + }, + { + "epoch": 1.5128936423054071, + "grad_norm": 0.2558978540076644, + "learning_rate": 4.659043180346738e-06, + "loss": 0.8334, + "num_tokens": 53212400181.0, + "step": 12731 + }, + { + "epoch": 1.5130124777183602, + "grad_norm": 0.2944082240971828, + "learning_rate": 4.657814202344713e-06, + "loss": 0.8063, + "num_tokens": 53216572321.0, + "step": 12732 + }, + { + "epoch": 1.513131313131313, + "grad_norm": 0.24466686477779376, + "learning_rate": 4.656585459215267e-06, + "loss": 0.8037, + "num_tokens": 53220714542.0, + "step": 12733 + }, + { + "epoch": 1.5132501485442662, + "grad_norm": 0.3055004747910674, + "learning_rate": 4.655356951003913e-06, + "loss": 0.8137, + "num_tokens": 53224905065.0, + "step": 12734 + }, + { + "epoch": 1.5133689839572193, + "grad_norm": 0.24526652502237023, + "learning_rate": 4.654128677756143e-06, + "loss": 0.8273, + "num_tokens": 53229095662.0, + "step": 12735 + }, + { + "epoch": 1.5134878193701722, + "grad_norm": 0.28111826453480915, + "learning_rate": 4.652900639517447e-06, + "loss": 0.8168, + "num_tokens": 53233284961.0, + "step": 12736 + }, + { + "epoch": 1.5136066547831253, + "grad_norm": 0.2665419623827628, + "learning_rate": 4.651672836333296e-06, + "loss": 0.8211, + "num_tokens": 53237474788.0, + "step": 12737 + }, + { + "epoch": 1.5137254901960784, + "grad_norm": 0.26130663527366355, + "learning_rate": 4.650445268249166e-06, + "loss": 0.7838, + "num_tokens": 53241662054.0, + "step": 12738 + }, + { + "epoch": 1.5138443256090315, + "grad_norm": 0.3049566385612688, + "learning_rate": 4.64921793531051e-06, + "loss": 0.8355, + "num_tokens": 53245846578.0, + "step": 12739 + }, + { + "epoch": 1.5139631610219846, + "grad_norm": 0.30584002018423245, + "learning_rate": 4.647990837562792e-06, + "loss": 0.8503, + "num_tokens": 53250034392.0, + "step": 12740 + }, + { + "epoch": 1.5140819964349377, + "grad_norm": 0.23378351271768363, + "learning_rate": 4.646763975051452e-06, + "loss": 0.7881, + "num_tokens": 53254214822.0, + "step": 12741 + }, + { + "epoch": 1.5142008318478908, + "grad_norm": 0.26956267640669457, + "learning_rate": 4.645537347821919e-06, + "loss": 0.7841, + "num_tokens": 53258403893.0, + "step": 12742 + }, + { + "epoch": 1.5143196672608439, + "grad_norm": 0.278914936997267, + "learning_rate": 4.644310955919625e-06, + "loss": 0.8316, + "num_tokens": 53262594143.0, + "step": 12743 + }, + { + "epoch": 1.5144385026737968, + "grad_norm": 0.2666847355082276, + "learning_rate": 4.643084799389984e-06, + "loss": 0.8392, + "num_tokens": 53266784420.0, + "step": 12744 + }, + { + "epoch": 1.5145573380867499, + "grad_norm": 0.27602799600055744, + "learning_rate": 4.641858878278404e-06, + "loss": 0.7818, + "num_tokens": 53270967730.0, + "step": 12745 + }, + { + "epoch": 1.514676173499703, + "grad_norm": 0.25677605494996225, + "learning_rate": 4.640633192630293e-06, + "loss": 0.7885, + "num_tokens": 53275158202.0, + "step": 12746 + }, + { + "epoch": 1.5147950089126558, + "grad_norm": 0.28927544975152775, + "learning_rate": 4.639407742491033e-06, + "loss": 0.8264, + "num_tokens": 53279315939.0, + "step": 12747 + }, + { + "epoch": 1.514913844325609, + "grad_norm": 0.2715487945302376, + "learning_rate": 4.63818252790601e-06, + "loss": 0.8409, + "num_tokens": 53283479592.0, + "step": 12748 + }, + { + "epoch": 1.515032679738562, + "grad_norm": 0.2847893809543514, + "learning_rate": 4.636957548920597e-06, + "loss": 0.7919, + "num_tokens": 53287667674.0, + "step": 12749 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 0.28534726818632206, + "learning_rate": 4.635732805580162e-06, + "loss": 0.8301, + "num_tokens": 53291856025.0, + "step": 12750 + }, + { + "epoch": 1.5152703505644682, + "grad_norm": 0.2618557797502859, + "learning_rate": 4.634508297930058e-06, + "loss": 0.8451, + "num_tokens": 53296044114.0, + "step": 12751 + }, + { + "epoch": 1.5153891859774213, + "grad_norm": 0.2619802176927954, + "learning_rate": 4.633284026015633e-06, + "loss": 0.8316, + "num_tokens": 53300207549.0, + "step": 12752 + }, + { + "epoch": 1.5155080213903744, + "grad_norm": 0.27137009859612055, + "learning_rate": 4.6320599898822285e-06, + "loss": 0.8093, + "num_tokens": 53304396393.0, + "step": 12753 + }, + { + "epoch": 1.5156268568033275, + "grad_norm": 0.25771831823961927, + "learning_rate": 4.630836189575173e-06, + "loss": 0.8554, + "num_tokens": 53308584870.0, + "step": 12754 + }, + { + "epoch": 1.5157456922162804, + "grad_norm": 0.25398866953475796, + "learning_rate": 4.629612625139787e-06, + "loss": 0.7669, + "num_tokens": 53312773881.0, + "step": 12755 + }, + { + "epoch": 1.5158645276292335, + "grad_norm": 0.25596854345958303, + "learning_rate": 4.628389296621385e-06, + "loss": 0.8133, + "num_tokens": 53316952938.0, + "step": 12756 + }, + { + "epoch": 1.5159833630421866, + "grad_norm": 0.2536398362654304, + "learning_rate": 4.627166204065273e-06, + "loss": 0.8272, + "num_tokens": 53321142444.0, + "step": 12757 + }, + { + "epoch": 1.5161021984551395, + "grad_norm": 0.24811599676322868, + "learning_rate": 4.625943347516737e-06, + "loss": 0.7995, + "num_tokens": 53325311463.0, + "step": 12758 + }, + { + "epoch": 1.5162210338680926, + "grad_norm": 0.2564688614111082, + "learning_rate": 4.6247207270210745e-06, + "loss": 0.8184, + "num_tokens": 53329479887.0, + "step": 12759 + }, + { + "epoch": 1.5163398692810457, + "grad_norm": 0.27169239610642837, + "learning_rate": 4.6234983426235585e-06, + "loss": 0.8375, + "num_tokens": 53333617893.0, + "step": 12760 + }, + { + "epoch": 1.5164587046939988, + "grad_norm": 0.2911952955113901, + "learning_rate": 4.622276194369457e-06, + "loss": 0.7787, + "num_tokens": 53337800093.0, + "step": 12761 + }, + { + "epoch": 1.516577540106952, + "grad_norm": 0.25588444055815474, + "learning_rate": 4.621054282304035e-06, + "loss": 0.7758, + "num_tokens": 53341950762.0, + "step": 12762 + }, + { + "epoch": 1.516696375519905, + "grad_norm": 0.26292220267932553, + "learning_rate": 4.619832606472538e-06, + "loss": 0.8261, + "num_tokens": 53346141142.0, + "step": 12763 + }, + { + "epoch": 1.516815210932858, + "grad_norm": 0.25464615924435685, + "learning_rate": 4.618611166920212e-06, + "loss": 0.8002, + "num_tokens": 53350329575.0, + "step": 12764 + }, + { + "epoch": 1.5169340463458112, + "grad_norm": 0.2308798889000566, + "learning_rate": 4.617389963692288e-06, + "loss": 0.7862, + "num_tokens": 53354519416.0, + "step": 12765 + }, + { + "epoch": 1.517052881758764, + "grad_norm": 0.2607547069643996, + "learning_rate": 4.616168996833995e-06, + "loss": 0.8287, + "num_tokens": 53358708996.0, + "step": 12766 + }, + { + "epoch": 1.5171717171717172, + "grad_norm": 0.2295408309629681, + "learning_rate": 4.6149482663905525e-06, + "loss": 0.8081, + "num_tokens": 53362896873.0, + "step": 12767 + }, + { + "epoch": 1.5172905525846703, + "grad_norm": 0.2537085649033743, + "learning_rate": 4.61372777240716e-06, + "loss": 0.8161, + "num_tokens": 53367085457.0, + "step": 12768 + }, + { + "epoch": 1.5174093879976231, + "grad_norm": 0.23692169782440506, + "learning_rate": 4.612507514929022e-06, + "loss": 0.8229, + "num_tokens": 53371245315.0, + "step": 12769 + }, + { + "epoch": 1.5175282234105762, + "grad_norm": 0.25319213455722245, + "learning_rate": 4.6112874940013256e-06, + "loss": 0.8265, + "num_tokens": 53375411518.0, + "step": 12770 + }, + { + "epoch": 1.5176470588235293, + "grad_norm": 0.25711178744852803, + "learning_rate": 4.610067709669252e-06, + "loss": 0.7799, + "num_tokens": 53379574296.0, + "step": 12771 + }, + { + "epoch": 1.5177658942364824, + "grad_norm": 0.23704471721681603, + "learning_rate": 4.608848161977982e-06, + "loss": 0.795, + "num_tokens": 53383751718.0, + "step": 12772 + }, + { + "epoch": 1.5178847296494355, + "grad_norm": 0.2500165757902052, + "learning_rate": 4.607628850972672e-06, + "loss": 0.779, + "num_tokens": 53387942308.0, + "step": 12773 + }, + { + "epoch": 1.5180035650623886, + "grad_norm": 0.2537883885301749, + "learning_rate": 4.6064097766984764e-06, + "loss": 0.8226, + "num_tokens": 53392132108.0, + "step": 12774 + }, + { + "epoch": 1.5181224004753417, + "grad_norm": 0.24766034546830615, + "learning_rate": 4.605190939200544e-06, + "loss": 0.7793, + "num_tokens": 53396321197.0, + "step": 12775 + }, + { + "epoch": 1.5182412358882948, + "grad_norm": 0.2444602018464247, + "learning_rate": 4.603972338524014e-06, + "loss": 0.7761, + "num_tokens": 53400491997.0, + "step": 12776 + }, + { + "epoch": 1.5183600713012477, + "grad_norm": 0.25129504775971245, + "learning_rate": 4.6027539747140135e-06, + "loss": 0.82, + "num_tokens": 53404680590.0, + "step": 12777 + }, + { + "epoch": 1.5184789067142008, + "grad_norm": 0.2311738852300465, + "learning_rate": 4.601535847815662e-06, + "loss": 0.7812, + "num_tokens": 53408869606.0, + "step": 12778 + }, + { + "epoch": 1.518597742127154, + "grad_norm": 0.2642864390124454, + "learning_rate": 4.600317957874073e-06, + "loss": 0.7845, + "num_tokens": 53413045329.0, + "step": 12779 + }, + { + "epoch": 1.5187165775401068, + "grad_norm": 0.2542902134901807, + "learning_rate": 4.599100304934348e-06, + "loss": 0.8213, + "num_tokens": 53417234365.0, + "step": 12780 + }, + { + "epoch": 1.51883541295306, + "grad_norm": 0.2710221741205541, + "learning_rate": 4.59788288904158e-06, + "loss": 0.8145, + "num_tokens": 53421375578.0, + "step": 12781 + }, + { + "epoch": 1.518954248366013, + "grad_norm": 0.2534023689334762, + "learning_rate": 4.596665710240854e-06, + "loss": 0.7893, + "num_tokens": 53425566054.0, + "step": 12782 + }, + { + "epoch": 1.519073083778966, + "grad_norm": 0.27785650367989256, + "learning_rate": 4.5954487685772495e-06, + "loss": 0.8243, + "num_tokens": 53429754406.0, + "step": 12783 + }, + { + "epoch": 1.5191919191919192, + "grad_norm": 0.25563560860825596, + "learning_rate": 4.594232064095827e-06, + "loss": 0.8314, + "num_tokens": 53433928153.0, + "step": 12784 + }, + { + "epoch": 1.5193107546048723, + "grad_norm": 0.25695468704880364, + "learning_rate": 4.593015596841652e-06, + "loss": 0.8022, + "num_tokens": 53438117185.0, + "step": 12785 + }, + { + "epoch": 1.5194295900178254, + "grad_norm": 0.27592665177798725, + "learning_rate": 4.591799366859772e-06, + "loss": 0.8081, + "num_tokens": 53442306702.0, + "step": 12786 + }, + { + "epoch": 1.5195484254307785, + "grad_norm": 0.26193353769975286, + "learning_rate": 4.590583374195227e-06, + "loss": 0.8353, + "num_tokens": 53446495978.0, + "step": 12787 + }, + { + "epoch": 1.5196672608437314, + "grad_norm": 0.25249132148409026, + "learning_rate": 4.589367618893055e-06, + "loss": 0.7858, + "num_tokens": 53450684116.0, + "step": 12788 + }, + { + "epoch": 1.5197860962566845, + "grad_norm": 0.25253244698421473, + "learning_rate": 4.588152100998272e-06, + "loss": 0.7922, + "num_tokens": 53454803268.0, + "step": 12789 + }, + { + "epoch": 1.5199049316696376, + "grad_norm": 0.24950727611768697, + "learning_rate": 4.586936820555892e-06, + "loss": 0.8034, + "num_tokens": 53458986576.0, + "step": 12790 + }, + { + "epoch": 1.5200237670825905, + "grad_norm": 0.2527718907362935, + "learning_rate": 4.585721777610928e-06, + "loss": 0.8278, + "num_tokens": 53463175697.0, + "step": 12791 + }, + { + "epoch": 1.5201426024955436, + "grad_norm": 0.272016427861085, + "learning_rate": 4.584506972208374e-06, + "loss": 0.8132, + "num_tokens": 53467366243.0, + "step": 12792 + }, + { + "epoch": 1.5202614379084967, + "grad_norm": 0.2605328449670117, + "learning_rate": 4.58329240439322e-06, + "loss": 0.8365, + "num_tokens": 53471556332.0, + "step": 12793 + }, + { + "epoch": 1.5203802733214498, + "grad_norm": 0.24897756014247574, + "learning_rate": 4.582078074210441e-06, + "loss": 0.7964, + "num_tokens": 53475735533.0, + "step": 12794 + }, + { + "epoch": 1.5204991087344029, + "grad_norm": 0.2662376606821279, + "learning_rate": 4.58086398170501e-06, + "loss": 0.7827, + "num_tokens": 53479925171.0, + "step": 12795 + }, + { + "epoch": 1.520617944147356, + "grad_norm": 0.2549067272962937, + "learning_rate": 4.579650126921888e-06, + "loss": 0.7964, + "num_tokens": 53484114313.0, + "step": 12796 + }, + { + "epoch": 1.520736779560309, + "grad_norm": 0.2908135648376868, + "learning_rate": 4.578436509906032e-06, + "loss": 0.8184, + "num_tokens": 53488304276.0, + "step": 12797 + }, + { + "epoch": 1.5208556149732622, + "grad_norm": 0.2609778242019876, + "learning_rate": 4.577223130702385e-06, + "loss": 0.847, + "num_tokens": 53492469531.0, + "step": 12798 + }, + { + "epoch": 1.520974450386215, + "grad_norm": 0.2784926798575413, + "learning_rate": 4.5760099893558805e-06, + "loss": 0.8332, + "num_tokens": 53496627347.0, + "step": 12799 + }, + { + "epoch": 1.5210932857991681, + "grad_norm": 0.2586226919519708, + "learning_rate": 4.574797085911446e-06, + "loss": 0.829, + "num_tokens": 53500816716.0, + "step": 12800 + }, + { + "epoch": 1.5212121212121212, + "grad_norm": 0.2703459192771198, + "learning_rate": 4.573584420413999e-06, + "loss": 0.8334, + "num_tokens": 53505000371.0, + "step": 12801 + }, + { + "epoch": 1.5213309566250741, + "grad_norm": 0.2579348041240815, + "learning_rate": 4.5723719929084484e-06, + "loss": 0.8021, + "num_tokens": 53509187501.0, + "step": 12802 + }, + { + "epoch": 1.5214497920380272, + "grad_norm": 0.2946228712638401, + "learning_rate": 4.571159803439697e-06, + "loss": 0.8015, + "num_tokens": 53513377233.0, + "step": 12803 + }, + { + "epoch": 1.5215686274509803, + "grad_norm": 0.24348493574069827, + "learning_rate": 4.5699478520526325e-06, + "loss": 0.8106, + "num_tokens": 53517567045.0, + "step": 12804 + }, + { + "epoch": 1.5216874628639334, + "grad_norm": 0.2568326986791163, + "learning_rate": 4.5687361387921405e-06, + "loss": 0.8199, + "num_tokens": 53521754699.0, + "step": 12805 + }, + { + "epoch": 1.5218062982768865, + "grad_norm": 0.23578690817501022, + "learning_rate": 4.567524663703094e-06, + "loss": 0.782, + "num_tokens": 53525944939.0, + "step": 12806 + }, + { + "epoch": 1.5219251336898396, + "grad_norm": 0.2495517403880243, + "learning_rate": 4.566313426830357e-06, + "loss": 0.7718, + "num_tokens": 53530135525.0, + "step": 12807 + }, + { + "epoch": 1.5220439691027927, + "grad_norm": 0.2612257618793938, + "learning_rate": 4.565102428218788e-06, + "loss": 0.8063, + "num_tokens": 53534325639.0, + "step": 12808 + }, + { + "epoch": 1.5221628045157458, + "grad_norm": 0.23659869599172903, + "learning_rate": 4.563891667913234e-06, + "loss": 0.8338, + "num_tokens": 53538515054.0, + "step": 12809 + }, + { + "epoch": 1.522281639928699, + "grad_norm": 0.25788520488464195, + "learning_rate": 4.56268114595853e-06, + "loss": 0.8307, + "num_tokens": 53542697276.0, + "step": 12810 + }, + { + "epoch": 1.5224004753416518, + "grad_norm": 0.24641932226125585, + "learning_rate": 4.561470862399509e-06, + "loss": 0.8291, + "num_tokens": 53546887333.0, + "step": 12811 + }, + { + "epoch": 1.522519310754605, + "grad_norm": 0.2481577002846788, + "learning_rate": 4.560260817280991e-06, + "loss": 0.7976, + "num_tokens": 53551074709.0, + "step": 12812 + }, + { + "epoch": 1.5226381461675578, + "grad_norm": 0.25780120432784276, + "learning_rate": 4.559051010647789e-06, + "loss": 0.799, + "num_tokens": 53555264818.0, + "step": 12813 + }, + { + "epoch": 1.5227569815805109, + "grad_norm": 0.26742418417165464, + "learning_rate": 4.557841442544705e-06, + "loss": 0.7931, + "num_tokens": 53559446630.0, + "step": 12814 + }, + { + "epoch": 1.522875816993464, + "grad_norm": 0.2598605153983773, + "learning_rate": 4.556632113016534e-06, + "loss": 0.8077, + "num_tokens": 53563630636.0, + "step": 12815 + }, + { + "epoch": 1.522994652406417, + "grad_norm": 0.2680279633018199, + "learning_rate": 4.5554230221080606e-06, + "loss": 0.8121, + "num_tokens": 53567819105.0, + "step": 12816 + }, + { + "epoch": 1.5231134878193702, + "grad_norm": 0.25561943589120656, + "learning_rate": 4.554214169864063e-06, + "loss": 0.826, + "num_tokens": 53572008854.0, + "step": 12817 + }, + { + "epoch": 1.5232323232323233, + "grad_norm": 0.25956447235127067, + "learning_rate": 4.553005556329309e-06, + "loss": 0.7921, + "num_tokens": 53576198813.0, + "step": 12818 + }, + { + "epoch": 1.5233511586452764, + "grad_norm": 0.2483108486965703, + "learning_rate": 4.551797181548556e-06, + "loss": 0.8342, + "num_tokens": 53580387984.0, + "step": 12819 + }, + { + "epoch": 1.5234699940582295, + "grad_norm": 0.24034142486573903, + "learning_rate": 4.550589045566559e-06, + "loss": 0.8443, + "num_tokens": 53584567043.0, + "step": 12820 + }, + { + "epoch": 1.5235888294711826, + "grad_norm": 0.2679044462089399, + "learning_rate": 4.549381148428052e-06, + "loss": 0.8054, + "num_tokens": 53588756829.0, + "step": 12821 + }, + { + "epoch": 1.5237076648841354, + "grad_norm": 0.2497988190268499, + "learning_rate": 4.548173490177769e-06, + "loss": 0.7753, + "num_tokens": 53592934973.0, + "step": 12822 + }, + { + "epoch": 1.5238265002970885, + "grad_norm": 0.2613512241821438, + "learning_rate": 4.546966070860438e-06, + "loss": 0.8433, + "num_tokens": 53597111985.0, + "step": 12823 + }, + { + "epoch": 1.5239453357100414, + "grad_norm": 0.24957015292348228, + "learning_rate": 4.545758890520772e-06, + "loss": 0.7925, + "num_tokens": 53601245194.0, + "step": 12824 + }, + { + "epoch": 1.5240641711229945, + "grad_norm": 0.26818721592970807, + "learning_rate": 4.54455194920348e-06, + "loss": 0.8481, + "num_tokens": 53605434076.0, + "step": 12825 + }, + { + "epoch": 1.5241830065359476, + "grad_norm": 0.2649330948140078, + "learning_rate": 4.543345246953251e-06, + "loss": 0.7935, + "num_tokens": 53609617459.0, + "step": 12826 + }, + { + "epoch": 1.5243018419489007, + "grad_norm": 0.2659060684650172, + "learning_rate": 4.5421387838147795e-06, + "loss": 0.7617, + "num_tokens": 53613808077.0, + "step": 12827 + }, + { + "epoch": 1.5244206773618538, + "grad_norm": 0.2694799175784376, + "learning_rate": 4.540932559832739e-06, + "loss": 0.7711, + "num_tokens": 53617986856.0, + "step": 12828 + }, + { + "epoch": 1.524539512774807, + "grad_norm": 0.24716948695344812, + "learning_rate": 4.539726575051809e-06, + "loss": 0.8486, + "num_tokens": 53622117115.0, + "step": 12829 + }, + { + "epoch": 1.52465834818776, + "grad_norm": 0.271814340982198, + "learning_rate": 4.538520829516649e-06, + "loss": 0.8295, + "num_tokens": 53626305190.0, + "step": 12830 + }, + { + "epoch": 1.5247771836007131, + "grad_norm": 0.24256778499072204, + "learning_rate": 4.537315323271906e-06, + "loss": 0.7932, + "num_tokens": 53630479038.0, + "step": 12831 + }, + { + "epoch": 1.5248960190136662, + "grad_norm": 0.24809485563593245, + "learning_rate": 4.53611005636223e-06, + "loss": 0.8347, + "num_tokens": 53634666545.0, + "step": 12832 + }, + { + "epoch": 1.525014854426619, + "grad_norm": 0.24332304141309108, + "learning_rate": 4.53490502883225e-06, + "loss": 0.8103, + "num_tokens": 53638857048.0, + "step": 12833 + }, + { + "epoch": 1.5251336898395722, + "grad_norm": 0.2573425639134458, + "learning_rate": 4.533700240726597e-06, + "loss": 0.8248, + "num_tokens": 53643045882.0, + "step": 12834 + }, + { + "epoch": 1.5252525252525253, + "grad_norm": 0.26422632270441043, + "learning_rate": 4.532495692089888e-06, + "loss": 0.8588, + "num_tokens": 53647236569.0, + "step": 12835 + }, + { + "epoch": 1.5253713606654782, + "grad_norm": 0.23654417027345256, + "learning_rate": 4.53129138296673e-06, + "loss": 0.8122, + "num_tokens": 53651425752.0, + "step": 12836 + }, + { + "epoch": 1.5254901960784313, + "grad_norm": 0.2740712340048423, + "learning_rate": 4.530087313401724e-06, + "loss": 0.8127, + "num_tokens": 53655614285.0, + "step": 12837 + }, + { + "epoch": 1.5256090314913844, + "grad_norm": 0.2571464179084357, + "learning_rate": 4.528883483439459e-06, + "loss": 0.7857, + "num_tokens": 53659803794.0, + "step": 12838 + }, + { + "epoch": 1.5257278669043375, + "grad_norm": 0.2718253020327646, + "learning_rate": 4.527679893124519e-06, + "loss": 0.7738, + "num_tokens": 53663994117.0, + "step": 12839 + }, + { + "epoch": 1.5258467023172906, + "grad_norm": 0.2444373712109254, + "learning_rate": 4.526476542501475e-06, + "loss": 0.8075, + "num_tokens": 53668184455.0, + "step": 12840 + }, + { + "epoch": 1.5259655377302437, + "grad_norm": 0.25370503172672865, + "learning_rate": 4.525273431614893e-06, + "loss": 0.7926, + "num_tokens": 53672326869.0, + "step": 12841 + }, + { + "epoch": 1.5260843731431968, + "grad_norm": 0.2478810345990927, + "learning_rate": 4.524070560509328e-06, + "loss": 0.7968, + "num_tokens": 53676483973.0, + "step": 12842 + }, + { + "epoch": 1.5262032085561499, + "grad_norm": 0.25716160352474704, + "learning_rate": 4.5228679292293235e-06, + "loss": 0.861, + "num_tokens": 53680673191.0, + "step": 12843 + }, + { + "epoch": 1.5263220439691028, + "grad_norm": 0.22856976387872477, + "learning_rate": 4.521665537819421e-06, + "loss": 0.8327, + "num_tokens": 53684843855.0, + "step": 12844 + }, + { + "epoch": 1.5264408793820559, + "grad_norm": 0.2594880920667969, + "learning_rate": 4.520463386324147e-06, + "loss": 0.8112, + "num_tokens": 53689034366.0, + "step": 12845 + }, + { + "epoch": 1.526559714795009, + "grad_norm": 0.25603446160096066, + "learning_rate": 4.519261474788025e-06, + "loss": 0.7707, + "num_tokens": 53693218148.0, + "step": 12846 + }, + { + "epoch": 1.5266785502079618, + "grad_norm": 0.24498784041237193, + "learning_rate": 4.518059803255556e-06, + "loss": 0.8252, + "num_tokens": 53697356262.0, + "step": 12847 + }, + { + "epoch": 1.526797385620915, + "grad_norm": 0.24749587284109123, + "learning_rate": 4.516858371771251e-06, + "loss": 0.7498, + "num_tokens": 53701546471.0, + "step": 12848 + }, + { + "epoch": 1.526916221033868, + "grad_norm": 0.24928862039689195, + "learning_rate": 4.515657180379601e-06, + "loss": 0.8197, + "num_tokens": 53705659628.0, + "step": 12849 + }, + { + "epoch": 1.5270350564468211, + "grad_norm": 0.2754916563847125, + "learning_rate": 4.514456229125091e-06, + "loss": 0.8088, + "num_tokens": 53709849262.0, + "step": 12850 + }, + { + "epoch": 1.5271538918597742, + "grad_norm": 0.2430335921781449, + "learning_rate": 4.513255518052196e-06, + "loss": 0.8387, + "num_tokens": 53714011783.0, + "step": 12851 + }, + { + "epoch": 1.5272727272727273, + "grad_norm": 0.28368283880948664, + "learning_rate": 4.512055047205378e-06, + "loss": 0.7947, + "num_tokens": 53718200223.0, + "step": 12852 + }, + { + "epoch": 1.5273915626856804, + "grad_norm": 0.26196496787454404, + "learning_rate": 4.5108548166291e-06, + "loss": 0.7923, + "num_tokens": 53722389791.0, + "step": 12853 + }, + { + "epoch": 1.5275103980986335, + "grad_norm": 0.26030019146135447, + "learning_rate": 4.509654826367804e-06, + "loss": 0.7862, + "num_tokens": 53726541110.0, + "step": 12854 + }, + { + "epoch": 1.5276292335115864, + "grad_norm": 0.24033560723496158, + "learning_rate": 4.508455076465938e-06, + "loss": 0.7881, + "num_tokens": 53730721464.0, + "step": 12855 + }, + { + "epoch": 1.5277480689245395, + "grad_norm": 0.2635434666618004, + "learning_rate": 4.507255566967931e-06, + "loss": 0.8395, + "num_tokens": 53734910083.0, + "step": 12856 + }, + { + "epoch": 1.5278669043374926, + "grad_norm": 0.24515174029348485, + "learning_rate": 4.506056297918201e-06, + "loss": 0.8047, + "num_tokens": 53739082220.0, + "step": 12857 + }, + { + "epoch": 1.5279857397504455, + "grad_norm": 0.2749797958071067, + "learning_rate": 4.504857269361161e-06, + "loss": 0.7945, + "num_tokens": 53743269894.0, + "step": 12858 + }, + { + "epoch": 1.5281045751633986, + "grad_norm": 0.24476389181574063, + "learning_rate": 4.503658481341219e-06, + "loss": 0.7676, + "num_tokens": 53747458539.0, + "step": 12859 + }, + { + "epoch": 1.5282234105763517, + "grad_norm": 0.27296412362083916, + "learning_rate": 4.502459933902764e-06, + "loss": 0.773, + "num_tokens": 53751647400.0, + "step": 12860 + }, + { + "epoch": 1.5283422459893048, + "grad_norm": 0.26636821091147633, + "learning_rate": 4.501261627090192e-06, + "loss": 0.8215, + "num_tokens": 53755836911.0, + "step": 12861 + }, + { + "epoch": 1.528461081402258, + "grad_norm": 0.23664692333598203, + "learning_rate": 4.500063560947874e-06, + "loss": 0.792, + "num_tokens": 53760023747.0, + "step": 12862 + }, + { + "epoch": 1.528579916815211, + "grad_norm": 0.2523071430908545, + "learning_rate": 4.498865735520177e-06, + "loss": 0.7971, + "num_tokens": 53764193438.0, + "step": 12863 + }, + { + "epoch": 1.528698752228164, + "grad_norm": 0.25503494544122535, + "learning_rate": 4.497668150851463e-06, + "loss": 0.8214, + "num_tokens": 53768383384.0, + "step": 12864 + }, + { + "epoch": 1.5288175876411172, + "grad_norm": 0.2454646829511877, + "learning_rate": 4.496470806986082e-06, + "loss": 0.8523, + "num_tokens": 53772571841.0, + "step": 12865 + }, + { + "epoch": 1.52893642305407, + "grad_norm": 0.26970011504941, + "learning_rate": 4.495273703968377e-06, + "loss": 0.8213, + "num_tokens": 53776759293.0, + "step": 12866 + }, + { + "epoch": 1.5290552584670232, + "grad_norm": 0.25513372836471593, + "learning_rate": 4.494076841842678e-06, + "loss": 0.7936, + "num_tokens": 53780948875.0, + "step": 12867 + }, + { + "epoch": 1.5291740938799763, + "grad_norm": 0.26985575448106713, + "learning_rate": 4.492880220653313e-06, + "loss": 0.795, + "num_tokens": 53785138152.0, + "step": 12868 + }, + { + "epoch": 1.5292929292929291, + "grad_norm": 0.2812534097124811, + "learning_rate": 4.491683840444593e-06, + "loss": 0.8048, + "num_tokens": 53789314193.0, + "step": 12869 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.2459788874218542, + "learning_rate": 4.490487701260825e-06, + "loss": 0.8136, + "num_tokens": 53793503361.0, + "step": 12870 + }, + { + "epoch": 1.5295306001188353, + "grad_norm": 0.2636128579447652, + "learning_rate": 4.4892918031463075e-06, + "loss": 0.7947, + "num_tokens": 53797691394.0, + "step": 12871 + }, + { + "epoch": 1.5296494355317884, + "grad_norm": 0.28149558178835016, + "learning_rate": 4.48809614614533e-06, + "loss": 0.7876, + "num_tokens": 53801881738.0, + "step": 12872 + }, + { + "epoch": 1.5297682709447415, + "grad_norm": 0.26103804373029754, + "learning_rate": 4.486900730302165e-06, + "loss": 0.8009, + "num_tokens": 53806070333.0, + "step": 12873 + }, + { + "epoch": 1.5298871063576946, + "grad_norm": 0.2486838524560966, + "learning_rate": 4.485705555661089e-06, + "loss": 0.7985, + "num_tokens": 53810257974.0, + "step": 12874 + }, + { + "epoch": 1.5300059417706477, + "grad_norm": 0.2765630979000118, + "learning_rate": 4.484510622266361e-06, + "loss": 0.8141, + "num_tokens": 53814412534.0, + "step": 12875 + }, + { + "epoch": 1.5301247771836008, + "grad_norm": 0.2752967517754351, + "learning_rate": 4.483315930162236e-06, + "loss": 0.778, + "num_tokens": 53818603288.0, + "step": 12876 + }, + { + "epoch": 1.5302436125965537, + "grad_norm": 0.2620844394685986, + "learning_rate": 4.4821214793929576e-06, + "loss": 0.8162, + "num_tokens": 53822793791.0, + "step": 12877 + }, + { + "epoch": 1.5303624480095068, + "grad_norm": 0.24319237232966123, + "learning_rate": 4.480927270002755e-06, + "loss": 0.8177, + "num_tokens": 53826982695.0, + "step": 12878 + }, + { + "epoch": 1.53048128342246, + "grad_norm": 0.2597708026849822, + "learning_rate": 4.4797333020358544e-06, + "loss": 0.7949, + "num_tokens": 53831146327.0, + "step": 12879 + }, + { + "epoch": 1.5306001188354128, + "grad_norm": 0.2513505574459998, + "learning_rate": 4.478539575536479e-06, + "loss": 0.7946, + "num_tokens": 53835304179.0, + "step": 12880 + }, + { + "epoch": 1.530718954248366, + "grad_norm": 0.24290350847748948, + "learning_rate": 4.477346090548831e-06, + "loss": 0.8102, + "num_tokens": 53839493175.0, + "step": 12881 + }, + { + "epoch": 1.530837789661319, + "grad_norm": 0.23536735744836132, + "learning_rate": 4.476152847117114e-06, + "loss": 0.8387, + "num_tokens": 53843672456.0, + "step": 12882 + }, + { + "epoch": 1.530956625074272, + "grad_norm": 0.25044922822074905, + "learning_rate": 4.474959845285514e-06, + "loss": 0.7981, + "num_tokens": 53847862026.0, + "step": 12883 + }, + { + "epoch": 1.5310754604872252, + "grad_norm": 0.263392840385503, + "learning_rate": 4.4737670850982094e-06, + "loss": 0.8429, + "num_tokens": 53852036762.0, + "step": 12884 + }, + { + "epoch": 1.5311942959001783, + "grad_norm": 0.25597340477154273, + "learning_rate": 4.4725745665993745e-06, + "loss": 0.7977, + "num_tokens": 53856216332.0, + "step": 12885 + }, + { + "epoch": 1.5313131313131314, + "grad_norm": 0.260193419236508, + "learning_rate": 4.471382289833176e-06, + "loss": 0.7974, + "num_tokens": 53860386591.0, + "step": 12886 + }, + { + "epoch": 1.5314319667260845, + "grad_norm": 0.24776152033969678, + "learning_rate": 4.470190254843767e-06, + "loss": 0.7705, + "num_tokens": 53864576005.0, + "step": 12887 + }, + { + "epoch": 1.5315508021390374, + "grad_norm": 0.26961860868288795, + "learning_rate": 4.468998461675289e-06, + "loss": 0.8302, + "num_tokens": 53868734941.0, + "step": 12888 + }, + { + "epoch": 1.5316696375519905, + "grad_norm": 0.23944590441764357, + "learning_rate": 4.467806910371879e-06, + "loss": 0.8087, + "num_tokens": 53872925569.0, + "step": 12889 + }, + { + "epoch": 1.5317884729649436, + "grad_norm": 0.2552284769519626, + "learning_rate": 4.466615600977665e-06, + "loss": 0.8218, + "num_tokens": 53877093060.0, + "step": 12890 + }, + { + "epoch": 1.5319073083778965, + "grad_norm": 0.2632340698852917, + "learning_rate": 4.465424533536766e-06, + "loss": 0.8146, + "num_tokens": 53881271896.0, + "step": 12891 + }, + { + "epoch": 1.5320261437908496, + "grad_norm": 0.25161874880568336, + "learning_rate": 4.46423370809329e-06, + "loss": 0.809, + "num_tokens": 53885423415.0, + "step": 12892 + }, + { + "epoch": 1.5321449792038027, + "grad_norm": 0.2626086038534958, + "learning_rate": 4.463043124691337e-06, + "loss": 0.7847, + "num_tokens": 53889612817.0, + "step": 12893 + }, + { + "epoch": 1.5322638146167558, + "grad_norm": 0.24562604559890702, + "learning_rate": 4.461852783375e-06, + "loss": 0.7936, + "num_tokens": 53893784774.0, + "step": 12894 + }, + { + "epoch": 1.5323826500297089, + "grad_norm": 0.243279833085117, + "learning_rate": 4.460662684188359e-06, + "loss": 0.8204, + "num_tokens": 53897919392.0, + "step": 12895 + }, + { + "epoch": 1.532501485442662, + "grad_norm": 0.25138239681148616, + "learning_rate": 4.459472827175491e-06, + "loss": 0.8179, + "num_tokens": 53902109053.0, + "step": 12896 + }, + { + "epoch": 1.532620320855615, + "grad_norm": 0.2385157939626512, + "learning_rate": 4.4582832123804565e-06, + "loss": 0.7981, + "num_tokens": 53906298637.0, + "step": 12897 + }, + { + "epoch": 1.5327391562685682, + "grad_norm": 0.2517745079595138, + "learning_rate": 4.457093839847314e-06, + "loss": 0.8048, + "num_tokens": 53910487761.0, + "step": 12898 + }, + { + "epoch": 1.5328579916815213, + "grad_norm": 0.2554687898186576, + "learning_rate": 4.455904709620108e-06, + "loss": 0.8096, + "num_tokens": 53914638916.0, + "step": 12899 + }, + { + "epoch": 1.5329768270944741, + "grad_norm": 0.24615161725427337, + "learning_rate": 4.454715821742876e-06, + "loss": 0.7895, + "num_tokens": 53918828508.0, + "step": 12900 + }, + { + "epoch": 1.5330956625074272, + "grad_norm": 0.2534063439344695, + "learning_rate": 4.453527176259649e-06, + "loss": 0.8311, + "num_tokens": 53923017546.0, + "step": 12901 + }, + { + "epoch": 1.5332144979203801, + "grad_norm": 0.25157405522955434, + "learning_rate": 4.452338773214443e-06, + "loss": 0.8214, + "num_tokens": 53927205295.0, + "step": 12902 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.22972520715366512, + "learning_rate": 4.451150612651274e-06, + "loss": 0.8059, + "num_tokens": 53931393599.0, + "step": 12903 + }, + { + "epoch": 1.5334521687462863, + "grad_norm": 0.24883694169712484, + "learning_rate": 4.449962694614135e-06, + "loss": 0.8259, + "num_tokens": 53935575434.0, + "step": 12904 + }, + { + "epoch": 1.5335710041592394, + "grad_norm": 0.2504305779386469, + "learning_rate": 4.448775019147026e-06, + "loss": 0.8213, + "num_tokens": 53939733599.0, + "step": 12905 + }, + { + "epoch": 1.5336898395721925, + "grad_norm": 0.3391940783345464, + "learning_rate": 4.4475875862939264e-06, + "loss": 0.7954, + "num_tokens": 53943923968.0, + "step": 12906 + }, + { + "epoch": 1.5338086749851456, + "grad_norm": 0.25072711480738, + "learning_rate": 4.446400396098815e-06, + "loss": 0.8096, + "num_tokens": 53948112712.0, + "step": 12907 + }, + { + "epoch": 1.5339275103980987, + "grad_norm": 0.23636224271081147, + "learning_rate": 4.445213448605657e-06, + "loss": 0.7955, + "num_tokens": 53952301825.0, + "step": 12908 + }, + { + "epoch": 1.5340463458110518, + "grad_norm": 0.2364266963644264, + "learning_rate": 4.444026743858405e-06, + "loss": 0.7978, + "num_tokens": 53956485602.0, + "step": 12909 + }, + { + "epoch": 1.534165181224005, + "grad_norm": 0.24425759389151236, + "learning_rate": 4.44284028190101e-06, + "loss": 0.8326, + "num_tokens": 53960663145.0, + "step": 12910 + }, + { + "epoch": 1.5342840166369578, + "grad_norm": 0.2286608875949012, + "learning_rate": 4.441654062777405e-06, + "loss": 0.8308, + "num_tokens": 53964849370.0, + "step": 12911 + }, + { + "epoch": 1.534402852049911, + "grad_norm": 0.2569323156527343, + "learning_rate": 4.440468086531527e-06, + "loss": 0.7978, + "num_tokens": 53969037374.0, + "step": 12912 + }, + { + "epoch": 1.5345216874628638, + "grad_norm": 0.24230003076902598, + "learning_rate": 4.4392823532072984e-06, + "loss": 0.8017, + "num_tokens": 53973227098.0, + "step": 12913 + }, + { + "epoch": 1.5346405228758169, + "grad_norm": 0.24072066692656116, + "learning_rate": 4.4380968628486234e-06, + "loss": 0.7986, + "num_tokens": 53977374164.0, + "step": 12914 + }, + { + "epoch": 1.53475935828877, + "grad_norm": 0.253009972998346, + "learning_rate": 4.436911615499407e-06, + "loss": 0.8212, + "num_tokens": 53981563762.0, + "step": 12915 + }, + { + "epoch": 1.534878193701723, + "grad_norm": 0.24525489161110164, + "learning_rate": 4.435726611203544e-06, + "loss": 0.8225, + "num_tokens": 53985753030.0, + "step": 12916 + }, + { + "epoch": 1.5349970291146762, + "grad_norm": 0.238888429057953, + "learning_rate": 4.434541850004917e-06, + "loss": 0.8007, + "num_tokens": 53989912936.0, + "step": 12917 + }, + { + "epoch": 1.5351158645276293, + "grad_norm": 0.2812386534401895, + "learning_rate": 4.433357331947409e-06, + "loss": 0.8253, + "num_tokens": 53994079507.0, + "step": 12918 + }, + { + "epoch": 1.5352346999405824, + "grad_norm": 0.26427132102264606, + "learning_rate": 4.432173057074877e-06, + "loss": 0.8316, + "num_tokens": 53998267956.0, + "step": 12919 + }, + { + "epoch": 1.5353535353535355, + "grad_norm": 0.269166030272129, + "learning_rate": 4.430989025431185e-06, + "loss": 0.7788, + "num_tokens": 54002457165.0, + "step": 12920 + }, + { + "epoch": 1.5354723707664886, + "grad_norm": 0.2708142286835187, + "learning_rate": 4.429805237060179e-06, + "loss": 0.8105, + "num_tokens": 54006647245.0, + "step": 12921 + }, + { + "epoch": 1.5355912061794414, + "grad_norm": 0.2711455353926096, + "learning_rate": 4.4286216920056994e-06, + "loss": 0.7846, + "num_tokens": 54010834635.0, + "step": 12922 + }, + { + "epoch": 1.5357100415923945, + "grad_norm": 0.2774507294690374, + "learning_rate": 4.427438390311577e-06, + "loss": 0.8257, + "num_tokens": 54014992838.0, + "step": 12923 + }, + { + "epoch": 1.5358288770053476, + "grad_norm": 0.25831168487425, + "learning_rate": 4.426255332021634e-06, + "loss": 0.7596, + "num_tokens": 54019155700.0, + "step": 12924 + }, + { + "epoch": 1.5359477124183005, + "grad_norm": 0.24633482195288914, + "learning_rate": 4.425072517179681e-06, + "loss": 0.8049, + "num_tokens": 54023343986.0, + "step": 12925 + }, + { + "epoch": 1.5360665478312536, + "grad_norm": 0.24994297028702542, + "learning_rate": 4.4238899458295235e-06, + "loss": 0.8099, + "num_tokens": 54027533213.0, + "step": 12926 + }, + { + "epoch": 1.5361853832442067, + "grad_norm": 0.2807000819931769, + "learning_rate": 4.4227076180149556e-06, + "loss": 0.7912, + "num_tokens": 54031702288.0, + "step": 12927 + }, + { + "epoch": 1.5363042186571598, + "grad_norm": 0.23065482058244663, + "learning_rate": 4.421525533779763e-06, + "loss": 0.791, + "num_tokens": 54035889744.0, + "step": 12928 + }, + { + "epoch": 1.536423054070113, + "grad_norm": 0.2725900343307088, + "learning_rate": 4.4203436931677225e-06, + "loss": 0.8018, + "num_tokens": 54040047899.0, + "step": 12929 + }, + { + "epoch": 1.536541889483066, + "grad_norm": 0.26370008863320105, + "learning_rate": 4.4191620962226025e-06, + "loss": 0.8321, + "num_tokens": 54044219808.0, + "step": 12930 + }, + { + "epoch": 1.5366607248960191, + "grad_norm": 0.29228154502198006, + "learning_rate": 4.4179807429881585e-06, + "loss": 0.796, + "num_tokens": 54048383435.0, + "step": 12931 + }, + { + "epoch": 1.5367795603089722, + "grad_norm": 0.244309212448123, + "learning_rate": 4.416799633508143e-06, + "loss": 0.8302, + "num_tokens": 54052550298.0, + "step": 12932 + }, + { + "epoch": 1.536898395721925, + "grad_norm": 0.2917185443215893, + "learning_rate": 4.415618767826296e-06, + "loss": 0.8177, + "num_tokens": 54056712196.0, + "step": 12933 + }, + { + "epoch": 1.5370172311348782, + "grad_norm": 0.27899712986515435, + "learning_rate": 4.414438145986347e-06, + "loss": 0.8354, + "num_tokens": 54060869043.0, + "step": 12934 + }, + { + "epoch": 1.5371360665478313, + "grad_norm": 0.29792882208785065, + "learning_rate": 4.413257768032022e-06, + "loss": 0.7594, + "num_tokens": 54065058451.0, + "step": 12935 + }, + { + "epoch": 1.5372549019607842, + "grad_norm": 0.267668686319555, + "learning_rate": 4.412077634007028e-06, + "loss": 0.7744, + "num_tokens": 54069247204.0, + "step": 12936 + }, + { + "epoch": 1.5373737373737373, + "grad_norm": 0.31205572399096715, + "learning_rate": 4.410897743955077e-06, + "loss": 0.8039, + "num_tokens": 54073436864.0, + "step": 12937 + }, + { + "epoch": 1.5374925727866904, + "grad_norm": 0.2629657968520926, + "learning_rate": 4.409718097919859e-06, + "loss": 0.7852, + "num_tokens": 54077627174.0, + "step": 12938 + }, + { + "epoch": 1.5376114081996435, + "grad_norm": 0.30623215386666364, + "learning_rate": 4.408538695945062e-06, + "loss": 0.8378, + "num_tokens": 54081816305.0, + "step": 12939 + }, + { + "epoch": 1.5377302436125966, + "grad_norm": 0.25540442479713893, + "learning_rate": 4.407359538074367e-06, + "loss": 0.8523, + "num_tokens": 54085976749.0, + "step": 12940 + }, + { + "epoch": 1.5378490790255497, + "grad_norm": 0.28897719784869025, + "learning_rate": 4.406180624351437e-06, + "loss": 0.8167, + "num_tokens": 54090163083.0, + "step": 12941 + }, + { + "epoch": 1.5379679144385028, + "grad_norm": 0.2574017169225524, + "learning_rate": 4.405001954819931e-06, + "loss": 0.8266, + "num_tokens": 54094352289.0, + "step": 12942 + }, + { + "epoch": 1.5380867498514559, + "grad_norm": 0.28876198718922497, + "learning_rate": 4.403823529523499e-06, + "loss": 0.8103, + "num_tokens": 54098526624.0, + "step": 12943 + }, + { + "epoch": 1.5382055852644088, + "grad_norm": 0.2576206118474012, + "learning_rate": 4.4026453485057885e-06, + "loss": 0.7996, + "num_tokens": 54102704033.0, + "step": 12944 + }, + { + "epoch": 1.5383244206773619, + "grad_norm": 0.29089152339546404, + "learning_rate": 4.401467411810425e-06, + "loss": 0.8011, + "num_tokens": 54106871494.0, + "step": 12945 + }, + { + "epoch": 1.538443256090315, + "grad_norm": 0.2722845241863692, + "learning_rate": 4.400289719481033e-06, + "loss": 0.834, + "num_tokens": 54111060954.0, + "step": 12946 + }, + { + "epoch": 1.5385620915032678, + "grad_norm": 0.2785059210441821, + "learning_rate": 4.399112271561226e-06, + "loss": 0.7974, + "num_tokens": 54115250146.0, + "step": 12947 + }, + { + "epoch": 1.538680926916221, + "grad_norm": 0.26154223826666156, + "learning_rate": 4.3979350680946115e-06, + "loss": 0.794, + "num_tokens": 54119414993.0, + "step": 12948 + }, + { + "epoch": 1.538799762329174, + "grad_norm": 0.2574657230784257, + "learning_rate": 4.396758109124783e-06, + "loss": 0.8429, + "num_tokens": 54123588430.0, + "step": 12949 + }, + { + "epoch": 1.5389185977421271, + "grad_norm": 0.2691713794954917, + "learning_rate": 4.395581394695326e-06, + "loss": 0.8121, + "num_tokens": 54127777525.0, + "step": 12950 + }, + { + "epoch": 1.5390374331550802, + "grad_norm": 0.2415096173831976, + "learning_rate": 4.3944049248498225e-06, + "loss": 0.7727, + "num_tokens": 54131966339.0, + "step": 12951 + }, + { + "epoch": 1.5391562685680333, + "grad_norm": 0.25904471592940703, + "learning_rate": 4.393228699631837e-06, + "loss": 0.7924, + "num_tokens": 54136131100.0, + "step": 12952 + }, + { + "epoch": 1.5392751039809864, + "grad_norm": 0.24543700586674214, + "learning_rate": 4.392052719084931e-06, + "loss": 0.8314, + "num_tokens": 54140293060.0, + "step": 12953 + }, + { + "epoch": 1.5393939393939395, + "grad_norm": 0.26101549244842703, + "learning_rate": 4.390876983252655e-06, + "loss": 0.7935, + "num_tokens": 54144452627.0, + "step": 12954 + }, + { + "epoch": 1.5395127748068924, + "grad_norm": 0.242415882133712, + "learning_rate": 4.389701492178551e-06, + "loss": 0.8105, + "num_tokens": 54148609850.0, + "step": 12955 + }, + { + "epoch": 1.5396316102198455, + "grad_norm": 0.257945026839558, + "learning_rate": 4.38852624590615e-06, + "loss": 0.7876, + "num_tokens": 54152800166.0, + "step": 12956 + }, + { + "epoch": 1.5397504456327986, + "grad_norm": 0.2564408259013383, + "learning_rate": 4.387351244478977e-06, + "loss": 0.8336, + "num_tokens": 54156989072.0, + "step": 12957 + }, + { + "epoch": 1.5398692810457515, + "grad_norm": 0.2474034603945072, + "learning_rate": 4.386176487940544e-06, + "loss": 0.8111, + "num_tokens": 54161134307.0, + "step": 12958 + }, + { + "epoch": 1.5399881164587046, + "grad_norm": 0.2524190881449987, + "learning_rate": 4.385001976334358e-06, + "loss": 0.805, + "num_tokens": 54165323378.0, + "step": 12959 + }, + { + "epoch": 1.5401069518716577, + "grad_norm": 0.2528198564129156, + "learning_rate": 4.383827709703915e-06, + "loss": 0.7833, + "num_tokens": 54169512919.0, + "step": 12960 + }, + { + "epoch": 1.5402257872846108, + "grad_norm": 0.24640112574553386, + "learning_rate": 4.3826536880927045e-06, + "loss": 0.8001, + "num_tokens": 54173681678.0, + "step": 12961 + }, + { + "epoch": 1.540344622697564, + "grad_norm": 0.2682489222674203, + "learning_rate": 4.381479911544196e-06, + "loss": 0.8168, + "num_tokens": 54177870911.0, + "step": 12962 + }, + { + "epoch": 1.540463458110517, + "grad_norm": 0.2491147088542546, + "learning_rate": 4.380306380101866e-06, + "loss": 0.7819, + "num_tokens": 54182061427.0, + "step": 12963 + }, + { + "epoch": 1.54058229352347, + "grad_norm": 0.26174149568698024, + "learning_rate": 4.379133093809173e-06, + "loss": 0.795, + "num_tokens": 54186251429.0, + "step": 12964 + }, + { + "epoch": 1.5407011289364232, + "grad_norm": 0.2479406066574862, + "learning_rate": 4.377960052709568e-06, + "loss": 0.7928, + "num_tokens": 54190408134.0, + "step": 12965 + }, + { + "epoch": 1.540819964349376, + "grad_norm": 0.27672694707035944, + "learning_rate": 4.376787256846493e-06, + "loss": 0.7827, + "num_tokens": 54194596548.0, + "step": 12966 + }, + { + "epoch": 1.5409387997623292, + "grad_norm": 0.2625471098682986, + "learning_rate": 4.375614706263379e-06, + "loss": 0.8515, + "num_tokens": 54198753393.0, + "step": 12967 + }, + { + "epoch": 1.5410576351752823, + "grad_norm": 0.27008546549055057, + "learning_rate": 4.374442401003646e-06, + "loss": 0.7961, + "num_tokens": 54202940975.0, + "step": 12968 + }, + { + "epoch": 1.5411764705882351, + "grad_norm": 0.2694753999931797, + "learning_rate": 4.373270341110716e-06, + "loss": 0.8234, + "num_tokens": 54207121986.0, + "step": 12969 + }, + { + "epoch": 1.5412953060011882, + "grad_norm": 0.2503856646113278, + "learning_rate": 4.3720985266279885e-06, + "loss": 0.805, + "num_tokens": 54211312083.0, + "step": 12970 + }, + { + "epoch": 1.5414141414141413, + "grad_norm": 0.2531403793079951, + "learning_rate": 4.370926957598867e-06, + "loss": 0.8272, + "num_tokens": 54215500413.0, + "step": 12971 + }, + { + "epoch": 1.5415329768270944, + "grad_norm": 0.2327063703179585, + "learning_rate": 4.36975563406673e-06, + "loss": 0.8003, + "num_tokens": 54219689189.0, + "step": 12972 + }, + { + "epoch": 1.5416518122400475, + "grad_norm": 0.25227447919872686, + "learning_rate": 4.368584556074959e-06, + "loss": 0.7919, + "num_tokens": 54223878697.0, + "step": 12973 + }, + { + "epoch": 1.5417706476530006, + "grad_norm": 0.25762728613730573, + "learning_rate": 4.367413723666921e-06, + "loss": 0.8291, + "num_tokens": 54228069387.0, + "step": 12974 + }, + { + "epoch": 1.5418894830659537, + "grad_norm": 0.25680228789924286, + "learning_rate": 4.366243136885982e-06, + "loss": 0.7923, + "num_tokens": 54232258247.0, + "step": 12975 + }, + { + "epoch": 1.5420083184789068, + "grad_norm": 0.2658376084940966, + "learning_rate": 4.3650727957754905e-06, + "loss": 0.8169, + "num_tokens": 54236426369.0, + "step": 12976 + }, + { + "epoch": 1.5421271538918597, + "grad_norm": 0.2508514342996123, + "learning_rate": 4.363902700378783e-06, + "loss": 0.8304, + "num_tokens": 54240616922.0, + "step": 12977 + }, + { + "epoch": 1.5422459893048128, + "grad_norm": 0.23801704262560985, + "learning_rate": 4.3627328507391965e-06, + "loss": 0.7839, + "num_tokens": 54244806309.0, + "step": 12978 + }, + { + "epoch": 1.542364824717766, + "grad_norm": 0.25175648824894986, + "learning_rate": 4.3615632469000536e-06, + "loss": 0.7991, + "num_tokens": 54248997103.0, + "step": 12979 + }, + { + "epoch": 1.5424836601307188, + "grad_norm": 0.24436584941886733, + "learning_rate": 4.360393888904669e-06, + "loss": 0.7739, + "num_tokens": 54253186426.0, + "step": 12980 + }, + { + "epoch": 1.542602495543672, + "grad_norm": 0.2504329539017049, + "learning_rate": 4.359224776796348e-06, + "loss": 0.824, + "num_tokens": 54257375021.0, + "step": 12981 + }, + { + "epoch": 1.542721330956625, + "grad_norm": 0.24323650016746573, + "learning_rate": 4.358055910618386e-06, + "loss": 0.7941, + "num_tokens": 54261564249.0, + "step": 12982 + }, + { + "epoch": 1.542840166369578, + "grad_norm": 0.232315521588761, + "learning_rate": 4.35688729041407e-06, + "loss": 0.7595, + "num_tokens": 54265725224.0, + "step": 12983 + }, + { + "epoch": 1.5429590017825312, + "grad_norm": 0.24161361215107827, + "learning_rate": 4.35571891622668e-06, + "loss": 0.8126, + "num_tokens": 54269914733.0, + "step": 12984 + }, + { + "epoch": 1.5430778371954843, + "grad_norm": 0.26388771630502167, + "learning_rate": 4.354550788099483e-06, + "loss": 0.7871, + "num_tokens": 54274093977.0, + "step": 12985 + }, + { + "epoch": 1.5431966726084374, + "grad_norm": 0.2603817525419137, + "learning_rate": 4.3533829060757384e-06, + "loss": 0.841, + "num_tokens": 54278242402.0, + "step": 12986 + }, + { + "epoch": 1.5433155080213905, + "grad_norm": 0.24632761432957573, + "learning_rate": 4.352215270198697e-06, + "loss": 0.8305, + "num_tokens": 54282429732.0, + "step": 12987 + }, + { + "epoch": 1.5434343434343434, + "grad_norm": 0.23883088502435856, + "learning_rate": 4.3510478805116015e-06, + "loss": 0.7868, + "num_tokens": 54286618338.0, + "step": 12988 + }, + { + "epoch": 1.5435531788472965, + "grad_norm": 0.23867229828044897, + "learning_rate": 4.349880737057684e-06, + "loss": 0.8347, + "num_tokens": 54290800387.0, + "step": 12989 + }, + { + "epoch": 1.5436720142602496, + "grad_norm": 0.2503887821868121, + "learning_rate": 4.3487138398801675e-06, + "loss": 0.8173, + "num_tokens": 54294963483.0, + "step": 12990 + }, + { + "epoch": 1.5437908496732025, + "grad_norm": 0.23881682980807262, + "learning_rate": 4.347547189022265e-06, + "loss": 0.7702, + "num_tokens": 54299152874.0, + "step": 12991 + }, + { + "epoch": 1.5439096850861556, + "grad_norm": 0.24455476608651983, + "learning_rate": 4.346380784527187e-06, + "loss": 0.8196, + "num_tokens": 54303330731.0, + "step": 12992 + }, + { + "epoch": 1.5440285204991087, + "grad_norm": 0.24676505274928412, + "learning_rate": 4.345214626438121e-06, + "loss": 0.8186, + "num_tokens": 54307498521.0, + "step": 12993 + }, + { + "epoch": 1.5441473559120618, + "grad_norm": 0.25376334699413367, + "learning_rate": 4.3440487147982554e-06, + "loss": 0.8034, + "num_tokens": 54311688315.0, + "step": 12994 + }, + { + "epoch": 1.5442661913250149, + "grad_norm": 0.2336816282822078, + "learning_rate": 4.342883049650773e-06, + "loss": 0.7965, + "num_tokens": 54315860383.0, + "step": 12995 + }, + { + "epoch": 1.544385026737968, + "grad_norm": 0.2732442713149117, + "learning_rate": 4.341717631038839e-06, + "loss": 0.8024, + "num_tokens": 54320004409.0, + "step": 12996 + }, + { + "epoch": 1.544503862150921, + "grad_norm": 0.2762986034117359, + "learning_rate": 4.340552459005617e-06, + "loss": 0.8041, + "num_tokens": 54324190280.0, + "step": 12997 + }, + { + "epoch": 1.5446226975638742, + "grad_norm": 0.2472220466675758, + "learning_rate": 4.33938753359425e-06, + "loss": 0.8302, + "num_tokens": 54328378758.0, + "step": 12998 + }, + { + "epoch": 1.5447415329768273, + "grad_norm": 0.2543232670981161, + "learning_rate": 4.338222854847884e-06, + "loss": 0.8132, + "num_tokens": 54332541194.0, + "step": 12999 + }, + { + "epoch": 1.5448603683897801, + "grad_norm": 0.262029334075162, + "learning_rate": 4.337058422809645e-06, + "loss": 0.8181, + "num_tokens": 54336731222.0, + "step": 13000 + }, + { + "epoch": 1.5449792038027332, + "grad_norm": 0.24967723545523773, + "learning_rate": 4.335894237522663e-06, + "loss": 0.8476, + "num_tokens": 54340883409.0, + "step": 13001 + }, + { + "epoch": 1.5450980392156861, + "grad_norm": 0.249609569176665, + "learning_rate": 4.334730299030054e-06, + "loss": 0.8421, + "num_tokens": 54345071942.0, + "step": 13002 + }, + { + "epoch": 1.5452168746286392, + "grad_norm": 0.23749676723683708, + "learning_rate": 4.333566607374914e-06, + "loss": 0.7843, + "num_tokens": 54349231071.0, + "step": 13003 + }, + { + "epoch": 1.5453357100415923, + "grad_norm": 0.2857760611923535, + "learning_rate": 4.332403162600343e-06, + "loss": 0.8078, + "num_tokens": 54353382104.0, + "step": 13004 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 0.23884552672821427, + "learning_rate": 4.331239964749426e-06, + "loss": 0.7968, + "num_tokens": 54357543161.0, + "step": 13005 + }, + { + "epoch": 1.5455733808674985, + "grad_norm": 0.2567354204032166, + "learning_rate": 4.330077013865238e-06, + "loss": 0.8101, + "num_tokens": 54361732732.0, + "step": 13006 + }, + { + "epoch": 1.5456922162804516, + "grad_norm": 0.2296658533639967, + "learning_rate": 4.328914309990855e-06, + "loss": 0.7867, + "num_tokens": 54365892932.0, + "step": 13007 + }, + { + "epoch": 1.5458110516934047, + "grad_norm": 0.25205114688913244, + "learning_rate": 4.327751853169329e-06, + "loss": 0.8249, + "num_tokens": 54370059685.0, + "step": 13008 + }, + { + "epoch": 1.5459298871063578, + "grad_norm": 0.2632412473777521, + "learning_rate": 4.32658964344371e-06, + "loss": 0.8177, + "num_tokens": 54374225052.0, + "step": 13009 + }, + { + "epoch": 1.546048722519311, + "grad_norm": 0.24560473006839287, + "learning_rate": 4.3254276808570395e-06, + "loss": 0.8252, + "num_tokens": 54378391701.0, + "step": 13010 + }, + { + "epoch": 1.5461675579322638, + "grad_norm": 0.25914422352136923, + "learning_rate": 4.324265965452351e-06, + "loss": 0.804, + "num_tokens": 54382561067.0, + "step": 13011 + }, + { + "epoch": 1.546286393345217, + "grad_norm": 0.23690065347780576, + "learning_rate": 4.323104497272663e-06, + "loss": 0.8221, + "num_tokens": 54386740709.0, + "step": 13012 + }, + { + "epoch": 1.5464052287581698, + "grad_norm": 0.255999785856511, + "learning_rate": 4.321943276360992e-06, + "loss": 0.7864, + "num_tokens": 54390926496.0, + "step": 13013 + }, + { + "epoch": 1.5465240641711229, + "grad_norm": 0.23072755860855484, + "learning_rate": 4.3207823027603406e-06, + "loss": 0.8004, + "num_tokens": 54395115037.0, + "step": 13014 + }, + { + "epoch": 1.546642899584076, + "grad_norm": 0.23677085821965024, + "learning_rate": 4.319621576513701e-06, + "loss": 0.8129, + "num_tokens": 54399291231.0, + "step": 13015 + }, + { + "epoch": 1.546761734997029, + "grad_norm": 0.24766822562998184, + "learning_rate": 4.318461097664064e-06, + "loss": 0.8154, + "num_tokens": 54403481761.0, + "step": 13016 + }, + { + "epoch": 1.5468805704099822, + "grad_norm": 0.26194094911373794, + "learning_rate": 4.317300866254403e-06, + "loss": 0.8135, + "num_tokens": 54407671841.0, + "step": 13017 + }, + { + "epoch": 1.5469994058229353, + "grad_norm": 0.24864220005898743, + "learning_rate": 4.316140882327688e-06, + "loss": 0.8524, + "num_tokens": 54411861727.0, + "step": 13018 + }, + { + "epoch": 1.5471182412358884, + "grad_norm": 0.2652089644062138, + "learning_rate": 4.314981145926869e-06, + "loss": 0.7852, + "num_tokens": 54416051670.0, + "step": 13019 + }, + { + "epoch": 1.5472370766488415, + "grad_norm": 0.23480489603039179, + "learning_rate": 4.313821657094904e-06, + "loss": 0.8221, + "num_tokens": 54420242403.0, + "step": 13020 + }, + { + "epoch": 1.5473559120617946, + "grad_norm": 0.2621312724958293, + "learning_rate": 4.31266241587473e-06, + "loss": 0.8354, + "num_tokens": 54424431678.0, + "step": 13021 + }, + { + "epoch": 1.5474747474747474, + "grad_norm": 0.23433447264040833, + "learning_rate": 4.3115034223092755e-06, + "loss": 0.7571, + "num_tokens": 54428620544.0, + "step": 13022 + }, + { + "epoch": 1.5475935828877005, + "grad_norm": 0.2650831010145043, + "learning_rate": 4.310344676441468e-06, + "loss": 0.7951, + "num_tokens": 54432810220.0, + "step": 13023 + }, + { + "epoch": 1.5477124183006536, + "grad_norm": 0.250618028631899, + "learning_rate": 4.309186178314212e-06, + "loss": 0.8058, + "num_tokens": 54436998596.0, + "step": 13024 + }, + { + "epoch": 1.5478312537136065, + "grad_norm": 0.27631307169759983, + "learning_rate": 4.30802792797041e-06, + "loss": 0.8486, + "num_tokens": 54441172333.0, + "step": 13025 + }, + { + "epoch": 1.5479500891265596, + "grad_norm": 0.2633239911015751, + "learning_rate": 4.306869925452965e-06, + "loss": 0.7956, + "num_tokens": 54445331567.0, + "step": 13026 + }, + { + "epoch": 1.5480689245395127, + "grad_norm": 0.23106700110929101, + "learning_rate": 4.305712170804756e-06, + "loss": 0.808, + "num_tokens": 54449483834.0, + "step": 13027 + }, + { + "epoch": 1.5481877599524658, + "grad_norm": 0.2664849335948986, + "learning_rate": 4.3045546640686615e-06, + "loss": 0.8101, + "num_tokens": 54453672068.0, + "step": 13028 + }, + { + "epoch": 1.548306595365419, + "grad_norm": 0.24717481566266222, + "learning_rate": 4.303397405287543e-06, + "loss": 0.8235, + "num_tokens": 54457855008.0, + "step": 13029 + }, + { + "epoch": 1.548425430778372, + "grad_norm": 0.2533334565442263, + "learning_rate": 4.30224039450426e-06, + "loss": 0.8181, + "num_tokens": 54461998110.0, + "step": 13030 + }, + { + "epoch": 1.5485442661913251, + "grad_norm": 0.23031145277023257, + "learning_rate": 4.301083631761662e-06, + "loss": 0.7632, + "num_tokens": 54466186387.0, + "step": 13031 + }, + { + "epoch": 1.5486631016042782, + "grad_norm": 0.2350058648011102, + "learning_rate": 4.2999271171025834e-06, + "loss": 0.8286, + "num_tokens": 54470362596.0, + "step": 13032 + }, + { + "epoch": 1.548781937017231, + "grad_norm": 0.25868175895401235, + "learning_rate": 4.298770850569862e-06, + "loss": 0.8175, + "num_tokens": 54474551989.0, + "step": 13033 + }, + { + "epoch": 1.5489007724301842, + "grad_norm": 0.23892531478213602, + "learning_rate": 4.297614832206311e-06, + "loss": 0.8333, + "num_tokens": 54478741013.0, + "step": 13034 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 0.2562187973055835, + "learning_rate": 4.296459062054744e-06, + "loss": 0.7944, + "num_tokens": 54482908408.0, + "step": 13035 + }, + { + "epoch": 1.5491384432560902, + "grad_norm": 0.24017040875334345, + "learning_rate": 4.295303540157963e-06, + "loss": 0.8013, + "num_tokens": 54487097094.0, + "step": 13036 + }, + { + "epoch": 1.5492572786690433, + "grad_norm": 0.29320125821971216, + "learning_rate": 4.294148266558761e-06, + "loss": 0.8005, + "num_tokens": 54491244122.0, + "step": 13037 + }, + { + "epoch": 1.5493761140819964, + "grad_norm": 0.26429925246601127, + "learning_rate": 4.292993241299922e-06, + "loss": 0.8263, + "num_tokens": 54495432151.0, + "step": 13038 + }, + { + "epoch": 1.5494949494949495, + "grad_norm": 0.2450338152779073, + "learning_rate": 4.29183846442422e-06, + "loss": 0.8267, + "num_tokens": 54499593442.0, + "step": 13039 + }, + { + "epoch": 1.5496137849079026, + "grad_norm": 0.26369129345802045, + "learning_rate": 4.290683935974421e-06, + "loss": 0.8384, + "num_tokens": 54503782136.0, + "step": 13040 + }, + { + "epoch": 1.5497326203208557, + "grad_norm": 0.2462051150508206, + "learning_rate": 4.289529655993281e-06, + "loss": 0.7535, + "num_tokens": 54507970860.0, + "step": 13041 + }, + { + "epoch": 1.5498514557338088, + "grad_norm": 0.2572485240854914, + "learning_rate": 4.288375624523545e-06, + "loss": 0.8506, + "num_tokens": 54512160362.0, + "step": 13042 + }, + { + "epoch": 1.5499702911467619, + "grad_norm": 0.2586834572899786, + "learning_rate": 4.287221841607952e-06, + "loss": 0.8162, + "num_tokens": 54516349919.0, + "step": 13043 + }, + { + "epoch": 1.5500891265597148, + "grad_norm": 0.26552437444089483, + "learning_rate": 4.286068307289233e-06, + "loss": 0.8368, + "num_tokens": 54520538102.0, + "step": 13044 + }, + { + "epoch": 1.5502079619726679, + "grad_norm": 0.2742391467489225, + "learning_rate": 4.284915021610103e-06, + "loss": 0.7564, + "num_tokens": 54524715927.0, + "step": 13045 + }, + { + "epoch": 1.550326797385621, + "grad_norm": 0.25412514344077297, + "learning_rate": 4.283761984613275e-06, + "loss": 0.8555, + "num_tokens": 54528891577.0, + "step": 13046 + }, + { + "epoch": 1.5504456327985738, + "grad_norm": 0.2816715334586225, + "learning_rate": 4.282609196341448e-06, + "loss": 0.8181, + "num_tokens": 54533080181.0, + "step": 13047 + }, + { + "epoch": 1.550564468211527, + "grad_norm": 0.2662357058447942, + "learning_rate": 4.281456656837314e-06, + "loss": 0.7949, + "num_tokens": 54537247714.0, + "step": 13048 + }, + { + "epoch": 1.55068330362448, + "grad_norm": 0.26544328596401956, + "learning_rate": 4.28030436614356e-06, + "loss": 0.7714, + "num_tokens": 54541436670.0, + "step": 13049 + }, + { + "epoch": 1.5508021390374331, + "grad_norm": 0.2651094112088489, + "learning_rate": 4.279152324302852e-06, + "loss": 0.7941, + "num_tokens": 54545624910.0, + "step": 13050 + }, + { + "epoch": 1.5509209744503862, + "grad_norm": 0.25457360094858744, + "learning_rate": 4.2780005313578535e-06, + "loss": 0.8035, + "num_tokens": 54549813837.0, + "step": 13051 + }, + { + "epoch": 1.5510398098633393, + "grad_norm": 0.27617235407297774, + "learning_rate": 4.276848987351227e-06, + "loss": 0.795, + "num_tokens": 54554002974.0, + "step": 13052 + }, + { + "epoch": 1.5511586452762924, + "grad_norm": 0.24976162871830065, + "learning_rate": 4.275697692325613e-06, + "loss": 0.7727, + "num_tokens": 54558180506.0, + "step": 13053 + }, + { + "epoch": 1.5512774806892455, + "grad_norm": 0.264371298354302, + "learning_rate": 4.274546646323652e-06, + "loss": 0.8056, + "num_tokens": 54562354017.0, + "step": 13054 + }, + { + "epoch": 1.5513963161021984, + "grad_norm": 0.2516196718311674, + "learning_rate": 4.273395849387966e-06, + "loss": 0.7926, + "num_tokens": 54566540843.0, + "step": 13055 + }, + { + "epoch": 1.5515151515151515, + "grad_norm": 0.2629899028846619, + "learning_rate": 4.272245301561173e-06, + "loss": 0.7704, + "num_tokens": 54570703464.0, + "step": 13056 + }, + { + "epoch": 1.5516339869281046, + "grad_norm": 0.2540614186083764, + "learning_rate": 4.271095002885883e-06, + "loss": 0.7855, + "num_tokens": 54574879687.0, + "step": 13057 + }, + { + "epoch": 1.5517528223410575, + "grad_norm": 0.27622515845733475, + "learning_rate": 4.269944953404697e-06, + "loss": 0.8382, + "num_tokens": 54579069002.0, + "step": 13058 + }, + { + "epoch": 1.5518716577540106, + "grad_norm": 0.24579853508878047, + "learning_rate": 4.268795153160208e-06, + "loss": 0.8586, + "num_tokens": 54583258003.0, + "step": 13059 + }, + { + "epoch": 1.5519904931669637, + "grad_norm": 0.27752186590965344, + "learning_rate": 4.267645602194991e-06, + "loss": 0.8183, + "num_tokens": 54587447967.0, + "step": 13060 + }, + { + "epoch": 1.5521093285799168, + "grad_norm": 0.2474110928949565, + "learning_rate": 4.2664963005516195e-06, + "loss": 0.827, + "num_tokens": 54591593424.0, + "step": 13061 + }, + { + "epoch": 1.55222816399287, + "grad_norm": 0.26255258251536245, + "learning_rate": 4.265347248272656e-06, + "loss": 0.7691, + "num_tokens": 54595752663.0, + "step": 13062 + }, + { + "epoch": 1.552346999405823, + "grad_norm": 0.24984498297798993, + "learning_rate": 4.264198445400652e-06, + "loss": 0.8071, + "num_tokens": 54599943277.0, + "step": 13063 + }, + { + "epoch": 1.552465834818776, + "grad_norm": 0.2679929424664496, + "learning_rate": 4.263049891978158e-06, + "loss": 0.7764, + "num_tokens": 54604133512.0, + "step": 13064 + }, + { + "epoch": 1.5525846702317292, + "grad_norm": 0.26351817275033085, + "learning_rate": 4.261901588047704e-06, + "loss": 0.8051, + "num_tokens": 54608292473.0, + "step": 13065 + }, + { + "epoch": 1.552703505644682, + "grad_norm": 0.2607547766903586, + "learning_rate": 4.260753533651815e-06, + "loss": 0.7948, + "num_tokens": 54612482589.0, + "step": 13066 + }, + { + "epoch": 1.5528223410576352, + "grad_norm": 0.24216845298885434, + "learning_rate": 4.259605728833008e-06, + "loss": 0.8276, + "num_tokens": 54616670264.0, + "step": 13067 + }, + { + "epoch": 1.5529411764705883, + "grad_norm": 0.2743710494743831, + "learning_rate": 4.258458173633791e-06, + "loss": 0.8059, + "num_tokens": 54620858593.0, + "step": 13068 + }, + { + "epoch": 1.5530600118835411, + "grad_norm": 0.2488906524460419, + "learning_rate": 4.257310868096663e-06, + "loss": 0.8295, + "num_tokens": 54625048310.0, + "step": 13069 + }, + { + "epoch": 1.5531788472964942, + "grad_norm": 0.263726745366842, + "learning_rate": 4.25616381226411e-06, + "loss": 0.8031, + "num_tokens": 54629238898.0, + "step": 13070 + }, + { + "epoch": 1.5532976827094473, + "grad_norm": 0.23414989170711029, + "learning_rate": 4.255017006178612e-06, + "loss": 0.8191, + "num_tokens": 54633401053.0, + "step": 13071 + }, + { + "epoch": 1.5534165181224004, + "grad_norm": 0.23519069114587313, + "learning_rate": 4.253870449882641e-06, + "loss": 0.8217, + "num_tokens": 54637588717.0, + "step": 13072 + }, + { + "epoch": 1.5535353535353535, + "grad_norm": 0.24250347194688882, + "learning_rate": 4.252724143418655e-06, + "loss": 0.7607, + "num_tokens": 54641778883.0, + "step": 13073 + }, + { + "epoch": 1.5536541889483066, + "grad_norm": 0.23129016730745103, + "learning_rate": 4.251578086829109e-06, + "loss": 0.8386, + "num_tokens": 54645967129.0, + "step": 13074 + }, + { + "epoch": 1.5537730243612597, + "grad_norm": 0.2456470723443808, + "learning_rate": 4.250432280156442e-06, + "loss": 0.8237, + "num_tokens": 54650156497.0, + "step": 13075 + }, + { + "epoch": 1.5538918597742128, + "grad_norm": 0.2581051711120908, + "learning_rate": 4.2492867234430875e-06, + "loss": 0.7833, + "num_tokens": 54654333530.0, + "step": 13076 + }, + { + "epoch": 1.5540106951871657, + "grad_norm": 0.22763155818183853, + "learning_rate": 4.248141416731473e-06, + "loss": 0.822, + "num_tokens": 54658516770.0, + "step": 13077 + }, + { + "epoch": 1.5541295306001188, + "grad_norm": 0.26894935342970244, + "learning_rate": 4.246996360064008e-06, + "loss": 0.8452, + "num_tokens": 54662706393.0, + "step": 13078 + }, + { + "epoch": 1.554248366013072, + "grad_norm": 0.2410611833657383, + "learning_rate": 4.245851553483101e-06, + "loss": 0.8173, + "num_tokens": 54666892209.0, + "step": 13079 + }, + { + "epoch": 1.5543672014260248, + "grad_norm": 0.2462147162427183, + "learning_rate": 4.244706997031148e-06, + "loss": 0.8465, + "num_tokens": 54671077111.0, + "step": 13080 + }, + { + "epoch": 1.554486036838978, + "grad_norm": 0.24517277144116884, + "learning_rate": 4.243562690750537e-06, + "loss": 0.8356, + "num_tokens": 54675266372.0, + "step": 13081 + }, + { + "epoch": 1.554604872251931, + "grad_norm": 0.25044271205640994, + "learning_rate": 4.24241863468364e-06, + "loss": 0.8129, + "num_tokens": 54679439776.0, + "step": 13082 + }, + { + "epoch": 1.554723707664884, + "grad_norm": 0.2569075164423285, + "learning_rate": 4.241274828872829e-06, + "loss": 0.797, + "num_tokens": 54683624319.0, + "step": 13083 + }, + { + "epoch": 1.5548425430778372, + "grad_norm": 0.26373733225165397, + "learning_rate": 4.240131273360464e-06, + "loss": 0.8206, + "num_tokens": 54687813502.0, + "step": 13084 + }, + { + "epoch": 1.5549613784907903, + "grad_norm": 0.24182540579265052, + "learning_rate": 4.2389879681888925e-06, + "loss": 0.7988, + "num_tokens": 54692002905.0, + "step": 13085 + }, + { + "epoch": 1.5550802139037434, + "grad_norm": 0.2583275768242648, + "learning_rate": 4.23784491340046e-06, + "loss": 0.7912, + "num_tokens": 54696187982.0, + "step": 13086 + }, + { + "epoch": 1.5551990493166965, + "grad_norm": 0.23776194535700118, + "learning_rate": 4.236702109037491e-06, + "loss": 0.8006, + "num_tokens": 54700377756.0, + "step": 13087 + }, + { + "epoch": 1.5553178847296494, + "grad_norm": 0.24075453655303575, + "learning_rate": 4.2355595551423105e-06, + "loss": 0.7759, + "num_tokens": 54704567858.0, + "step": 13088 + }, + { + "epoch": 1.5554367201426025, + "grad_norm": 0.2398700889100314, + "learning_rate": 4.234417251757227e-06, + "loss": 0.8014, + "num_tokens": 54708756294.0, + "step": 13089 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.25171777099800524, + "learning_rate": 4.233275198924551e-06, + "loss": 0.7915, + "num_tokens": 54712945240.0, + "step": 13090 + }, + { + "epoch": 1.5556743909685085, + "grad_norm": 0.24717067180294652, + "learning_rate": 4.232133396686575e-06, + "loss": 0.7932, + "num_tokens": 54717133565.0, + "step": 13091 + }, + { + "epoch": 1.5557932263814616, + "grad_norm": 0.2492019962158536, + "learning_rate": 4.2309918450855814e-06, + "loss": 0.8346, + "num_tokens": 54721322786.0, + "step": 13092 + }, + { + "epoch": 1.5559120617944147, + "grad_norm": 0.2400404030041668, + "learning_rate": 4.229850544163844e-06, + "loss": 0.7969, + "num_tokens": 54725477364.0, + "step": 13093 + }, + { + "epoch": 1.5560308972073678, + "grad_norm": 0.23900379388648543, + "learning_rate": 4.228709493963633e-06, + "loss": 0.812, + "num_tokens": 54729664649.0, + "step": 13094 + }, + { + "epoch": 1.5561497326203209, + "grad_norm": 0.24601230595621595, + "learning_rate": 4.227568694527201e-06, + "loss": 0.7888, + "num_tokens": 54733854223.0, + "step": 13095 + }, + { + "epoch": 1.556268568033274, + "grad_norm": 0.24056429390287545, + "learning_rate": 4.226428145896803e-06, + "loss": 0.8356, + "num_tokens": 54738044502.0, + "step": 13096 + }, + { + "epoch": 1.556387403446227, + "grad_norm": 0.24056165256210021, + "learning_rate": 4.22528784811467e-06, + "loss": 0.7949, + "num_tokens": 54742234310.0, + "step": 13097 + }, + { + "epoch": 1.5565062388591802, + "grad_norm": 0.2367471501440749, + "learning_rate": 4.224147801223034e-06, + "loss": 0.8306, + "num_tokens": 54746422874.0, + "step": 13098 + }, + { + "epoch": 1.5566250742721333, + "grad_norm": 0.24396881280788685, + "learning_rate": 4.223008005264115e-06, + "loss": 0.7844, + "num_tokens": 54750611119.0, + "step": 13099 + }, + { + "epoch": 1.5567439096850861, + "grad_norm": 0.23804235007914917, + "learning_rate": 4.221868460280123e-06, + "loss": 0.7864, + "num_tokens": 54754795093.0, + "step": 13100 + }, + { + "epoch": 1.5568627450980392, + "grad_norm": 0.23150410736752014, + "learning_rate": 4.220729166313258e-06, + "loss": 0.7878, + "num_tokens": 54758983022.0, + "step": 13101 + }, + { + "epoch": 1.5569815805109921, + "grad_norm": 0.22823406759778103, + "learning_rate": 4.2195901234057145e-06, + "loss": 0.8024, + "num_tokens": 54763166264.0, + "step": 13102 + }, + { + "epoch": 1.5571004159239452, + "grad_norm": 0.22845606411985953, + "learning_rate": 4.218451331599675e-06, + "loss": 0.8007, + "num_tokens": 54767356427.0, + "step": 13103 + }, + { + "epoch": 1.5572192513368983, + "grad_norm": 0.2546438341298501, + "learning_rate": 4.217312790937309e-06, + "loss": 0.8476, + "num_tokens": 54771546083.0, + "step": 13104 + }, + { + "epoch": 1.5573380867498514, + "grad_norm": 0.2439946693173502, + "learning_rate": 4.216174501460784e-06, + "loss": 0.8023, + "num_tokens": 54775714954.0, + "step": 13105 + }, + { + "epoch": 1.5574569221628045, + "grad_norm": 0.25220274829060735, + "learning_rate": 4.2150364632122556e-06, + "loss": 0.7922, + "num_tokens": 54779872277.0, + "step": 13106 + }, + { + "epoch": 1.5575757575757576, + "grad_norm": 0.2577659838847109, + "learning_rate": 4.2138986762338695e-06, + "loss": 0.8233, + "num_tokens": 54784056085.0, + "step": 13107 + }, + { + "epoch": 1.5576945929887107, + "grad_norm": 0.2468942515337258, + "learning_rate": 4.212761140567754e-06, + "loss": 0.8384, + "num_tokens": 54788245375.0, + "step": 13108 + }, + { + "epoch": 1.5578134284016638, + "grad_norm": 0.2497343774686226, + "learning_rate": 4.2116238562560426e-06, + "loss": 0.8089, + "num_tokens": 54792433524.0, + "step": 13109 + }, + { + "epoch": 1.557932263814617, + "grad_norm": 0.23447835652963486, + "learning_rate": 4.2104868233408544e-06, + "loss": 0.8004, + "num_tokens": 54796622794.0, + "step": 13110 + }, + { + "epoch": 1.5580510992275698, + "grad_norm": 0.2501875827600359, + "learning_rate": 4.209350041864294e-06, + "loss": 0.7954, + "num_tokens": 54800810592.0, + "step": 13111 + }, + { + "epoch": 1.558169934640523, + "grad_norm": 0.23366357471803495, + "learning_rate": 4.208213511868464e-06, + "loss": 0.808, + "num_tokens": 54804998988.0, + "step": 13112 + }, + { + "epoch": 1.558288770053476, + "grad_norm": 0.24555039652389096, + "learning_rate": 4.207077233395449e-06, + "loss": 0.8019, + "num_tokens": 54809187998.0, + "step": 13113 + }, + { + "epoch": 1.5584076054664289, + "grad_norm": 0.23534022457413825, + "learning_rate": 4.205941206487327e-06, + "loss": 0.816, + "num_tokens": 54813355653.0, + "step": 13114 + }, + { + "epoch": 1.558526440879382, + "grad_norm": 0.25246390812278996, + "learning_rate": 4.2048054311861785e-06, + "loss": 0.7882, + "num_tokens": 54817545055.0, + "step": 13115 + }, + { + "epoch": 1.558645276292335, + "grad_norm": 0.23538370554395768, + "learning_rate": 4.20366990753406e-06, + "loss": 0.8064, + "num_tokens": 54821695947.0, + "step": 13116 + }, + { + "epoch": 1.5587641117052882, + "grad_norm": 0.2596595681942776, + "learning_rate": 4.202534635573025e-06, + "loss": 0.7854, + "num_tokens": 54825834888.0, + "step": 13117 + }, + { + "epoch": 1.5588829471182413, + "grad_norm": 0.23926775171596582, + "learning_rate": 4.201399615345112e-06, + "loss": 0.817, + "num_tokens": 54829993517.0, + "step": 13118 + }, + { + "epoch": 1.5590017825311944, + "grad_norm": 0.23209809906803872, + "learning_rate": 4.200264846892359e-06, + "loss": 0.7962, + "num_tokens": 54834183297.0, + "step": 13119 + }, + { + "epoch": 1.5591206179441475, + "grad_norm": 0.24336816825421584, + "learning_rate": 4.199130330256789e-06, + "loss": 0.7852, + "num_tokens": 54838351341.0, + "step": 13120 + }, + { + "epoch": 1.5592394533571006, + "grad_norm": 0.23272075652985266, + "learning_rate": 4.197996065480414e-06, + "loss": 0.7896, + "num_tokens": 54842517987.0, + "step": 13121 + }, + { + "epoch": 1.5593582887700534, + "grad_norm": 0.24858386740181004, + "learning_rate": 4.196862052605249e-06, + "loss": 0.812, + "num_tokens": 54846707073.0, + "step": 13122 + }, + { + "epoch": 1.5594771241830065, + "grad_norm": 0.23346091808130637, + "learning_rate": 4.195728291673281e-06, + "loss": 0.788, + "num_tokens": 54850897367.0, + "step": 13123 + }, + { + "epoch": 1.5595959595959596, + "grad_norm": 0.2585669731622351, + "learning_rate": 4.1945947827265e-06, + "loss": 0.8069, + "num_tokens": 54855087347.0, + "step": 13124 + }, + { + "epoch": 1.5597147950089125, + "grad_norm": 0.2322295386611015, + "learning_rate": 4.193461525806883e-06, + "loss": 0.8112, + "num_tokens": 54859276112.0, + "step": 13125 + }, + { + "epoch": 1.5598336304218656, + "grad_norm": 0.25704369618017037, + "learning_rate": 4.192328520956401e-06, + "loss": 0.7937, + "num_tokens": 54863465832.0, + "step": 13126 + }, + { + "epoch": 1.5599524658348187, + "grad_norm": 0.24166085822109734, + "learning_rate": 4.191195768217011e-06, + "loss": 0.8119, + "num_tokens": 54867654555.0, + "step": 13127 + }, + { + "epoch": 1.5600713012477718, + "grad_norm": 0.24451634443756895, + "learning_rate": 4.1900632676306615e-06, + "loss": 0.7921, + "num_tokens": 54871822255.0, + "step": 13128 + }, + { + "epoch": 1.560190136660725, + "grad_norm": 0.2376933011914957, + "learning_rate": 4.188931019239295e-06, + "loss": 0.7865, + "num_tokens": 54875999285.0, + "step": 13129 + }, + { + "epoch": 1.560308972073678, + "grad_norm": 0.26644173193627463, + "learning_rate": 4.187799023084841e-06, + "loss": 0.8267, + "num_tokens": 54880171373.0, + "step": 13130 + }, + { + "epoch": 1.5604278074866311, + "grad_norm": 0.24939184991979302, + "learning_rate": 4.186667279209222e-06, + "loss": 0.8231, + "num_tokens": 54884359067.0, + "step": 13131 + }, + { + "epoch": 1.5605466428995842, + "grad_norm": 0.2510310518355939, + "learning_rate": 4.185535787654351e-06, + "loss": 0.788, + "num_tokens": 54888548124.0, + "step": 13132 + }, + { + "epoch": 1.560665478312537, + "grad_norm": 0.2538756216449602, + "learning_rate": 4.184404548462129e-06, + "loss": 0.7984, + "num_tokens": 54892736471.0, + "step": 13133 + }, + { + "epoch": 1.5607843137254902, + "grad_norm": 0.2550322759873865, + "learning_rate": 4.183273561674451e-06, + "loss": 0.8503, + "num_tokens": 54896925011.0, + "step": 13134 + }, + { + "epoch": 1.5609031491384433, + "grad_norm": 0.25944651092246945, + "learning_rate": 4.182142827333201e-06, + "loss": 0.7538, + "num_tokens": 54901105174.0, + "step": 13135 + }, + { + "epoch": 1.5610219845513962, + "grad_norm": 0.26054318437243934, + "learning_rate": 4.181012345480253e-06, + "loss": 0.8051, + "num_tokens": 54905237136.0, + "step": 13136 + }, + { + "epoch": 1.5611408199643493, + "grad_norm": 0.2545184301144101, + "learning_rate": 4.179882116157474e-06, + "loss": 0.8218, + "num_tokens": 54909407308.0, + "step": 13137 + }, + { + "epoch": 1.5612596553773024, + "grad_norm": 0.24566272268661632, + "learning_rate": 4.178752139406723e-06, + "loss": 0.8182, + "num_tokens": 54913573544.0, + "step": 13138 + }, + { + "epoch": 1.5613784907902555, + "grad_norm": 0.25065019266864946, + "learning_rate": 4.177622415269839e-06, + "loss": 0.8149, + "num_tokens": 54917732180.0, + "step": 13139 + }, + { + "epoch": 1.5614973262032086, + "grad_norm": 0.2513199503858277, + "learning_rate": 4.176492943788664e-06, + "loss": 0.8059, + "num_tokens": 54921904534.0, + "step": 13140 + }, + { + "epoch": 1.5616161616161617, + "grad_norm": 0.25778605487424044, + "learning_rate": 4.175363725005027e-06, + "loss": 0.7768, + "num_tokens": 54926093789.0, + "step": 13141 + }, + { + "epoch": 1.5617349970291148, + "grad_norm": 0.27959673959202125, + "learning_rate": 4.174234758960745e-06, + "loss": 0.8089, + "num_tokens": 54930272679.0, + "step": 13142 + }, + { + "epoch": 1.5618538324420679, + "grad_norm": 0.2693368582312312, + "learning_rate": 4.173106045697632e-06, + "loss": 0.8199, + "num_tokens": 54934461665.0, + "step": 13143 + }, + { + "epoch": 1.5619726678550208, + "grad_norm": 0.26128672081037263, + "learning_rate": 4.1719775852574814e-06, + "loss": 0.8283, + "num_tokens": 54938650604.0, + "step": 13144 + }, + { + "epoch": 1.5620915032679739, + "grad_norm": 0.26196214413449864, + "learning_rate": 4.170849377682088e-06, + "loss": 0.7752, + "num_tokens": 54942836962.0, + "step": 13145 + }, + { + "epoch": 1.562210338680927, + "grad_norm": 0.2549621733636968, + "learning_rate": 4.169721423013229e-06, + "loss": 0.825, + "num_tokens": 54946975716.0, + "step": 13146 + }, + { + "epoch": 1.5623291740938798, + "grad_norm": 0.25016213735122783, + "learning_rate": 4.168593721292682e-06, + "loss": 0.8103, + "num_tokens": 54951163783.0, + "step": 13147 + }, + { + "epoch": 1.562448009506833, + "grad_norm": 0.2492449593603984, + "learning_rate": 4.1674662725622105e-06, + "loss": 0.8035, + "num_tokens": 54955337131.0, + "step": 13148 + }, + { + "epoch": 1.562566844919786, + "grad_norm": 0.2550282499891205, + "learning_rate": 4.1663390768635595e-06, + "loss": 0.7955, + "num_tokens": 54959526067.0, + "step": 13149 + }, + { + "epoch": 1.5626856803327391, + "grad_norm": 0.25941574144057966, + "learning_rate": 4.1652121342384784e-06, + "loss": 0.8148, + "num_tokens": 54963715756.0, + "step": 13150 + }, + { + "epoch": 1.5628045157456922, + "grad_norm": 0.24516106509237848, + "learning_rate": 4.164085444728702e-06, + "loss": 0.7939, + "num_tokens": 54967904880.0, + "step": 13151 + }, + { + "epoch": 1.5629233511586453, + "grad_norm": 0.2555771645260146, + "learning_rate": 4.162959008375951e-06, + "loss": 0.8146, + "num_tokens": 54972095693.0, + "step": 13152 + }, + { + "epoch": 1.5630421865715984, + "grad_norm": 0.25841441229654566, + "learning_rate": 4.161832825221949e-06, + "loss": 0.7987, + "num_tokens": 54976284538.0, + "step": 13153 + }, + { + "epoch": 1.5631610219845515, + "grad_norm": 0.2528080336440362, + "learning_rate": 4.160706895308396e-06, + "loss": 0.8415, + "num_tokens": 54980472160.0, + "step": 13154 + }, + { + "epoch": 1.5632798573975044, + "grad_norm": 0.24807294339226976, + "learning_rate": 4.159581218676992e-06, + "loss": 0.7822, + "num_tokens": 54984630564.0, + "step": 13155 + }, + { + "epoch": 1.5633986928104575, + "grad_norm": 0.2507306239837646, + "learning_rate": 4.1584557953694215e-06, + "loss": 0.7672, + "num_tokens": 54988819659.0, + "step": 13156 + }, + { + "epoch": 1.5635175282234106, + "grad_norm": 0.2231900799466086, + "learning_rate": 4.157330625427366e-06, + "loss": 0.8172, + "num_tokens": 54993007143.0, + "step": 13157 + }, + { + "epoch": 1.5636363636363635, + "grad_norm": 0.23064088904247282, + "learning_rate": 4.1562057088924924e-06, + "loss": 0.824, + "num_tokens": 54997196204.0, + "step": 13158 + }, + { + "epoch": 1.5637551990493166, + "grad_norm": 0.24162084380630167, + "learning_rate": 4.155081045806462e-06, + "loss": 0.8478, + "num_tokens": 55001383149.0, + "step": 13159 + }, + { + "epoch": 1.5638740344622697, + "grad_norm": 0.24937220795552803, + "learning_rate": 4.153956636210923e-06, + "loss": 0.8048, + "num_tokens": 55005563747.0, + "step": 13160 + }, + { + "epoch": 1.5639928698752228, + "grad_norm": 0.22689903025506394, + "learning_rate": 4.152832480147516e-06, + "loss": 0.8178, + "num_tokens": 55009747029.0, + "step": 13161 + }, + { + "epoch": 1.564111705288176, + "grad_norm": 0.23983152130977234, + "learning_rate": 4.151708577657873e-06, + "loss": 0.7931, + "num_tokens": 55013917082.0, + "step": 13162 + }, + { + "epoch": 1.564230540701129, + "grad_norm": 0.23666684767921536, + "learning_rate": 4.150584928783617e-06, + "loss": 0.7903, + "num_tokens": 55018106922.0, + "step": 13163 + }, + { + "epoch": 1.564349376114082, + "grad_norm": 0.23281412003344007, + "learning_rate": 4.149461533566361e-06, + "loss": 0.7826, + "num_tokens": 55022283802.0, + "step": 13164 + }, + { + "epoch": 1.5644682115270352, + "grad_norm": 0.23795876987049097, + "learning_rate": 4.1483383920477024e-06, + "loss": 0.8361, + "num_tokens": 55026474157.0, + "step": 13165 + }, + { + "epoch": 1.564587046939988, + "grad_norm": 0.22460372283622412, + "learning_rate": 4.147215504269242e-06, + "loss": 0.8232, + "num_tokens": 55030663532.0, + "step": 13166 + }, + { + "epoch": 1.5647058823529412, + "grad_norm": 0.22395074119459554, + "learning_rate": 4.146092870272561e-06, + "loss": 0.7937, + "num_tokens": 55034845099.0, + "step": 13167 + }, + { + "epoch": 1.5648247177658943, + "grad_norm": 0.25121231227189866, + "learning_rate": 4.144970490099234e-06, + "loss": 0.7868, + "num_tokens": 55039012274.0, + "step": 13168 + }, + { + "epoch": 1.5649435531788471, + "grad_norm": 0.2528034695739484, + "learning_rate": 4.14384836379083e-06, + "loss": 0.7868, + "num_tokens": 55043202622.0, + "step": 13169 + }, + { + "epoch": 1.5650623885918002, + "grad_norm": 0.24157013713121944, + "learning_rate": 4.142726491388901e-06, + "loss": 0.8173, + "num_tokens": 55047391714.0, + "step": 13170 + }, + { + "epoch": 1.5651812240047533, + "grad_norm": 0.2509864724842542, + "learning_rate": 4.141604872934993e-06, + "loss": 0.7821, + "num_tokens": 55051577430.0, + "step": 13171 + }, + { + "epoch": 1.5653000594177064, + "grad_norm": 0.2556157478983755, + "learning_rate": 4.140483508470643e-06, + "loss": 0.8079, + "num_tokens": 55055765623.0, + "step": 13172 + }, + { + "epoch": 1.5654188948306595, + "grad_norm": 0.2413290112501272, + "learning_rate": 4.1393623980373845e-06, + "loss": 0.8316, + "num_tokens": 55059927887.0, + "step": 13173 + }, + { + "epoch": 1.5655377302436126, + "grad_norm": 0.24184871954904116, + "learning_rate": 4.138241541676736e-06, + "loss": 0.817, + "num_tokens": 55064090377.0, + "step": 13174 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 0.235845381455628, + "learning_rate": 4.1371209394302e-06, + "loss": 0.7912, + "num_tokens": 55068268950.0, + "step": 13175 + }, + { + "epoch": 1.5657754010695188, + "grad_norm": 0.23840100677170878, + "learning_rate": 4.13600059133928e-06, + "loss": 0.8167, + "num_tokens": 55072459251.0, + "step": 13176 + }, + { + "epoch": 1.5658942364824717, + "grad_norm": 0.25777862020691017, + "learning_rate": 4.1348804974454655e-06, + "loss": 0.825, + "num_tokens": 55076620138.0, + "step": 13177 + }, + { + "epoch": 1.5660130718954248, + "grad_norm": 0.2316825024072574, + "learning_rate": 4.133760657790234e-06, + "loss": 0.8329, + "num_tokens": 55080807671.0, + "step": 13178 + }, + { + "epoch": 1.566131907308378, + "grad_norm": 0.2501571758357672, + "learning_rate": 4.132641072415068e-06, + "loss": 0.8137, + "num_tokens": 55084987547.0, + "step": 13179 + }, + { + "epoch": 1.5662507427213308, + "grad_norm": 0.23062184412971481, + "learning_rate": 4.131521741361418e-06, + "loss": 0.8008, + "num_tokens": 55089177082.0, + "step": 13180 + }, + { + "epoch": 1.566369578134284, + "grad_norm": 0.26986008886200913, + "learning_rate": 4.1304026646707415e-06, + "loss": 0.8345, + "num_tokens": 55093362853.0, + "step": 13181 + }, + { + "epoch": 1.566488413547237, + "grad_norm": 0.24529990558451606, + "learning_rate": 4.12928384238448e-06, + "loss": 0.8266, + "num_tokens": 55097553006.0, + "step": 13182 + }, + { + "epoch": 1.56660724896019, + "grad_norm": 0.2641468944298632, + "learning_rate": 4.12816527454407e-06, + "loss": 0.8353, + "num_tokens": 55101742705.0, + "step": 13183 + }, + { + "epoch": 1.5667260843731432, + "grad_norm": 0.26349972755970896, + "learning_rate": 4.127046961190933e-06, + "loss": 0.7724, + "num_tokens": 55105900821.0, + "step": 13184 + }, + { + "epoch": 1.5668449197860963, + "grad_norm": 0.2745199808982275, + "learning_rate": 4.125928902366485e-06, + "loss": 0.8063, + "num_tokens": 55110033731.0, + "step": 13185 + }, + { + "epoch": 1.5669637551990494, + "grad_norm": 0.2571320969121668, + "learning_rate": 4.124811098112132e-06, + "loss": 0.8339, + "num_tokens": 55114223401.0, + "step": 13186 + }, + { + "epoch": 1.5670825906120025, + "grad_norm": 0.25133164718578194, + "learning_rate": 4.12369354846927e-06, + "loss": 0.7998, + "num_tokens": 55118398241.0, + "step": 13187 + }, + { + "epoch": 1.5672014260249556, + "grad_norm": 0.24147350794443723, + "learning_rate": 4.1225762534792846e-06, + "loss": 0.8163, + "num_tokens": 55122585276.0, + "step": 13188 + }, + { + "epoch": 1.5673202614379085, + "grad_norm": 0.24652734291681416, + "learning_rate": 4.121459213183553e-06, + "loss": 0.8042, + "num_tokens": 55126736607.0, + "step": 13189 + }, + { + "epoch": 1.5674390968508616, + "grad_norm": 0.3281064858690976, + "learning_rate": 4.120342427623445e-06, + "loss": 0.8297, + "num_tokens": 55130924250.0, + "step": 13190 + }, + { + "epoch": 1.5675579322638145, + "grad_norm": 0.2556704884205269, + "learning_rate": 4.119225896840316e-06, + "loss": 0.8043, + "num_tokens": 55135091736.0, + "step": 13191 + }, + { + "epoch": 1.5676767676767676, + "grad_norm": 0.23790359813785944, + "learning_rate": 4.118109620875518e-06, + "loss": 0.805, + "num_tokens": 55139279391.0, + "step": 13192 + }, + { + "epoch": 1.5677956030897207, + "grad_norm": 0.23844111305304883, + "learning_rate": 4.116993599770388e-06, + "loss": 0.7933, + "num_tokens": 55143468651.0, + "step": 13193 + }, + { + "epoch": 1.5679144385026738, + "grad_norm": 0.25603443454722175, + "learning_rate": 4.115877833566256e-06, + "loss": 0.8196, + "num_tokens": 55147657905.0, + "step": 13194 + }, + { + "epoch": 1.5680332739156269, + "grad_norm": 0.2366262195257594, + "learning_rate": 4.114762322304445e-06, + "loss": 0.8676, + "num_tokens": 55151808209.0, + "step": 13195 + }, + { + "epoch": 1.56815210932858, + "grad_norm": 0.2522036944172601, + "learning_rate": 4.113647066026268e-06, + "loss": 0.8179, + "num_tokens": 55155997920.0, + "step": 13196 + }, + { + "epoch": 1.568270944741533, + "grad_norm": 0.2426041990121439, + "learning_rate": 4.112532064773018e-06, + "loss": 0.7818, + "num_tokens": 55160188679.0, + "step": 13197 + }, + { + "epoch": 1.5683897801544862, + "grad_norm": 0.2886393141262674, + "learning_rate": 4.111417318585995e-06, + "loss": 0.829, + "num_tokens": 55164361449.0, + "step": 13198 + }, + { + "epoch": 1.5685086155674393, + "grad_norm": 0.25247098262616763, + "learning_rate": 4.110302827506478e-06, + "loss": 0.7637, + "num_tokens": 55168551480.0, + "step": 13199 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.25505590494278063, + "learning_rate": 4.109188591575744e-06, + "loss": 0.7913, + "num_tokens": 55172741093.0, + "step": 13200 + }, + { + "epoch": 1.5687462863933452, + "grad_norm": 0.27425776787137895, + "learning_rate": 4.1080746108350564e-06, + "loss": 0.8327, + "num_tokens": 55176930780.0, + "step": 13201 + }, + { + "epoch": 1.5688651218062981, + "grad_norm": 0.2655371908129904, + "learning_rate": 4.1069608853256665e-06, + "loss": 0.8461, + "num_tokens": 55181118736.0, + "step": 13202 + }, + { + "epoch": 1.5689839572192512, + "grad_norm": 0.27226809001904834, + "learning_rate": 4.105847415088818e-06, + "loss": 0.7909, + "num_tokens": 55185307840.0, + "step": 13203 + }, + { + "epoch": 1.5691027926322043, + "grad_norm": 0.2633958055929541, + "learning_rate": 4.104734200165751e-06, + "loss": 0.8045, + "num_tokens": 55189476594.0, + "step": 13204 + }, + { + "epoch": 1.5692216280451574, + "grad_norm": 0.2608513895276745, + "learning_rate": 4.10362124059769e-06, + "loss": 0.7907, + "num_tokens": 55193640886.0, + "step": 13205 + }, + { + "epoch": 1.5693404634581105, + "grad_norm": 0.24157272543785052, + "learning_rate": 4.102508536425856e-06, + "loss": 0.806, + "num_tokens": 55197806903.0, + "step": 13206 + }, + { + "epoch": 1.5694592988710636, + "grad_norm": 0.2470484286350307, + "learning_rate": 4.101396087691448e-06, + "loss": 0.8163, + "num_tokens": 55201995888.0, + "step": 13207 + }, + { + "epoch": 1.5695781342840167, + "grad_norm": 0.25345384799891857, + "learning_rate": 4.100283894435669e-06, + "loss": 0.7915, + "num_tokens": 55206185034.0, + "step": 13208 + }, + { + "epoch": 1.5696969696969698, + "grad_norm": 0.25993899488412026, + "learning_rate": 4.099171956699703e-06, + "loss": 0.7707, + "num_tokens": 55210374426.0, + "step": 13209 + }, + { + "epoch": 1.569815805109923, + "grad_norm": 0.2481655661004113, + "learning_rate": 4.098060274524731e-06, + "loss": 0.8258, + "num_tokens": 55214551126.0, + "step": 13210 + }, + { + "epoch": 1.5699346405228758, + "grad_norm": 0.2551743176278518, + "learning_rate": 4.096948847951928e-06, + "loss": 0.7892, + "num_tokens": 55218740401.0, + "step": 13211 + }, + { + "epoch": 1.570053475935829, + "grad_norm": 0.25211442616763463, + "learning_rate": 4.095837677022446e-06, + "loss": 0.7947, + "num_tokens": 55222897530.0, + "step": 13212 + }, + { + "epoch": 1.570172311348782, + "grad_norm": 0.2711981092090016, + "learning_rate": 4.09472676177744e-06, + "loss": 0.8315, + "num_tokens": 55227085673.0, + "step": 13213 + }, + { + "epoch": 1.5702911467617349, + "grad_norm": 0.2575484568260675, + "learning_rate": 4.093616102258047e-06, + "loss": 0.8292, + "num_tokens": 55231274794.0, + "step": 13214 + }, + { + "epoch": 1.570409982174688, + "grad_norm": 0.24725291570950547, + "learning_rate": 4.092505698505402e-06, + "loss": 0.8195, + "num_tokens": 55235409351.0, + "step": 13215 + }, + { + "epoch": 1.570528817587641, + "grad_norm": 0.2414833872622982, + "learning_rate": 4.091395550560625e-06, + "loss": 0.826, + "num_tokens": 55239572764.0, + "step": 13216 + }, + { + "epoch": 1.5706476530005942, + "grad_norm": 0.24359044742186092, + "learning_rate": 4.090285658464831e-06, + "loss": 0.8501, + "num_tokens": 55243762254.0, + "step": 13217 + }, + { + "epoch": 1.5707664884135473, + "grad_norm": 0.2553106666395765, + "learning_rate": 4.0891760222591195e-06, + "loss": 0.7625, + "num_tokens": 55247903838.0, + "step": 13218 + }, + { + "epoch": 1.5708853238265004, + "grad_norm": 0.2571484181173253, + "learning_rate": 4.088066641984587e-06, + "loss": 0.8465, + "num_tokens": 55252092480.0, + "step": 13219 + }, + { + "epoch": 1.5710041592394535, + "grad_norm": 0.25079936109348017, + "learning_rate": 4.086957517682316e-06, + "loss": 0.8288, + "num_tokens": 55256256155.0, + "step": 13220 + }, + { + "epoch": 1.5711229946524066, + "grad_norm": 0.2539810467794983, + "learning_rate": 4.085848649393381e-06, + "loss": 0.828, + "num_tokens": 55260441795.0, + "step": 13221 + }, + { + "epoch": 1.5712418300653594, + "grad_norm": 0.2727737955353785, + "learning_rate": 4.0847400371588495e-06, + "loss": 0.8616, + "num_tokens": 55264631033.0, + "step": 13222 + }, + { + "epoch": 1.5713606654783125, + "grad_norm": 0.26150907203869206, + "learning_rate": 4.083631681019775e-06, + "loss": 0.8201, + "num_tokens": 55268803070.0, + "step": 13223 + }, + { + "epoch": 1.5714795008912656, + "grad_norm": 0.27240092627962426, + "learning_rate": 4.082523581017204e-06, + "loss": 0.8023, + "num_tokens": 55272979380.0, + "step": 13224 + }, + { + "epoch": 1.5715983363042185, + "grad_norm": 0.2623509421237431, + "learning_rate": 4.081415737192173e-06, + "loss": 0.8045, + "num_tokens": 55277168817.0, + "step": 13225 + }, + { + "epoch": 1.5717171717171716, + "grad_norm": 0.24497939291521742, + "learning_rate": 4.08030814958571e-06, + "loss": 0.7856, + "num_tokens": 55281297642.0, + "step": 13226 + }, + { + "epoch": 1.5718360071301247, + "grad_norm": 0.25952711332088635, + "learning_rate": 4.0792008182388345e-06, + "loss": 0.8102, + "num_tokens": 55285488002.0, + "step": 13227 + }, + { + "epoch": 1.5719548425430778, + "grad_norm": 0.252208959546102, + "learning_rate": 4.078093743192549e-06, + "loss": 0.795, + "num_tokens": 55289677642.0, + "step": 13228 + }, + { + "epoch": 1.572073677956031, + "grad_norm": 0.25497340473550256, + "learning_rate": 4.076986924487855e-06, + "loss": 0.8172, + "num_tokens": 55293868218.0, + "step": 13229 + }, + { + "epoch": 1.572192513368984, + "grad_norm": 0.26285660610414935, + "learning_rate": 4.075880362165744e-06, + "loss": 0.8153, + "num_tokens": 55298044405.0, + "step": 13230 + }, + { + "epoch": 1.5723113487819371, + "grad_norm": 0.2524037534526514, + "learning_rate": 4.074774056267193e-06, + "loss": 0.7778, + "num_tokens": 55302234219.0, + "step": 13231 + }, + { + "epoch": 1.5724301841948902, + "grad_norm": 0.26904546568570364, + "learning_rate": 4.073668006833179e-06, + "loss": 0.8139, + "num_tokens": 55306422548.0, + "step": 13232 + }, + { + "epoch": 1.572549019607843, + "grad_norm": 0.27382375758951555, + "learning_rate": 4.072562213904653e-06, + "loss": 0.7998, + "num_tokens": 55310611421.0, + "step": 13233 + }, + { + "epoch": 1.5726678550207962, + "grad_norm": 0.2702061856962505, + "learning_rate": 4.0714566775225695e-06, + "loss": 0.8231, + "num_tokens": 55314799813.0, + "step": 13234 + }, + { + "epoch": 1.5727866904337493, + "grad_norm": 0.25905626630197426, + "learning_rate": 4.07035139772787e-06, + "loss": 0.7901, + "num_tokens": 55318991357.0, + "step": 13235 + }, + { + "epoch": 1.5729055258467022, + "grad_norm": 0.2688293689083469, + "learning_rate": 4.0692463745614895e-06, + "loss": 0.8063, + "num_tokens": 55323180308.0, + "step": 13236 + }, + { + "epoch": 1.5730243612596553, + "grad_norm": 0.2413783193963214, + "learning_rate": 4.0681416080643536e-06, + "loss": 0.793, + "num_tokens": 55327349056.0, + "step": 13237 + }, + { + "epoch": 1.5731431966726084, + "grad_norm": 0.24570078346812196, + "learning_rate": 4.067037098277366e-06, + "loss": 0.8088, + "num_tokens": 55331514657.0, + "step": 13238 + }, + { + "epoch": 1.5732620320855615, + "grad_norm": 0.26364797140143204, + "learning_rate": 4.065932845241437e-06, + "loss": 0.816, + "num_tokens": 55335684811.0, + "step": 13239 + }, + { + "epoch": 1.5733808674985146, + "grad_norm": 0.2499489204622568, + "learning_rate": 4.064828848997459e-06, + "loss": 0.8048, + "num_tokens": 55339873402.0, + "step": 13240 + }, + { + "epoch": 1.5734997029114677, + "grad_norm": 0.23822562850940251, + "learning_rate": 4.063725109586317e-06, + "loss": 0.7902, + "num_tokens": 55344063190.0, + "step": 13241 + }, + { + "epoch": 1.5736185383244208, + "grad_norm": 0.2673020050866528, + "learning_rate": 4.062621627048887e-06, + "loss": 0.8225, + "num_tokens": 55348239274.0, + "step": 13242 + }, + { + "epoch": 1.5737373737373739, + "grad_norm": 0.23602416499377527, + "learning_rate": 4.061518401426033e-06, + "loss": 0.8295, + "num_tokens": 55352428953.0, + "step": 13243 + }, + { + "epoch": 1.5738562091503268, + "grad_norm": 0.2564839867590143, + "learning_rate": 4.060415432758612e-06, + "loss": 0.7902, + "num_tokens": 55356587931.0, + "step": 13244 + }, + { + "epoch": 1.5739750445632799, + "grad_norm": 0.2432099892379126, + "learning_rate": 4.059312721087472e-06, + "loss": 0.807, + "num_tokens": 55360765931.0, + "step": 13245 + }, + { + "epoch": 1.574093879976233, + "grad_norm": 0.23828126362886087, + "learning_rate": 4.058210266453448e-06, + "loss": 0.8107, + "num_tokens": 55364956488.0, + "step": 13246 + }, + { + "epoch": 1.5742127153891858, + "grad_norm": 0.23848409559524153, + "learning_rate": 4.057108068897369e-06, + "loss": 0.8091, + "num_tokens": 55369145837.0, + "step": 13247 + }, + { + "epoch": 1.574331550802139, + "grad_norm": 0.25255559983481596, + "learning_rate": 4.056006128460052e-06, + "loss": 0.8113, + "num_tokens": 55373280768.0, + "step": 13248 + }, + { + "epoch": 1.574450386215092, + "grad_norm": 0.24739333998724697, + "learning_rate": 4.0549044451823084e-06, + "loss": 0.8121, + "num_tokens": 55377469023.0, + "step": 13249 + }, + { + "epoch": 1.5745692216280451, + "grad_norm": 0.24822239711879782, + "learning_rate": 4.053803019104934e-06, + "loss": 0.7886, + "num_tokens": 55381647216.0, + "step": 13250 + }, + { + "epoch": 1.5746880570409982, + "grad_norm": 0.2395209377405229, + "learning_rate": 4.0527018502687186e-06, + "loss": 0.7751, + "num_tokens": 55385833797.0, + "step": 13251 + }, + { + "epoch": 1.5748068924539513, + "grad_norm": 0.24583576604169355, + "learning_rate": 4.051600938714444e-06, + "loss": 0.7913, + "num_tokens": 55390024191.0, + "step": 13252 + }, + { + "epoch": 1.5749257278669044, + "grad_norm": 0.24200782061723933, + "learning_rate": 4.050500284482882e-06, + "loss": 0.7845, + "num_tokens": 55394212502.0, + "step": 13253 + }, + { + "epoch": 1.5750445632798575, + "grad_norm": 0.26612507243463746, + "learning_rate": 4.049399887614786e-06, + "loss": 0.7948, + "num_tokens": 55398380900.0, + "step": 13254 + }, + { + "epoch": 1.5751633986928104, + "grad_norm": 0.22655370940582495, + "learning_rate": 4.048299748150916e-06, + "loss": 0.7931, + "num_tokens": 55402569945.0, + "step": 13255 + }, + { + "epoch": 1.5752822341057635, + "grad_norm": 0.2473623940860772, + "learning_rate": 4.0471998661320085e-06, + "loss": 0.8438, + "num_tokens": 55406689442.0, + "step": 13256 + }, + { + "epoch": 1.5754010695187166, + "grad_norm": 0.25108019456961295, + "learning_rate": 4.046100241598799e-06, + "loss": 0.797, + "num_tokens": 55410879003.0, + "step": 13257 + }, + { + "epoch": 1.5755199049316695, + "grad_norm": 0.2572616186706993, + "learning_rate": 4.045000874592013e-06, + "loss": 0.7993, + "num_tokens": 55415068478.0, + "step": 13258 + }, + { + "epoch": 1.5756387403446226, + "grad_norm": 0.258855464029556, + "learning_rate": 4.0439017651523565e-06, + "loss": 0.7978, + "num_tokens": 55419256034.0, + "step": 13259 + }, + { + "epoch": 1.5757575757575757, + "grad_norm": 0.26651760754150067, + "learning_rate": 4.042802913320537e-06, + "loss": 0.7961, + "num_tokens": 55423432173.0, + "step": 13260 + }, + { + "epoch": 1.5758764111705288, + "grad_norm": 0.23856017098506238, + "learning_rate": 4.0417043191372455e-06, + "loss": 0.8085, + "num_tokens": 55427622231.0, + "step": 13261 + }, + { + "epoch": 1.575995246583482, + "grad_norm": 0.2438758533500506, + "learning_rate": 4.040605982643172e-06, + "loss": 0.8384, + "num_tokens": 55431748877.0, + "step": 13262 + }, + { + "epoch": 1.576114081996435, + "grad_norm": 0.2499875075768613, + "learning_rate": 4.039507903878993e-06, + "loss": 0.7851, + "num_tokens": 55435888169.0, + "step": 13263 + }, + { + "epoch": 1.576232917409388, + "grad_norm": 0.2521654694804331, + "learning_rate": 4.038410082885366e-06, + "loss": 0.8333, + "num_tokens": 55440076775.0, + "step": 13264 + }, + { + "epoch": 1.5763517528223412, + "grad_norm": 0.2446932586378133, + "learning_rate": 4.037312519702953e-06, + "loss": 0.851, + "num_tokens": 55444265988.0, + "step": 13265 + }, + { + "epoch": 1.576470588235294, + "grad_norm": 0.24537201599450645, + "learning_rate": 4.036215214372398e-06, + "loss": 0.8149, + "num_tokens": 55448455013.0, + "step": 13266 + }, + { + "epoch": 1.5765894236482472, + "grad_norm": 0.28351699485572357, + "learning_rate": 4.035118166934336e-06, + "loss": 0.8142, + "num_tokens": 55452627923.0, + "step": 13267 + }, + { + "epoch": 1.5767082590612003, + "grad_norm": 0.23739133453009956, + "learning_rate": 4.034021377429403e-06, + "loss": 0.8098, + "num_tokens": 55456815735.0, + "step": 13268 + }, + { + "epoch": 1.5768270944741531, + "grad_norm": 0.2554904421501261, + "learning_rate": 4.0329248458982075e-06, + "loss": 0.8212, + "num_tokens": 55461003029.0, + "step": 13269 + }, + { + "epoch": 1.5769459298871062, + "grad_norm": 0.25016226903140815, + "learning_rate": 4.031828572381362e-06, + "loss": 0.7702, + "num_tokens": 55465193493.0, + "step": 13270 + }, + { + "epoch": 1.5770647653000593, + "grad_norm": 0.26620901953466575, + "learning_rate": 4.030732556919463e-06, + "loss": 0.8421, + "num_tokens": 55469384344.0, + "step": 13271 + }, + { + "epoch": 1.5771836007130124, + "grad_norm": 0.26753566623693165, + "learning_rate": 4.029636799553102e-06, + "loss": 0.8389, + "num_tokens": 55473574955.0, + "step": 13272 + }, + { + "epoch": 1.5773024361259655, + "grad_norm": 0.2574554229095629, + "learning_rate": 4.0285413003228576e-06, + "loss": 0.7938, + "num_tokens": 55477763740.0, + "step": 13273 + }, + { + "epoch": 1.5774212715389186, + "grad_norm": 0.2594938290445593, + "learning_rate": 4.027446059269301e-06, + "loss": 0.8239, + "num_tokens": 55481908675.0, + "step": 13274 + }, + { + "epoch": 1.5775401069518717, + "grad_norm": 0.2853848334035274, + "learning_rate": 4.02635107643299e-06, + "loss": 0.8155, + "num_tokens": 55486061421.0, + "step": 13275 + }, + { + "epoch": 1.5776589423648248, + "grad_norm": 0.25410856900692386, + "learning_rate": 4.0252563518544775e-06, + "loss": 0.7936, + "num_tokens": 55490239980.0, + "step": 13276 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 0.2482875909364235, + "learning_rate": 4.0241618855743045e-06, + "loss": 0.8175, + "num_tokens": 55494427429.0, + "step": 13277 + }, + { + "epoch": 1.5778966131907308, + "grad_norm": 0.2497150312604864, + "learning_rate": 4.023067677633003e-06, + "loss": 0.7871, + "num_tokens": 55498615371.0, + "step": 13278 + }, + { + "epoch": 1.578015448603684, + "grad_norm": 0.25159291707526454, + "learning_rate": 4.021973728071097e-06, + "loss": 0.8276, + "num_tokens": 55502804201.0, + "step": 13279 + }, + { + "epoch": 1.5781342840166368, + "grad_norm": 0.24527775533111287, + "learning_rate": 4.020880036929093e-06, + "loss": 0.8266, + "num_tokens": 55506994123.0, + "step": 13280 + }, + { + "epoch": 1.57825311942959, + "grad_norm": 0.2706465498084389, + "learning_rate": 4.0197866042475e-06, + "loss": 0.8081, + "num_tokens": 55511180470.0, + "step": 13281 + }, + { + "epoch": 1.578371954842543, + "grad_norm": 0.24555701316325332, + "learning_rate": 4.01869343006681e-06, + "loss": 0.8197, + "num_tokens": 55515359030.0, + "step": 13282 + }, + { + "epoch": 1.578490790255496, + "grad_norm": 0.27023811910747986, + "learning_rate": 4.017600514427506e-06, + "loss": 0.8347, + "num_tokens": 55519541109.0, + "step": 13283 + }, + { + "epoch": 1.5786096256684492, + "grad_norm": 0.2539233853054361, + "learning_rate": 4.0165078573700674e-06, + "loss": 0.8012, + "num_tokens": 55523730830.0, + "step": 13284 + }, + { + "epoch": 1.5787284610814023, + "grad_norm": 0.266482224960515, + "learning_rate": 4.01541545893495e-06, + "loss": 0.7689, + "num_tokens": 55527892132.0, + "step": 13285 + }, + { + "epoch": 1.5788472964943554, + "grad_norm": 0.2584772243608878, + "learning_rate": 4.014323319162612e-06, + "loss": 0.7869, + "num_tokens": 55532062706.0, + "step": 13286 + }, + { + "epoch": 1.5789661319073085, + "grad_norm": 0.23975783282399987, + "learning_rate": 4.013231438093502e-06, + "loss": 0.7592, + "num_tokens": 55536226154.0, + "step": 13287 + }, + { + "epoch": 1.5790849673202616, + "grad_norm": 0.24656689176790103, + "learning_rate": 4.012139815768057e-06, + "loss": 0.8071, + "num_tokens": 55540415837.0, + "step": 13288 + }, + { + "epoch": 1.5792038027332145, + "grad_norm": 0.25138170397460174, + "learning_rate": 4.011048452226701e-06, + "loss": 0.8229, + "num_tokens": 55544605050.0, + "step": 13289 + }, + { + "epoch": 1.5793226381461676, + "grad_norm": 0.2511686601692055, + "learning_rate": 4.00995734750985e-06, + "loss": 0.818, + "num_tokens": 55548794728.0, + "step": 13290 + }, + { + "epoch": 1.5794414735591205, + "grad_norm": 0.2695757579970694, + "learning_rate": 4.008866501657913e-06, + "loss": 0.7866, + "num_tokens": 55552982241.0, + "step": 13291 + }, + { + "epoch": 1.5795603089720736, + "grad_norm": 0.23341522302130185, + "learning_rate": 4.007775914711283e-06, + "loss": 0.7853, + "num_tokens": 55557172376.0, + "step": 13292 + }, + { + "epoch": 1.5796791443850267, + "grad_norm": 0.24297505483581835, + "learning_rate": 4.006685586710356e-06, + "loss": 0.7992, + "num_tokens": 55561335283.0, + "step": 13293 + }, + { + "epoch": 1.5797979797979798, + "grad_norm": 0.24957760683932104, + "learning_rate": 4.005595517695507e-06, + "loss": 0.7918, + "num_tokens": 55565525917.0, + "step": 13294 + }, + { + "epoch": 1.5799168152109329, + "grad_norm": 0.2398688748329276, + "learning_rate": 4.004505707707104e-06, + "loss": 0.8122, + "num_tokens": 55569702143.0, + "step": 13295 + }, + { + "epoch": 1.580035650623886, + "grad_norm": 0.24755214127014386, + "learning_rate": 4.003416156785506e-06, + "loss": 0.7985, + "num_tokens": 55573879628.0, + "step": 13296 + }, + { + "epoch": 1.580154486036839, + "grad_norm": 0.2574020670786231, + "learning_rate": 4.0023268649710655e-06, + "loss": 0.8128, + "num_tokens": 55578067842.0, + "step": 13297 + }, + { + "epoch": 1.5802733214497922, + "grad_norm": 0.23600643190300996, + "learning_rate": 4.001237832304119e-06, + "loss": 0.8037, + "num_tokens": 55582244377.0, + "step": 13298 + }, + { + "epoch": 1.5803921568627453, + "grad_norm": 0.25080731015478014, + "learning_rate": 4.000149058824999e-06, + "loss": 0.7849, + "num_tokens": 55586432809.0, + "step": 13299 + }, + { + "epoch": 1.5805109922756981, + "grad_norm": 0.2249345704630251, + "learning_rate": 3.999060544574027e-06, + "loss": 0.7721, + "num_tokens": 55590622520.0, + "step": 13300 + }, + { + "epoch": 1.5806298276886512, + "grad_norm": 0.2633338880827352, + "learning_rate": 3.997972289591514e-06, + "loss": 0.7985, + "num_tokens": 55594796690.0, + "step": 13301 + }, + { + "epoch": 1.5807486631016041, + "grad_norm": 0.22367700726856274, + "learning_rate": 3.996884293917761e-06, + "loss": 0.7854, + "num_tokens": 55598986001.0, + "step": 13302 + }, + { + "epoch": 1.5808674985145572, + "grad_norm": 0.24970000968462186, + "learning_rate": 3.995796557593061e-06, + "loss": 0.8081, + "num_tokens": 55603174419.0, + "step": 13303 + }, + { + "epoch": 1.5809863339275103, + "grad_norm": 0.23119864345045613, + "learning_rate": 3.994709080657696e-06, + "loss": 0.8443, + "num_tokens": 55607348729.0, + "step": 13304 + }, + { + "epoch": 1.5811051693404634, + "grad_norm": 0.2522742577291039, + "learning_rate": 3.99362186315194e-06, + "loss": 0.8137, + "num_tokens": 55611535743.0, + "step": 13305 + }, + { + "epoch": 1.5812240047534165, + "grad_norm": 0.24393435092699614, + "learning_rate": 3.992534905116055e-06, + "loss": 0.8003, + "num_tokens": 55615724945.0, + "step": 13306 + }, + { + "epoch": 1.5813428401663696, + "grad_norm": 0.27544885633384486, + "learning_rate": 3.991448206590296e-06, + "loss": 0.8557, + "num_tokens": 55619912832.0, + "step": 13307 + }, + { + "epoch": 1.5814616755793227, + "grad_norm": 0.2474999855835953, + "learning_rate": 3.990361767614904e-06, + "loss": 0.8311, + "num_tokens": 55624102012.0, + "step": 13308 + }, + { + "epoch": 1.5815805109922758, + "grad_norm": 0.263227699322894, + "learning_rate": 3.9892755882301186e-06, + "loss": 0.7876, + "num_tokens": 55628291021.0, + "step": 13309 + }, + { + "epoch": 1.581699346405229, + "grad_norm": 0.24027555779389612, + "learning_rate": 3.988189668476161e-06, + "loss": 0.808, + "num_tokens": 55632480249.0, + "step": 13310 + }, + { + "epoch": 1.5818181818181818, + "grad_norm": 0.2718892642018682, + "learning_rate": 3.987104008393247e-06, + "loss": 0.8046, + "num_tokens": 55636668278.0, + "step": 13311 + }, + { + "epoch": 1.581937017231135, + "grad_norm": 0.24132697201365058, + "learning_rate": 3.9860186080215834e-06, + "loss": 0.8083, + "num_tokens": 55640849945.0, + "step": 13312 + }, + { + "epoch": 1.582055852644088, + "grad_norm": 0.25893433028717433, + "learning_rate": 3.984933467401364e-06, + "loss": 0.8394, + "num_tokens": 55645012525.0, + "step": 13313 + }, + { + "epoch": 1.5821746880570409, + "grad_norm": 0.24044747008658965, + "learning_rate": 3.9838485865727795e-06, + "loss": 0.8289, + "num_tokens": 55649201836.0, + "step": 13314 + }, + { + "epoch": 1.582293523469994, + "grad_norm": 0.26050836834762675, + "learning_rate": 3.982763965576002e-06, + "loss": 0.7856, + "num_tokens": 55653392336.0, + "step": 13315 + }, + { + "epoch": 1.582412358882947, + "grad_norm": 0.26914780884845124, + "learning_rate": 3.981679604451204e-06, + "loss": 0.8394, + "num_tokens": 55657567849.0, + "step": 13316 + }, + { + "epoch": 1.5825311942959002, + "grad_norm": 0.25682914638181314, + "learning_rate": 3.980595503238537e-06, + "loss": 0.8026, + "num_tokens": 55661756981.0, + "step": 13317 + }, + { + "epoch": 1.5826500297088533, + "grad_norm": 0.277475222585258, + "learning_rate": 3.979511661978149e-06, + "loss": 0.8603, + "num_tokens": 55665946110.0, + "step": 13318 + }, + { + "epoch": 1.5827688651218064, + "grad_norm": 0.2643511449747105, + "learning_rate": 3.978428080710184e-06, + "loss": 0.8237, + "num_tokens": 55670133907.0, + "step": 13319 + }, + { + "epoch": 1.5828877005347595, + "grad_norm": 0.23273188448769946, + "learning_rate": 3.977344759474767e-06, + "loss": 0.8099, + "num_tokens": 55674323549.0, + "step": 13320 + }, + { + "epoch": 1.5830065359477126, + "grad_norm": 0.2580410076093472, + "learning_rate": 3.976261698312021e-06, + "loss": 0.8478, + "num_tokens": 55678511925.0, + "step": 13321 + }, + { + "epoch": 1.5831253713606654, + "grad_norm": 0.25009804883693937, + "learning_rate": 3.975178897262049e-06, + "loss": 0.8233, + "num_tokens": 55682678055.0, + "step": 13322 + }, + { + "epoch": 1.5832442067736185, + "grad_norm": 0.25209668896969173, + "learning_rate": 3.974096356364954e-06, + "loss": 0.7882, + "num_tokens": 55686866960.0, + "step": 13323 + }, + { + "epoch": 1.5833630421865716, + "grad_norm": 0.24524692356596436, + "learning_rate": 3.973014075660824e-06, + "loss": 0.8165, + "num_tokens": 55691056851.0, + "step": 13324 + }, + { + "epoch": 1.5834818775995245, + "grad_norm": 0.25494136029331027, + "learning_rate": 3.9719320551897455e-06, + "loss": 0.8055, + "num_tokens": 55695246276.0, + "step": 13325 + }, + { + "epoch": 1.5836007130124776, + "grad_norm": 0.2565009720160252, + "learning_rate": 3.970850294991784e-06, + "loss": 0.8136, + "num_tokens": 55699435889.0, + "step": 13326 + }, + { + "epoch": 1.5837195484254307, + "grad_norm": 0.25165403244355683, + "learning_rate": 3.969768795107003e-06, + "loss": 0.8342, + "num_tokens": 55703616315.0, + "step": 13327 + }, + { + "epoch": 1.5838383838383838, + "grad_norm": 0.25681635135870806, + "learning_rate": 3.968687555575451e-06, + "loss": 0.7815, + "num_tokens": 55707805210.0, + "step": 13328 + }, + { + "epoch": 1.583957219251337, + "grad_norm": 0.2563027089217644, + "learning_rate": 3.967606576437174e-06, + "loss": 0.8277, + "num_tokens": 55711991239.0, + "step": 13329 + }, + { + "epoch": 1.58407605466429, + "grad_norm": 0.24399404216203274, + "learning_rate": 3.966525857732202e-06, + "loss": 0.7808, + "num_tokens": 55716181379.0, + "step": 13330 + }, + { + "epoch": 1.5841948900772431, + "grad_norm": 0.24003942886673185, + "learning_rate": 3.965445399500559e-06, + "loss": 0.7996, + "num_tokens": 55720371587.0, + "step": 13331 + }, + { + "epoch": 1.5843137254901962, + "grad_norm": 0.24203988447224598, + "learning_rate": 3.964365201782255e-06, + "loss": 0.8189, + "num_tokens": 55724561768.0, + "step": 13332 + }, + { + "epoch": 1.584432560903149, + "grad_norm": 0.23066726539669175, + "learning_rate": 3.963285264617298e-06, + "loss": 0.8222, + "num_tokens": 55728751398.0, + "step": 13333 + }, + { + "epoch": 1.5845513963161022, + "grad_norm": 0.23658681485177785, + "learning_rate": 3.962205588045679e-06, + "loss": 0.7871, + "num_tokens": 55732938261.0, + "step": 13334 + }, + { + "epoch": 1.5846702317290553, + "grad_norm": 0.23620156015977165, + "learning_rate": 3.9611261721073815e-06, + "loss": 0.8139, + "num_tokens": 55737116173.0, + "step": 13335 + }, + { + "epoch": 1.5847890671420082, + "grad_norm": 0.2498423916552788, + "learning_rate": 3.9600470168423815e-06, + "loss": 0.7983, + "num_tokens": 55741304039.0, + "step": 13336 + }, + { + "epoch": 1.5849079025549613, + "grad_norm": 0.23992610471614162, + "learning_rate": 3.958968122290642e-06, + "loss": 0.8474, + "num_tokens": 55745492254.0, + "step": 13337 + }, + { + "epoch": 1.5850267379679144, + "grad_norm": 0.24756551431033263, + "learning_rate": 3.95788948849212e-06, + "loss": 0.8059, + "num_tokens": 55749681513.0, + "step": 13338 + }, + { + "epoch": 1.5851455733808675, + "grad_norm": 0.26291419495715224, + "learning_rate": 3.956811115486761e-06, + "loss": 0.7796, + "num_tokens": 55753843033.0, + "step": 13339 + }, + { + "epoch": 1.5852644087938206, + "grad_norm": 0.2363476791843263, + "learning_rate": 3.9557330033145e-06, + "loss": 0.7744, + "num_tokens": 55758022910.0, + "step": 13340 + }, + { + "epoch": 1.5853832442067737, + "grad_norm": 0.23439834015245847, + "learning_rate": 3.954655152015261e-06, + "loss": 0.8149, + "num_tokens": 55762210684.0, + "step": 13341 + }, + { + "epoch": 1.5855020796197268, + "grad_norm": 0.24154137373161333, + "learning_rate": 3.953577561628968e-06, + "loss": 0.836, + "num_tokens": 55766398823.0, + "step": 13342 + }, + { + "epoch": 1.5856209150326799, + "grad_norm": 0.24523590491889288, + "learning_rate": 3.9525002321955145e-06, + "loss": 0.8169, + "num_tokens": 55770579316.0, + "step": 13343 + }, + { + "epoch": 1.5857397504456328, + "grad_norm": 0.2354500445494323, + "learning_rate": 3.95142316375481e-06, + "loss": 0.8066, + "num_tokens": 55774747071.0, + "step": 13344 + }, + { + "epoch": 1.5858585858585859, + "grad_norm": 0.24446635810023487, + "learning_rate": 3.950346356346737e-06, + "loss": 0.8089, + "num_tokens": 55778934937.0, + "step": 13345 + }, + { + "epoch": 1.585977421271539, + "grad_norm": 0.23863903031389216, + "learning_rate": 3.949269810011173e-06, + "loss": 0.7743, + "num_tokens": 55783122974.0, + "step": 13346 + }, + { + "epoch": 1.5860962566844918, + "grad_norm": 0.2611364368533171, + "learning_rate": 3.948193524787991e-06, + "loss": 0.84, + "num_tokens": 55787312122.0, + "step": 13347 + }, + { + "epoch": 1.586215092097445, + "grad_norm": 0.24815060902572872, + "learning_rate": 3.9471175007170424e-06, + "loss": 0.7918, + "num_tokens": 55791479573.0, + "step": 13348 + }, + { + "epoch": 1.586333927510398, + "grad_norm": 0.2616331420477718, + "learning_rate": 3.946041737838179e-06, + "loss": 0.8265, + "num_tokens": 55795666167.0, + "step": 13349 + }, + { + "epoch": 1.5864527629233511, + "grad_norm": 0.26632770968288444, + "learning_rate": 3.9449662361912374e-06, + "loss": 0.823, + "num_tokens": 55799855988.0, + "step": 13350 + }, + { + "epoch": 1.5865715983363042, + "grad_norm": 0.23669268126038684, + "learning_rate": 3.943890995816052e-06, + "loss": 0.7905, + "num_tokens": 55804043698.0, + "step": 13351 + }, + { + "epoch": 1.5866904337492573, + "grad_norm": 0.2505435584219182, + "learning_rate": 3.942816016752443e-06, + "loss": 0.7886, + "num_tokens": 55808202355.0, + "step": 13352 + }, + { + "epoch": 1.5868092691622104, + "grad_norm": 0.2531467363291731, + "learning_rate": 3.941741299040217e-06, + "loss": 0.7854, + "num_tokens": 55812384127.0, + "step": 13353 + }, + { + "epoch": 1.5869281045751635, + "grad_norm": 0.2701339319815203, + "learning_rate": 3.940666842719173e-06, + "loss": 0.798, + "num_tokens": 55816550522.0, + "step": 13354 + }, + { + "epoch": 1.5870469399881164, + "grad_norm": 0.2503277491097627, + "learning_rate": 3.939592647829105e-06, + "loss": 0.8071, + "num_tokens": 55820711535.0, + "step": 13355 + }, + { + "epoch": 1.5871657754010695, + "grad_norm": 0.2311979545093704, + "learning_rate": 3.93851871440979e-06, + "loss": 0.8065, + "num_tokens": 55824879405.0, + "step": 13356 + }, + { + "epoch": 1.5872846108140226, + "grad_norm": 0.26577273939708046, + "learning_rate": 3.937445042501008e-06, + "loss": 0.8034, + "num_tokens": 55829045187.0, + "step": 13357 + }, + { + "epoch": 1.5874034462269755, + "grad_norm": 0.23199094347150598, + "learning_rate": 3.936371632142513e-06, + "loss": 0.8549, + "num_tokens": 55833234886.0, + "step": 13358 + }, + { + "epoch": 1.5875222816399286, + "grad_norm": 0.2610058311082744, + "learning_rate": 3.935298483374058e-06, + "loss": 0.8124, + "num_tokens": 55837423262.0, + "step": 13359 + }, + { + "epoch": 1.5876411170528817, + "grad_norm": 0.24709161043878558, + "learning_rate": 3.934225596235387e-06, + "loss": 0.8057, + "num_tokens": 55841613669.0, + "step": 13360 + }, + { + "epoch": 1.5877599524658348, + "grad_norm": 0.2579203442792232, + "learning_rate": 3.933152970766232e-06, + "loss": 0.8214, + "num_tokens": 55845802725.0, + "step": 13361 + }, + { + "epoch": 1.587878787878788, + "grad_norm": 0.25547786160100877, + "learning_rate": 3.932080607006317e-06, + "loss": 0.78, + "num_tokens": 55849991580.0, + "step": 13362 + }, + { + "epoch": 1.587997623291741, + "grad_norm": 0.23432502514978673, + "learning_rate": 3.931008504995353e-06, + "loss": 0.8169, + "num_tokens": 55854181563.0, + "step": 13363 + }, + { + "epoch": 1.588116458704694, + "grad_norm": 0.25320189467823495, + "learning_rate": 3.929936664773045e-06, + "loss": 0.8071, + "num_tokens": 55858371995.0, + "step": 13364 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.22679687351112085, + "learning_rate": 3.9288650863790874e-06, + "loss": 0.8228, + "num_tokens": 55862538679.0, + "step": 13365 + }, + { + "epoch": 1.5883541295306, + "grad_norm": 0.2456902345533431, + "learning_rate": 3.927793769853163e-06, + "loss": 0.829, + "num_tokens": 55866729611.0, + "step": 13366 + }, + { + "epoch": 1.5884729649435532, + "grad_norm": 0.23075557587609885, + "learning_rate": 3.926722715234948e-06, + "loss": 0.8472, + "num_tokens": 55870891636.0, + "step": 13367 + }, + { + "epoch": 1.5885918003565063, + "grad_norm": 0.2631029333802916, + "learning_rate": 3.925651922564107e-06, + "loss": 0.7957, + "num_tokens": 55875055312.0, + "step": 13368 + }, + { + "epoch": 1.5887106357694591, + "grad_norm": 0.23055974022401993, + "learning_rate": 3.924581391880292e-06, + "loss": 0.8245, + "num_tokens": 55879216299.0, + "step": 13369 + }, + { + "epoch": 1.5888294711824122, + "grad_norm": 0.242506801848306, + "learning_rate": 3.923511123223151e-06, + "loss": 0.8199, + "num_tokens": 55883382473.0, + "step": 13370 + }, + { + "epoch": 1.5889483065953653, + "grad_norm": 0.26287714371487353, + "learning_rate": 3.922441116632321e-06, + "loss": 0.8124, + "num_tokens": 55887569782.0, + "step": 13371 + }, + { + "epoch": 1.5890671420083184, + "grad_norm": 0.23238675594422484, + "learning_rate": 3.9213713721474246e-06, + "loss": 0.8336, + "num_tokens": 55891734910.0, + "step": 13372 + }, + { + "epoch": 1.5891859774212715, + "grad_norm": 0.24792379341897225, + "learning_rate": 3.920301889808082e-06, + "loss": 0.8081, + "num_tokens": 55895906102.0, + "step": 13373 + }, + { + "epoch": 1.5893048128342246, + "grad_norm": 0.2326201274103376, + "learning_rate": 3.919232669653895e-06, + "loss": 0.8399, + "num_tokens": 55900092809.0, + "step": 13374 + }, + { + "epoch": 1.5894236482471777, + "grad_norm": 0.23402460711914985, + "learning_rate": 3.9181637117244616e-06, + "loss": 0.7731, + "num_tokens": 55904281300.0, + "step": 13375 + }, + { + "epoch": 1.5895424836601308, + "grad_norm": 0.23604092693891418, + "learning_rate": 3.917095016059373e-06, + "loss": 0.8225, + "num_tokens": 55908470605.0, + "step": 13376 + }, + { + "epoch": 1.589661319073084, + "grad_norm": 0.24967106672507824, + "learning_rate": 3.916026582698203e-06, + "loss": 0.8164, + "num_tokens": 55912626761.0, + "step": 13377 + }, + { + "epoch": 1.5897801544860368, + "grad_norm": 0.23877038322850658, + "learning_rate": 3.914958411680521e-06, + "loss": 0.7848, + "num_tokens": 55916815548.0, + "step": 13378 + }, + { + "epoch": 1.58989898989899, + "grad_norm": 0.2377150896216219, + "learning_rate": 3.913890503045884e-06, + "loss": 0.7848, + "num_tokens": 55921004534.0, + "step": 13379 + }, + { + "epoch": 1.5900178253119428, + "grad_norm": 0.24956291972225306, + "learning_rate": 3.912822856833839e-06, + "loss": 0.8046, + "num_tokens": 55925193314.0, + "step": 13380 + }, + { + "epoch": 1.590136660724896, + "grad_norm": 0.24043872149568343, + "learning_rate": 3.911755473083925e-06, + "loss": 0.8128, + "num_tokens": 55929382646.0, + "step": 13381 + }, + { + "epoch": 1.590255496137849, + "grad_norm": 0.24687159564894634, + "learning_rate": 3.910688351835673e-06, + "loss": 0.8168, + "num_tokens": 55933572612.0, + "step": 13382 + }, + { + "epoch": 1.590374331550802, + "grad_norm": 0.24184257856075161, + "learning_rate": 3.909621493128604e-06, + "loss": 0.7931, + "num_tokens": 55937683571.0, + "step": 13383 + }, + { + "epoch": 1.5904931669637552, + "grad_norm": 0.2409780117098945, + "learning_rate": 3.90855489700222e-06, + "loss": 0.8053, + "num_tokens": 55941845186.0, + "step": 13384 + }, + { + "epoch": 1.5906120023767083, + "grad_norm": 0.25014471141648875, + "learning_rate": 3.907488563496027e-06, + "loss": 0.817, + "num_tokens": 55946033092.0, + "step": 13385 + }, + { + "epoch": 1.5907308377896614, + "grad_norm": 0.23602571072462794, + "learning_rate": 3.906422492649512e-06, + "loss": 0.7948, + "num_tokens": 55950198723.0, + "step": 13386 + }, + { + "epoch": 1.5908496732026145, + "grad_norm": 0.2668175167362417, + "learning_rate": 3.905356684502156e-06, + "loss": 0.8346, + "num_tokens": 55954382855.0, + "step": 13387 + }, + { + "epoch": 1.5909685086155676, + "grad_norm": 0.25557352050447596, + "learning_rate": 3.904291139093429e-06, + "loss": 0.7936, + "num_tokens": 55958571812.0, + "step": 13388 + }, + { + "epoch": 1.5910873440285205, + "grad_norm": 0.24205383221092047, + "learning_rate": 3.903225856462793e-06, + "loss": 0.7985, + "num_tokens": 55962711781.0, + "step": 13389 + }, + { + "epoch": 1.5912061794414736, + "grad_norm": 0.24610710192778137, + "learning_rate": 3.902160836649699e-06, + "loss": 0.7792, + "num_tokens": 55966902547.0, + "step": 13390 + }, + { + "epoch": 1.5913250148544265, + "grad_norm": 0.2576022325346529, + "learning_rate": 3.901096079693587e-06, + "loss": 0.8033, + "num_tokens": 55971078707.0, + "step": 13391 + }, + { + "epoch": 1.5914438502673796, + "grad_norm": 0.2533303446689999, + "learning_rate": 3.900031585633889e-06, + "loss": 0.8233, + "num_tokens": 55975263583.0, + "step": 13392 + }, + { + "epoch": 1.5915626856803327, + "grad_norm": 0.2462430171910828, + "learning_rate": 3.898967354510027e-06, + "loss": 0.7959, + "num_tokens": 55979453092.0, + "step": 13393 + }, + { + "epoch": 1.5916815210932858, + "grad_norm": 0.2692221174656113, + "learning_rate": 3.897903386361413e-06, + "loss": 0.8103, + "num_tokens": 55983619594.0, + "step": 13394 + }, + { + "epoch": 1.5918003565062389, + "grad_norm": 0.2459532979937308, + "learning_rate": 3.896839681227449e-06, + "loss": 0.8022, + "num_tokens": 55987776935.0, + "step": 13395 + }, + { + "epoch": 1.591919191919192, + "grad_norm": 0.26189438120939657, + "learning_rate": 3.8957762391475285e-06, + "loss": 0.787, + "num_tokens": 55991937245.0, + "step": 13396 + }, + { + "epoch": 1.592038027332145, + "grad_norm": 0.23496669804681888, + "learning_rate": 3.894713060161034e-06, + "loss": 0.7874, + "num_tokens": 55996123585.0, + "step": 13397 + }, + { + "epoch": 1.5921568627450982, + "grad_norm": 0.2536994481647347, + "learning_rate": 3.8936501443073375e-06, + "loss": 0.8241, + "num_tokens": 56000312206.0, + "step": 13398 + }, + { + "epoch": 1.5922756981580513, + "grad_norm": 0.2517454432904562, + "learning_rate": 3.892587491625805e-06, + "loss": 0.7911, + "num_tokens": 56004501999.0, + "step": 13399 + }, + { + "epoch": 1.5923945335710041, + "grad_norm": 0.25246218735900455, + "learning_rate": 3.891525102155788e-06, + "loss": 0.8125, + "num_tokens": 56008672289.0, + "step": 13400 + }, + { + "epoch": 1.5925133689839572, + "grad_norm": 0.25219550102302796, + "learning_rate": 3.890462975936627e-06, + "loss": 0.8216, + "num_tokens": 56012860604.0, + "step": 13401 + }, + { + "epoch": 1.5926322043969103, + "grad_norm": 0.25736577644034025, + "learning_rate": 3.889401113007664e-06, + "loss": 0.822, + "num_tokens": 56017035077.0, + "step": 13402 + }, + { + "epoch": 1.5927510398098632, + "grad_norm": 0.2964457336319206, + "learning_rate": 3.8883395134082176e-06, + "loss": 0.8103, + "num_tokens": 56021222643.0, + "step": 13403 + }, + { + "epoch": 1.5928698752228163, + "grad_norm": 0.25785176549326466, + "learning_rate": 3.887278177177607e-06, + "loss": 0.8041, + "num_tokens": 56025411951.0, + "step": 13404 + }, + { + "epoch": 1.5929887106357694, + "grad_norm": 0.2397588950704635, + "learning_rate": 3.8862171043551325e-06, + "loss": 0.797, + "num_tokens": 56029577916.0, + "step": 13405 + }, + { + "epoch": 1.5931075460487225, + "grad_norm": 0.27035855068701603, + "learning_rate": 3.8851562949800905e-06, + "loss": 0.7862, + "num_tokens": 56033766919.0, + "step": 13406 + }, + { + "epoch": 1.5932263814616756, + "grad_norm": 0.2588394519378606, + "learning_rate": 3.8840957490917645e-06, + "loss": 0.8107, + "num_tokens": 56037954515.0, + "step": 13407 + }, + { + "epoch": 1.5933452168746287, + "grad_norm": 0.2729508310200619, + "learning_rate": 3.883035466729433e-06, + "loss": 0.7848, + "num_tokens": 56042095899.0, + "step": 13408 + }, + { + "epoch": 1.5934640522875818, + "grad_norm": 0.24110505852231864, + "learning_rate": 3.881975447932365e-06, + "loss": 0.8291, + "num_tokens": 56046284174.0, + "step": 13409 + }, + { + "epoch": 1.593582887700535, + "grad_norm": 0.2792895019242076, + "learning_rate": 3.8809156927398114e-06, + "loss": 0.8375, + "num_tokens": 56050457786.0, + "step": 13410 + }, + { + "epoch": 1.5937017231134878, + "grad_norm": 0.2769432559498342, + "learning_rate": 3.8798562011910175e-06, + "loss": 0.8105, + "num_tokens": 56054648043.0, + "step": 13411 + }, + { + "epoch": 1.593820558526441, + "grad_norm": 0.2288737187362074, + "learning_rate": 3.878796973325225e-06, + "loss": 0.7743, + "num_tokens": 56058837411.0, + "step": 13412 + }, + { + "epoch": 1.593939393939394, + "grad_norm": 0.26968278198374945, + "learning_rate": 3.877738009181653e-06, + "loss": 0.7552, + "num_tokens": 56063026070.0, + "step": 13413 + }, + { + "epoch": 1.5940582293523469, + "grad_norm": 0.23968262777024418, + "learning_rate": 3.87667930879953e-06, + "loss": 0.79, + "num_tokens": 56067215463.0, + "step": 13414 + }, + { + "epoch": 1.5941770647653, + "grad_norm": 0.2671969741771741, + "learning_rate": 3.8756208722180535e-06, + "loss": 0.796, + "num_tokens": 56071384407.0, + "step": 13415 + }, + { + "epoch": 1.594295900178253, + "grad_norm": 0.26395606412579625, + "learning_rate": 3.874562699476424e-06, + "loss": 0.8198, + "num_tokens": 56075571812.0, + "step": 13416 + }, + { + "epoch": 1.5944147355912062, + "grad_norm": 0.2735350780364593, + "learning_rate": 3.8735047906138304e-06, + "loss": 0.793, + "num_tokens": 56079678085.0, + "step": 13417 + }, + { + "epoch": 1.5945335710041593, + "grad_norm": 0.24779528563258119, + "learning_rate": 3.872447145669449e-06, + "loss": 0.7932, + "num_tokens": 56083868257.0, + "step": 13418 + }, + { + "epoch": 1.5946524064171124, + "grad_norm": 0.24629013722519297, + "learning_rate": 3.871389764682448e-06, + "loss": 0.7748, + "num_tokens": 56088053018.0, + "step": 13419 + }, + { + "epoch": 1.5947712418300655, + "grad_norm": 0.2431609558110121, + "learning_rate": 3.870332647691988e-06, + "loss": 0.8254, + "num_tokens": 56092241292.0, + "step": 13420 + }, + { + "epoch": 1.5948900772430186, + "grad_norm": 0.23352049348611723, + "learning_rate": 3.869275794737215e-06, + "loss": 0.8287, + "num_tokens": 56096431773.0, + "step": 13421 + }, + { + "epoch": 1.5950089126559714, + "grad_norm": 0.24412659338085785, + "learning_rate": 3.868219205857269e-06, + "loss": 0.8223, + "num_tokens": 56100592628.0, + "step": 13422 + }, + { + "epoch": 1.5951277480689245, + "grad_norm": 0.2461822740109957, + "learning_rate": 3.867162881091278e-06, + "loss": 0.8354, + "num_tokens": 56104749493.0, + "step": 13423 + }, + { + "epoch": 1.5952465834818776, + "grad_norm": 0.23547925731895425, + "learning_rate": 3.866106820478365e-06, + "loss": 0.8053, + "num_tokens": 56108892497.0, + "step": 13424 + }, + { + "epoch": 1.5953654188948305, + "grad_norm": 0.24418668439826066, + "learning_rate": 3.8650510240576345e-06, + "loss": 0.8143, + "num_tokens": 56113081582.0, + "step": 13425 + }, + { + "epoch": 1.5954842543077836, + "grad_norm": 0.2290096410648356, + "learning_rate": 3.863995491868189e-06, + "loss": 0.7795, + "num_tokens": 56117268875.0, + "step": 13426 + }, + { + "epoch": 1.5956030897207367, + "grad_norm": 0.2562889075032072, + "learning_rate": 3.862940223949117e-06, + "loss": 0.7956, + "num_tokens": 56121457618.0, + "step": 13427 + }, + { + "epoch": 1.5957219251336898, + "grad_norm": 0.24354230366298257, + "learning_rate": 3.8618852203395005e-06, + "loss": 0.7974, + "num_tokens": 56125645295.0, + "step": 13428 + }, + { + "epoch": 1.595840760546643, + "grad_norm": 0.24638629042147708, + "learning_rate": 3.8608304810784084e-06, + "loss": 0.8196, + "num_tokens": 56129833934.0, + "step": 13429 + }, + { + "epoch": 1.595959595959596, + "grad_norm": 0.2745187292187946, + "learning_rate": 3.859776006204904e-06, + "loss": 0.8045, + "num_tokens": 56134023505.0, + "step": 13430 + }, + { + "epoch": 1.5960784313725491, + "grad_norm": 0.2537774747109409, + "learning_rate": 3.858721795758034e-06, + "loss": 0.7908, + "num_tokens": 56138199418.0, + "step": 13431 + }, + { + "epoch": 1.5961972667855022, + "grad_norm": 0.2558223067952358, + "learning_rate": 3.857667849776838e-06, + "loss": 0.8076, + "num_tokens": 56142389803.0, + "step": 13432 + }, + { + "epoch": 1.596316102198455, + "grad_norm": 0.2558766899811818, + "learning_rate": 3.856614168300352e-06, + "loss": 0.7866, + "num_tokens": 56146579462.0, + "step": 13433 + }, + { + "epoch": 1.5964349376114082, + "grad_norm": 0.23755467055671384, + "learning_rate": 3.8555607513675975e-06, + "loss": 0.8058, + "num_tokens": 56150769428.0, + "step": 13434 + }, + { + "epoch": 1.5965537730243613, + "grad_norm": 0.2603208004707725, + "learning_rate": 3.8545075990175855e-06, + "loss": 0.8211, + "num_tokens": 56154952405.0, + "step": 13435 + }, + { + "epoch": 1.5966726084373142, + "grad_norm": 0.23757433380048557, + "learning_rate": 3.853454711289314e-06, + "loss": 0.8293, + "num_tokens": 56159103971.0, + "step": 13436 + }, + { + "epoch": 1.5967914438502673, + "grad_norm": 0.24313160850156895, + "learning_rate": 3.852402088221779e-06, + "loss": 0.7903, + "num_tokens": 56163270246.0, + "step": 13437 + }, + { + "epoch": 1.5969102792632204, + "grad_norm": 0.24549345612939888, + "learning_rate": 3.85134972985396e-06, + "loss": 0.826, + "num_tokens": 56167459720.0, + "step": 13438 + }, + { + "epoch": 1.5970291146761735, + "grad_norm": 0.24963439317573927, + "learning_rate": 3.850297636224828e-06, + "loss": 0.7809, + "num_tokens": 56171647824.0, + "step": 13439 + }, + { + "epoch": 1.5971479500891266, + "grad_norm": 0.2482415947216052, + "learning_rate": 3.849245807373354e-06, + "loss": 0.8026, + "num_tokens": 56175835205.0, + "step": 13440 + }, + { + "epoch": 1.5972667855020797, + "grad_norm": 0.24846355170329873, + "learning_rate": 3.848194243338482e-06, + "loss": 0.7796, + "num_tokens": 56180002677.0, + "step": 13441 + }, + { + "epoch": 1.5973856209150328, + "grad_norm": 0.23447262476235295, + "learning_rate": 3.847142944159159e-06, + "loss": 0.8642, + "num_tokens": 56184191323.0, + "step": 13442 + }, + { + "epoch": 1.5975044563279859, + "grad_norm": 0.24860287105372114, + "learning_rate": 3.846091909874316e-06, + "loss": 0.8082, + "num_tokens": 56188381088.0, + "step": 13443 + }, + { + "epoch": 1.5976232917409388, + "grad_norm": 0.23973885951456875, + "learning_rate": 3.845041140522878e-06, + "loss": 0.7625, + "num_tokens": 56192554428.0, + "step": 13444 + }, + { + "epoch": 1.5977421271538919, + "grad_norm": 0.2489598281724174, + "learning_rate": 3.843990636143759e-06, + "loss": 0.7902, + "num_tokens": 56196732717.0, + "step": 13445 + }, + { + "epoch": 1.597860962566845, + "grad_norm": 0.25091213868653905, + "learning_rate": 3.842940396775861e-06, + "loss": 0.775, + "num_tokens": 56200920965.0, + "step": 13446 + }, + { + "epoch": 1.5979797979797978, + "grad_norm": 0.24294357333673808, + "learning_rate": 3.841890422458079e-06, + "loss": 0.8355, + "num_tokens": 56205083513.0, + "step": 13447 + }, + { + "epoch": 1.598098633392751, + "grad_norm": 0.23797199890399992, + "learning_rate": 3.8408407132292955e-06, + "loss": 0.7756, + "num_tokens": 56209244758.0, + "step": 13448 + }, + { + "epoch": 1.598217468805704, + "grad_norm": 0.2473499090251822, + "learning_rate": 3.839791269128387e-06, + "loss": 0.7672, + "num_tokens": 56213434191.0, + "step": 13449 + }, + { + "epoch": 1.5983363042186571, + "grad_norm": 0.23312069054838447, + "learning_rate": 3.838742090194217e-06, + "loss": 0.8174, + "num_tokens": 56217623073.0, + "step": 13450 + }, + { + "epoch": 1.5984551396316102, + "grad_norm": 0.24987897953713895, + "learning_rate": 3.83769317646564e-06, + "loss": 0.8282, + "num_tokens": 56221811282.0, + "step": 13451 + }, + { + "epoch": 1.5985739750445633, + "grad_norm": 0.2792997821420848, + "learning_rate": 3.836644527981501e-06, + "loss": 0.8064, + "num_tokens": 56225995841.0, + "step": 13452 + }, + { + "epoch": 1.5986928104575164, + "grad_norm": 0.2525462557639834, + "learning_rate": 3.835596144780635e-06, + "loss": 0.8128, + "num_tokens": 56230184131.0, + "step": 13453 + }, + { + "epoch": 1.5988116458704695, + "grad_norm": 0.2559159029484149, + "learning_rate": 3.834548026901868e-06, + "loss": 0.7783, + "num_tokens": 56234373379.0, + "step": 13454 + }, + { + "epoch": 1.5989304812834224, + "grad_norm": 0.25633802994997407, + "learning_rate": 3.833500174384013e-06, + "loss": 0.8024, + "num_tokens": 56238562674.0, + "step": 13455 + }, + { + "epoch": 1.5990493166963755, + "grad_norm": 0.28539289729230416, + "learning_rate": 3.832452587265878e-06, + "loss": 0.8458, + "num_tokens": 56242751525.0, + "step": 13456 + }, + { + "epoch": 1.5991681521093286, + "grad_norm": 0.2577707772810889, + "learning_rate": 3.831405265586261e-06, + "loss": 0.8113, + "num_tokens": 56246907498.0, + "step": 13457 + }, + { + "epoch": 1.5992869875222815, + "grad_norm": 0.2766519868606894, + "learning_rate": 3.830358209383939e-06, + "loss": 0.7786, + "num_tokens": 56251097868.0, + "step": 13458 + }, + { + "epoch": 1.5994058229352346, + "grad_norm": 0.2572577996058098, + "learning_rate": 3.8293114186976956e-06, + "loss": 0.8108, + "num_tokens": 56255263544.0, + "step": 13459 + }, + { + "epoch": 1.5995246583481877, + "grad_norm": 0.28334285768140194, + "learning_rate": 3.828264893566295e-06, + "loss": 0.8126, + "num_tokens": 56259452773.0, + "step": 13460 + }, + { + "epoch": 1.5996434937611408, + "grad_norm": 0.28543531406285866, + "learning_rate": 3.827218634028493e-06, + "loss": 0.8407, + "num_tokens": 56263626432.0, + "step": 13461 + }, + { + "epoch": 1.5997623291740939, + "grad_norm": 0.24015659301687728, + "learning_rate": 3.826172640123041e-06, + "loss": 0.7931, + "num_tokens": 56267791306.0, + "step": 13462 + }, + { + "epoch": 1.599881164587047, + "grad_norm": 0.2665019404997498, + "learning_rate": 3.825126911888668e-06, + "loss": 0.8177, + "num_tokens": 56271976147.0, + "step": 13463 + }, + { + "epoch": 1.6, + "grad_norm": 0.24988870919283956, + "learning_rate": 3.8240814493641025e-06, + "loss": 0.8147, + "num_tokens": 56276166335.0, + "step": 13464 + }, + { + "epoch": 1.6001188354129532, + "grad_norm": 0.27256918752155773, + "learning_rate": 3.823036252588066e-06, + "loss": 0.8053, + "num_tokens": 56280357108.0, + "step": 13465 + }, + { + "epoch": 1.600237670825906, + "grad_norm": 0.2456732070623033, + "learning_rate": 3.821991321599261e-06, + "loss": 0.8214, + "num_tokens": 56284547970.0, + "step": 13466 + }, + { + "epoch": 1.6003565062388592, + "grad_norm": 0.23820523994321088, + "learning_rate": 3.8209466564363905e-06, + "loss": 0.8214, + "num_tokens": 56288707210.0, + "step": 13467 + }, + { + "epoch": 1.6004753416518123, + "grad_norm": 0.2424574129363314, + "learning_rate": 3.819902257138136e-06, + "loss": 0.7929, + "num_tokens": 56292896001.0, + "step": 13468 + }, + { + "epoch": 1.6005941770647651, + "grad_norm": 0.23268709359157141, + "learning_rate": 3.818858123743176e-06, + "loss": 0.8076, + "num_tokens": 56297078146.0, + "step": 13469 + }, + { + "epoch": 1.6007130124777182, + "grad_norm": 0.25754659350968007, + "learning_rate": 3.817814256290178e-06, + "loss": 0.7994, + "num_tokens": 56301265332.0, + "step": 13470 + }, + { + "epoch": 1.6008318478906713, + "grad_norm": 0.24474881725137246, + "learning_rate": 3.816770654817805e-06, + "loss": 0.8077, + "num_tokens": 56305453995.0, + "step": 13471 + }, + { + "epoch": 1.6009506833036244, + "grad_norm": 0.26764901676390446, + "learning_rate": 3.815727319364705e-06, + "loss": 0.793, + "num_tokens": 56309645101.0, + "step": 13472 + }, + { + "epoch": 1.6010695187165775, + "grad_norm": 0.2660342438459325, + "learning_rate": 3.814684249969508e-06, + "loss": 0.7887, + "num_tokens": 56313835113.0, + "step": 13473 + }, + { + "epoch": 1.6011883541295306, + "grad_norm": 0.24376492633805216, + "learning_rate": 3.8136414466708484e-06, + "loss": 0.8311, + "num_tokens": 56318025347.0, + "step": 13474 + }, + { + "epoch": 1.6013071895424837, + "grad_norm": 0.26911659706353763, + "learning_rate": 3.8125989095073434e-06, + "loss": 0.827, + "num_tokens": 56322213254.0, + "step": 13475 + }, + { + "epoch": 1.6014260249554368, + "grad_norm": 0.2395693733559095, + "learning_rate": 3.811556638517603e-06, + "loss": 0.7996, + "num_tokens": 56326403052.0, + "step": 13476 + }, + { + "epoch": 1.60154486036839, + "grad_norm": 0.24497967658754807, + "learning_rate": 3.8105146337402244e-06, + "loss": 0.8188, + "num_tokens": 56330568161.0, + "step": 13477 + }, + { + "epoch": 1.6016636957813428, + "grad_norm": 0.2497248208971386, + "learning_rate": 3.809472895213797e-06, + "loss": 0.8461, + "num_tokens": 56334758347.0, + "step": 13478 + }, + { + "epoch": 1.601782531194296, + "grad_norm": 0.2416429575960266, + "learning_rate": 3.8084314229768993e-06, + "loss": 0.8159, + "num_tokens": 56338947250.0, + "step": 13479 + }, + { + "epoch": 1.6019013666072488, + "grad_norm": 0.23819267551658113, + "learning_rate": 3.8073902170681033e-06, + "loss": 0.7899, + "num_tokens": 56343137383.0, + "step": 13480 + }, + { + "epoch": 1.602020202020202, + "grad_norm": 0.23286943049117437, + "learning_rate": 3.806349277525964e-06, + "loss": 0.7928, + "num_tokens": 56347325547.0, + "step": 13481 + }, + { + "epoch": 1.602139037433155, + "grad_norm": 0.25514203890753057, + "learning_rate": 3.805308604389035e-06, + "loss": 0.8291, + "num_tokens": 56351513331.0, + "step": 13482 + }, + { + "epoch": 1.602257872846108, + "grad_norm": 0.25219730341821345, + "learning_rate": 3.8042681976958527e-06, + "loss": 0.8013, + "num_tokens": 56355701692.0, + "step": 13483 + }, + { + "epoch": 1.6023767082590612, + "grad_norm": 0.23961757666828534, + "learning_rate": 3.8032280574849495e-06, + "loss": 0.8245, + "num_tokens": 56359889382.0, + "step": 13484 + }, + { + "epoch": 1.6024955436720143, + "grad_norm": 0.2623591740422938, + "learning_rate": 3.8021881837948437e-06, + "loss": 0.7743, + "num_tokens": 56364011609.0, + "step": 13485 + }, + { + "epoch": 1.6026143790849674, + "grad_norm": 0.27697354059850765, + "learning_rate": 3.8011485766640467e-06, + "loss": 0.8328, + "num_tokens": 56368202266.0, + "step": 13486 + }, + { + "epoch": 1.6027332144979205, + "grad_norm": 0.25243619105544, + "learning_rate": 3.8001092361310566e-06, + "loss": 0.8054, + "num_tokens": 56372391853.0, + "step": 13487 + }, + { + "epoch": 1.6028520499108736, + "grad_norm": 0.2588717773544229, + "learning_rate": 3.799070162234369e-06, + "loss": 0.8364, + "num_tokens": 56376582290.0, + "step": 13488 + }, + { + "epoch": 1.6029708853238265, + "grad_norm": 0.2681948028882203, + "learning_rate": 3.7980313550124563e-06, + "loss": 0.804, + "num_tokens": 56380758353.0, + "step": 13489 + }, + { + "epoch": 1.6030897207367796, + "grad_norm": 0.25631688927284707, + "learning_rate": 3.7969928145037904e-06, + "loss": 0.7864, + "num_tokens": 56384894701.0, + "step": 13490 + }, + { + "epoch": 1.6032085561497325, + "grad_norm": 0.26162576905996754, + "learning_rate": 3.795954540746838e-06, + "loss": 0.791, + "num_tokens": 56389081177.0, + "step": 13491 + }, + { + "epoch": 1.6033273915626856, + "grad_norm": 0.23676353018697008, + "learning_rate": 3.7949165337800454e-06, + "loss": 0.8158, + "num_tokens": 56393232232.0, + "step": 13492 + }, + { + "epoch": 1.6034462269756387, + "grad_norm": 0.2374424293192055, + "learning_rate": 3.7938787936418577e-06, + "loss": 0.7773, + "num_tokens": 56397422596.0, + "step": 13493 + }, + { + "epoch": 1.6035650623885918, + "grad_norm": 0.23512987564545967, + "learning_rate": 3.7928413203707e-06, + "loss": 0.7914, + "num_tokens": 56401586861.0, + "step": 13494 + }, + { + "epoch": 1.6036838978015449, + "grad_norm": 0.2460859026944939, + "learning_rate": 3.791804114004996e-06, + "loss": 0.8237, + "num_tokens": 56405775362.0, + "step": 13495 + }, + { + "epoch": 1.603802733214498, + "grad_norm": 0.2237860753184551, + "learning_rate": 3.7907671745831554e-06, + "loss": 0.7957, + "num_tokens": 56409965726.0, + "step": 13496 + }, + { + "epoch": 1.603921568627451, + "grad_norm": 0.24825720720081626, + "learning_rate": 3.7897305021435837e-06, + "loss": 0.7855, + "num_tokens": 56414155221.0, + "step": 13497 + }, + { + "epoch": 1.6040404040404042, + "grad_norm": 0.23719158997128273, + "learning_rate": 3.788694096724671e-06, + "loss": 0.7982, + "num_tokens": 56418345618.0, + "step": 13498 + }, + { + "epoch": 1.6041592394533573, + "grad_norm": 0.23297491675149248, + "learning_rate": 3.7876579583647975e-06, + "loss": 0.8261, + "num_tokens": 56422536023.0, + "step": 13499 + }, + { + "epoch": 1.6042780748663101, + "grad_norm": 0.2353315328420969, + "learning_rate": 3.786622087102334e-06, + "loss": 0.8347, + "num_tokens": 56426716901.0, + "step": 13500 + }, + { + "epoch": 1.6043969102792632, + "grad_norm": 0.23937422068241132, + "learning_rate": 3.785586482975644e-06, + "loss": 0.7858, + "num_tokens": 56430905326.0, + "step": 13501 + }, + { + "epoch": 1.6045157456922163, + "grad_norm": 0.23171230315715322, + "learning_rate": 3.7845511460230767e-06, + "loss": 0.8205, + "num_tokens": 56435093735.0, + "step": 13502 + }, + { + "epoch": 1.6046345811051692, + "grad_norm": 0.2382684793691317, + "learning_rate": 3.7835160762829803e-06, + "loss": 0.806, + "num_tokens": 56439257176.0, + "step": 13503 + }, + { + "epoch": 1.6047534165181223, + "grad_norm": 0.23151906034095632, + "learning_rate": 3.7824812737936817e-06, + "loss": 0.7851, + "num_tokens": 56443446366.0, + "step": 13504 + }, + { + "epoch": 1.6048722519310754, + "grad_norm": 0.23109519109171128, + "learning_rate": 3.781446738593505e-06, + "loss": 0.818, + "num_tokens": 56447611726.0, + "step": 13505 + }, + { + "epoch": 1.6049910873440285, + "grad_norm": 0.234349215286623, + "learning_rate": 3.780412470720761e-06, + "loss": 0.8317, + "num_tokens": 56451781946.0, + "step": 13506 + }, + { + "epoch": 1.6051099227569816, + "grad_norm": 0.2546277556567546, + "learning_rate": 3.779378470213754e-06, + "loss": 0.8278, + "num_tokens": 56455924535.0, + "step": 13507 + }, + { + "epoch": 1.6052287581699347, + "grad_norm": 0.23594035157545054, + "learning_rate": 3.7783447371107757e-06, + "loss": 0.8188, + "num_tokens": 56460113005.0, + "step": 13508 + }, + { + "epoch": 1.6053475935828878, + "grad_norm": 0.26447991238439733, + "learning_rate": 3.777311271450108e-06, + "loss": 0.7992, + "num_tokens": 56464301210.0, + "step": 13509 + }, + { + "epoch": 1.605466428995841, + "grad_norm": 0.2375517116761738, + "learning_rate": 3.776278073270025e-06, + "loss": 0.8304, + "num_tokens": 56468477740.0, + "step": 13510 + }, + { + "epoch": 1.6055852644087938, + "grad_norm": 0.2486585651003128, + "learning_rate": 3.7752451426087888e-06, + "loss": 0.7902, + "num_tokens": 56472607535.0, + "step": 13511 + }, + { + "epoch": 1.6057040998217469, + "grad_norm": 0.23483462512923367, + "learning_rate": 3.7742124795046523e-06, + "loss": 0.7809, + "num_tokens": 56476796931.0, + "step": 13512 + }, + { + "epoch": 1.6058229352347, + "grad_norm": 0.24198363403762993, + "learning_rate": 3.77318008399586e-06, + "loss": 0.7852, + "num_tokens": 56480985001.0, + "step": 13513 + }, + { + "epoch": 1.6059417706476529, + "grad_norm": 0.22623591448034858, + "learning_rate": 3.7721479561206435e-06, + "loss": 0.8013, + "num_tokens": 56485174794.0, + "step": 13514 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.24280209487831134, + "learning_rate": 3.7711160959172234e-06, + "loss": 0.8422, + "num_tokens": 56489334334.0, + "step": 13515 + }, + { + "epoch": 1.606179441473559, + "grad_norm": 0.21734885503852605, + "learning_rate": 3.7700845034238175e-06, + "loss": 0.8197, + "num_tokens": 56493523931.0, + "step": 13516 + }, + { + "epoch": 1.6062982768865122, + "grad_norm": 0.25089262962493275, + "learning_rate": 3.769053178678628e-06, + "loss": 0.7573, + "num_tokens": 56497712163.0, + "step": 13517 + }, + { + "epoch": 1.6064171122994653, + "grad_norm": 0.23084829220495262, + "learning_rate": 3.7680221217198466e-06, + "loss": 0.843, + "num_tokens": 56501901730.0, + "step": 13518 + }, + { + "epoch": 1.6065359477124184, + "grad_norm": 0.2409524536473803, + "learning_rate": 3.7669913325856606e-06, + "loss": 0.8237, + "num_tokens": 56506090866.0, + "step": 13519 + }, + { + "epoch": 1.6066547831253715, + "grad_norm": 0.2458759329634315, + "learning_rate": 3.765960811314239e-06, + "loss": 0.819, + "num_tokens": 56510280868.0, + "step": 13520 + }, + { + "epoch": 1.6067736185383246, + "grad_norm": 0.250607751112464, + "learning_rate": 3.7649305579437446e-06, + "loss": 0.7921, + "num_tokens": 56514470179.0, + "step": 13521 + }, + { + "epoch": 1.6068924539512774, + "grad_norm": 0.24241995003686795, + "learning_rate": 3.7639005725123355e-06, + "loss": 0.8115, + "num_tokens": 56518639642.0, + "step": 13522 + }, + { + "epoch": 1.6070112893642305, + "grad_norm": 0.2563831005510493, + "learning_rate": 3.7628708550581563e-06, + "loss": 0.812, + "num_tokens": 56522826774.0, + "step": 13523 + }, + { + "epoch": 1.6071301247771836, + "grad_norm": 0.2358689422143434, + "learning_rate": 3.7618414056193385e-06, + "loss": 0.8327, + "num_tokens": 56526992824.0, + "step": 13524 + }, + { + "epoch": 1.6072489601901365, + "grad_norm": 0.25687692931245426, + "learning_rate": 3.760812224234005e-06, + "loss": 0.819, + "num_tokens": 56531154142.0, + "step": 13525 + }, + { + "epoch": 1.6073677956030896, + "grad_norm": 0.2559406453282995, + "learning_rate": 3.7597833109402716e-06, + "loss": 0.8219, + "num_tokens": 56535313258.0, + "step": 13526 + }, + { + "epoch": 1.6074866310160427, + "grad_norm": 0.25568377110987917, + "learning_rate": 3.758754665776242e-06, + "loss": 0.8264, + "num_tokens": 56539501355.0, + "step": 13527 + }, + { + "epoch": 1.6076054664289958, + "grad_norm": 0.2525144792410352, + "learning_rate": 3.7577262887800078e-06, + "loss": 0.8244, + "num_tokens": 56543689048.0, + "step": 13528 + }, + { + "epoch": 1.607724301841949, + "grad_norm": 0.22702578805639798, + "learning_rate": 3.7566981799896596e-06, + "loss": 0.7917, + "num_tokens": 56547876853.0, + "step": 13529 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 0.2630389402739906, + "learning_rate": 3.755670339443267e-06, + "loss": 0.8674, + "num_tokens": 56552032669.0, + "step": 13530 + }, + { + "epoch": 1.6079619726678551, + "grad_norm": 0.2335875425669341, + "learning_rate": 3.7546427671788945e-06, + "loss": 0.7829, + "num_tokens": 56556221694.0, + "step": 13531 + }, + { + "epoch": 1.6080808080808082, + "grad_norm": 0.24537780379780633, + "learning_rate": 3.753615463234598e-06, + "loss": 0.7991, + "num_tokens": 56560382328.0, + "step": 13532 + }, + { + "epoch": 1.608199643493761, + "grad_norm": 0.2513741784358811, + "learning_rate": 3.75258842764842e-06, + "loss": 0.817, + "num_tokens": 56564571297.0, + "step": 13533 + }, + { + "epoch": 1.6083184789067142, + "grad_norm": 0.23706537264618457, + "learning_rate": 3.751561660458398e-06, + "loss": 0.8182, + "num_tokens": 56568759678.0, + "step": 13534 + }, + { + "epoch": 1.6084373143196673, + "grad_norm": 0.2574641631999927, + "learning_rate": 3.7505351617025553e-06, + "loss": 0.7814, + "num_tokens": 56572941496.0, + "step": 13535 + }, + { + "epoch": 1.6085561497326202, + "grad_norm": 0.24881975768221748, + "learning_rate": 3.7495089314189067e-06, + "loss": 0.7739, + "num_tokens": 56577130928.0, + "step": 13536 + }, + { + "epoch": 1.6086749851455733, + "grad_norm": 0.24712427556995617, + "learning_rate": 3.748482969645457e-06, + "loss": 0.809, + "num_tokens": 56581307390.0, + "step": 13537 + }, + { + "epoch": 1.6087938205585264, + "grad_norm": 0.2341458691599311, + "learning_rate": 3.7474572764202e-06, + "loss": 0.8716, + "num_tokens": 56585495017.0, + "step": 13538 + }, + { + "epoch": 1.6089126559714795, + "grad_norm": 0.24895252601376017, + "learning_rate": 3.7464318517811218e-06, + "loss": 0.8135, + "num_tokens": 56589684437.0, + "step": 13539 + }, + { + "epoch": 1.6090314913844326, + "grad_norm": 0.24289413447382985, + "learning_rate": 3.7454066957661968e-06, + "loss": 0.8568, + "num_tokens": 56593842414.0, + "step": 13540 + }, + { + "epoch": 1.6091503267973857, + "grad_norm": 0.2519201663758937, + "learning_rate": 3.7443818084133904e-06, + "loss": 0.8051, + "num_tokens": 56598032247.0, + "step": 13541 + }, + { + "epoch": 1.6092691622103388, + "grad_norm": 0.23562351867724263, + "learning_rate": 3.7433571897606572e-06, + "loss": 0.7995, + "num_tokens": 56602171776.0, + "step": 13542 + }, + { + "epoch": 1.6093879976232919, + "grad_norm": 0.266028276768407, + "learning_rate": 3.7423328398459436e-06, + "loss": 0.8417, + "num_tokens": 56606351930.0, + "step": 13543 + }, + { + "epoch": 1.6095068330362448, + "grad_norm": 0.2515675932062706, + "learning_rate": 3.7413087587071826e-06, + "loss": 0.7928, + "num_tokens": 56610541893.0, + "step": 13544 + }, + { + "epoch": 1.6096256684491979, + "grad_norm": 0.24689262615888338, + "learning_rate": 3.7402849463823043e-06, + "loss": 0.7751, + "num_tokens": 56614731393.0, + "step": 13545 + }, + { + "epoch": 1.609744503862151, + "grad_norm": 0.24606521523803496, + "learning_rate": 3.7392614029092165e-06, + "loss": 0.8298, + "num_tokens": 56618921046.0, + "step": 13546 + }, + { + "epoch": 1.6098633392751038, + "grad_norm": 0.24297683536662343, + "learning_rate": 3.738238128325825e-06, + "loss": 0.7862, + "num_tokens": 56623110947.0, + "step": 13547 + }, + { + "epoch": 1.609982174688057, + "grad_norm": 0.2484522380995241, + "learning_rate": 3.7372151226700327e-06, + "loss": 0.8114, + "num_tokens": 56627261158.0, + "step": 13548 + }, + { + "epoch": 1.61010101010101, + "grad_norm": 0.26547982977512236, + "learning_rate": 3.736192385979719e-06, + "loss": 0.7991, + "num_tokens": 56631444509.0, + "step": 13549 + }, + { + "epoch": 1.6102198455139631, + "grad_norm": 0.25330989968435386, + "learning_rate": 3.735169918292763e-06, + "loss": 0.8042, + "num_tokens": 56635634790.0, + "step": 13550 + }, + { + "epoch": 1.6103386809269162, + "grad_norm": 0.23780540128503633, + "learning_rate": 3.7341477196470255e-06, + "loss": 0.8142, + "num_tokens": 56639797692.0, + "step": 13551 + }, + { + "epoch": 1.6104575163398693, + "grad_norm": 0.24373943124505745, + "learning_rate": 3.7331257900803657e-06, + "loss": 0.7938, + "num_tokens": 56643988428.0, + "step": 13552 + }, + { + "epoch": 1.6105763517528224, + "grad_norm": 0.24002367905191965, + "learning_rate": 3.7321041296306244e-06, + "loss": 0.805, + "num_tokens": 56648153718.0, + "step": 13553 + }, + { + "epoch": 1.6106951871657755, + "grad_norm": 0.23434931577580548, + "learning_rate": 3.7310827383356436e-06, + "loss": 0.7747, + "num_tokens": 56652343417.0, + "step": 13554 + }, + { + "epoch": 1.6108140225787284, + "grad_norm": 0.22432964043101206, + "learning_rate": 3.7300616162332472e-06, + "loss": 0.8046, + "num_tokens": 56656510851.0, + "step": 13555 + }, + { + "epoch": 1.6109328579916815, + "grad_norm": 0.23482914889523018, + "learning_rate": 3.729040763361247e-06, + "loss": 0.8183, + "num_tokens": 56660700053.0, + "step": 13556 + }, + { + "epoch": 1.6110516934046346, + "grad_norm": 0.23846748014066235, + "learning_rate": 3.728020179757452e-06, + "loss": 0.8021, + "num_tokens": 56664886767.0, + "step": 13557 + }, + { + "epoch": 1.6111705288175875, + "grad_norm": 0.23691833428932474, + "learning_rate": 3.7269998654596554e-06, + "loss": 0.8216, + "num_tokens": 56669075691.0, + "step": 13558 + }, + { + "epoch": 1.6112893642305406, + "grad_norm": 0.24208962355878386, + "learning_rate": 3.7259798205056453e-06, + "loss": 0.8332, + "num_tokens": 56673263891.0, + "step": 13559 + }, + { + "epoch": 1.6114081996434937, + "grad_norm": 0.2436575906780377, + "learning_rate": 3.7249600449331968e-06, + "loss": 0.7866, + "num_tokens": 56677446807.0, + "step": 13560 + }, + { + "epoch": 1.6115270350564468, + "grad_norm": 0.2463052363570099, + "learning_rate": 3.7239405387800753e-06, + "loss": 0.7939, + "num_tokens": 56681622175.0, + "step": 13561 + }, + { + "epoch": 1.6116458704693999, + "grad_norm": 0.2750153093920097, + "learning_rate": 3.722921302084036e-06, + "loss": 0.8161, + "num_tokens": 56685811858.0, + "step": 13562 + }, + { + "epoch": 1.611764705882353, + "grad_norm": 0.24723900757870537, + "learning_rate": 3.7219023348828265e-06, + "loss": 0.8258, + "num_tokens": 56689999728.0, + "step": 13563 + }, + { + "epoch": 1.611883541295306, + "grad_norm": 0.24453918327240726, + "learning_rate": 3.72088363721418e-06, + "loss": 0.8134, + "num_tokens": 56694188864.0, + "step": 13564 + }, + { + "epoch": 1.6120023767082592, + "grad_norm": 0.23548159877041694, + "learning_rate": 3.719865209115824e-06, + "loss": 0.7833, + "num_tokens": 56698373938.0, + "step": 13565 + }, + { + "epoch": 1.612121212121212, + "grad_norm": 0.2454152583678023, + "learning_rate": 3.7188470506254746e-06, + "loss": 0.8404, + "num_tokens": 56702561185.0, + "step": 13566 + }, + { + "epoch": 1.6122400475341652, + "grad_norm": 0.23858756555053814, + "learning_rate": 3.717829161780836e-06, + "loss": 0.7876, + "num_tokens": 56706750929.0, + "step": 13567 + }, + { + "epoch": 1.6123588829471183, + "grad_norm": 0.23617152828090665, + "learning_rate": 3.716811542619606e-06, + "loss": 0.7945, + "num_tokens": 56710924923.0, + "step": 13568 + }, + { + "epoch": 1.6124777183600711, + "grad_norm": 0.23421588192138038, + "learning_rate": 3.7157941931794694e-06, + "loss": 0.8069, + "num_tokens": 56715115020.0, + "step": 13569 + }, + { + "epoch": 1.6125965537730242, + "grad_norm": 0.2634891209120985, + "learning_rate": 3.7147771134981027e-06, + "loss": 0.8378, + "num_tokens": 56719304182.0, + "step": 13570 + }, + { + "epoch": 1.6127153891859773, + "grad_norm": 0.23902295816945554, + "learning_rate": 3.71376030361317e-06, + "loss": 0.7638, + "num_tokens": 56723492886.0, + "step": 13571 + }, + { + "epoch": 1.6128342245989304, + "grad_norm": 0.23799757931962007, + "learning_rate": 3.712743763562331e-06, + "loss": 0.8005, + "num_tokens": 56727682133.0, + "step": 13572 + }, + { + "epoch": 1.6129530600118835, + "grad_norm": 0.28031693891700693, + "learning_rate": 3.711727493383227e-06, + "loss": 0.8373, + "num_tokens": 56731871983.0, + "step": 13573 + }, + { + "epoch": 1.6130718954248366, + "grad_norm": 0.24188461642836814, + "learning_rate": 3.7107114931134973e-06, + "loss": 0.8298, + "num_tokens": 56736060558.0, + "step": 13574 + }, + { + "epoch": 1.6131907308377897, + "grad_norm": 0.23841368821012113, + "learning_rate": 3.7096957627907666e-06, + "loss": 0.8038, + "num_tokens": 56740249646.0, + "step": 13575 + }, + { + "epoch": 1.6133095662507428, + "grad_norm": 0.2427906018893999, + "learning_rate": 3.7086803024526502e-06, + "loss": 0.7868, + "num_tokens": 56744419261.0, + "step": 13576 + }, + { + "epoch": 1.613428401663696, + "grad_norm": 0.21339703054165843, + "learning_rate": 3.707665112136758e-06, + "loss": 0.7875, + "num_tokens": 56748608420.0, + "step": 13577 + }, + { + "epoch": 1.6135472370766488, + "grad_norm": 0.25198538230042633, + "learning_rate": 3.7066501918806795e-06, + "loss": 0.8231, + "num_tokens": 56752797579.0, + "step": 13578 + }, + { + "epoch": 1.613666072489602, + "grad_norm": 0.24979503273403253, + "learning_rate": 3.7056355417220026e-06, + "loss": 0.8329, + "num_tokens": 56756986187.0, + "step": 13579 + }, + { + "epoch": 1.6137849079025548, + "grad_norm": 0.25438807489037596, + "learning_rate": 3.704621161698305e-06, + "loss": 0.8198, + "num_tokens": 56761174699.0, + "step": 13580 + }, + { + "epoch": 1.613903743315508, + "grad_norm": 0.25303905715950287, + "learning_rate": 3.7036070518471543e-06, + "loss": 0.8016, + "num_tokens": 56765363129.0, + "step": 13581 + }, + { + "epoch": 1.614022578728461, + "grad_norm": 0.27050323252144814, + "learning_rate": 3.702593212206105e-06, + "loss": 0.7865, + "num_tokens": 56769541793.0, + "step": 13582 + }, + { + "epoch": 1.614141414141414, + "grad_norm": 0.23853916199700456, + "learning_rate": 3.7015796428127005e-06, + "loss": 0.829, + "num_tokens": 56773705657.0, + "step": 13583 + }, + { + "epoch": 1.6142602495543672, + "grad_norm": 0.26704890385105523, + "learning_rate": 3.700566343704478e-06, + "loss": 0.8188, + "num_tokens": 56777872326.0, + "step": 13584 + }, + { + "epoch": 1.6143790849673203, + "grad_norm": 0.24136415251547538, + "learning_rate": 3.6995533149189613e-06, + "loss": 0.8252, + "num_tokens": 56782061549.0, + "step": 13585 + }, + { + "epoch": 1.6144979203802734, + "grad_norm": 0.28484457858676193, + "learning_rate": 3.698540556493671e-06, + "loss": 0.808, + "num_tokens": 56786231344.0, + "step": 13586 + }, + { + "epoch": 1.6146167557932265, + "grad_norm": 0.2416791064195985, + "learning_rate": 3.697528068466113e-06, + "loss": 0.8152, + "num_tokens": 56790402810.0, + "step": 13587 + }, + { + "epoch": 1.6147355912061796, + "grad_norm": 0.26108874605447435, + "learning_rate": 3.6965158508737786e-06, + "loss": 0.8274, + "num_tokens": 56794570693.0, + "step": 13588 + }, + { + "epoch": 1.6148544266191325, + "grad_norm": 0.3081685581836633, + "learning_rate": 3.695503903754156e-06, + "loss": 0.7921, + "num_tokens": 56798738144.0, + "step": 13589 + }, + { + "epoch": 1.6149732620320856, + "grad_norm": 0.2524498659460962, + "learning_rate": 3.694492227144721e-06, + "loss": 0.8027, + "num_tokens": 56802890149.0, + "step": 13590 + }, + { + "epoch": 1.6150920974450387, + "grad_norm": 0.2807748750906843, + "learning_rate": 3.6934808210829377e-06, + "loss": 0.8082, + "num_tokens": 56807077929.0, + "step": 13591 + }, + { + "epoch": 1.6152109328579916, + "grad_norm": 0.26715364337083103, + "learning_rate": 3.692469685606267e-06, + "loss": 0.8108, + "num_tokens": 56811250193.0, + "step": 13592 + }, + { + "epoch": 1.6153297682709447, + "grad_norm": 0.25696656566040277, + "learning_rate": 3.691458820752149e-06, + "loss": 0.8192, + "num_tokens": 56815401188.0, + "step": 13593 + }, + { + "epoch": 1.6154486036838978, + "grad_norm": 0.27580716586643556, + "learning_rate": 3.690448226558022e-06, + "loss": 0.8081, + "num_tokens": 56819558428.0, + "step": 13594 + }, + { + "epoch": 1.6155674390968509, + "grad_norm": 0.2448418343424128, + "learning_rate": 3.6894379030613114e-06, + "loss": 0.8026, + "num_tokens": 56823748726.0, + "step": 13595 + }, + { + "epoch": 1.615686274509804, + "grad_norm": 0.27642149600280663, + "learning_rate": 3.688427850299433e-06, + "loss": 0.7896, + "num_tokens": 56827916375.0, + "step": 13596 + }, + { + "epoch": 1.615805109922757, + "grad_norm": 0.2568957728390198, + "learning_rate": 3.687418068309792e-06, + "loss": 0.8377, + "num_tokens": 56832057259.0, + "step": 13597 + }, + { + "epoch": 1.6159239453357102, + "grad_norm": 0.2506844952859375, + "learning_rate": 3.686408557129785e-06, + "loss": 0.8281, + "num_tokens": 56836245874.0, + "step": 13598 + }, + { + "epoch": 1.6160427807486633, + "grad_norm": 0.24600220294440617, + "learning_rate": 3.6853993167967962e-06, + "loss": 0.7768, + "num_tokens": 56840435735.0, + "step": 13599 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.2561932564898792, + "learning_rate": 3.684390347348202e-06, + "loss": 0.7728, + "num_tokens": 56844602772.0, + "step": 13600 + }, + { + "epoch": 1.6162804515745692, + "grad_norm": 0.2443558315666547, + "learning_rate": 3.683381648821368e-06, + "loss": 0.826, + "num_tokens": 56848744714.0, + "step": 13601 + }, + { + "epoch": 1.6163992869875223, + "grad_norm": 0.24883185798697477, + "learning_rate": 3.68237322125365e-06, + "loss": 0.7998, + "num_tokens": 56852931494.0, + "step": 13602 + }, + { + "epoch": 1.6165181224004752, + "grad_norm": 0.24667562441730637, + "learning_rate": 3.681365064682395e-06, + "loss": 0.8356, + "num_tokens": 56857093068.0, + "step": 13603 + }, + { + "epoch": 1.6166369578134283, + "grad_norm": 0.28284538079168386, + "learning_rate": 3.680357179144932e-06, + "loss": 0.807, + "num_tokens": 56861283515.0, + "step": 13604 + }, + { + "epoch": 1.6167557932263814, + "grad_norm": 0.23397789271609604, + "learning_rate": 3.679349564678593e-06, + "loss": 0.7996, + "num_tokens": 56865472895.0, + "step": 13605 + }, + { + "epoch": 1.6168746286393345, + "grad_norm": 0.2784454681285859, + "learning_rate": 3.678342221320692e-06, + "loss": 0.8675, + "num_tokens": 56869637608.0, + "step": 13606 + }, + { + "epoch": 1.6169934640522876, + "grad_norm": 0.258564682882241, + "learning_rate": 3.677335149108533e-06, + "loss": 0.8281, + "num_tokens": 56873826152.0, + "step": 13607 + }, + { + "epoch": 1.6171122994652407, + "grad_norm": 0.26340781756031373, + "learning_rate": 3.676328348079415e-06, + "loss": 0.8102, + "num_tokens": 56878016120.0, + "step": 13608 + }, + { + "epoch": 1.6172311348781938, + "grad_norm": 0.2602555085965833, + "learning_rate": 3.6753218182706164e-06, + "loss": 0.8377, + "num_tokens": 56882205819.0, + "step": 13609 + }, + { + "epoch": 1.617349970291147, + "grad_norm": 0.24896223582014693, + "learning_rate": 3.674315559719416e-06, + "loss": 0.8142, + "num_tokens": 56886389193.0, + "step": 13610 + }, + { + "epoch": 1.6174688057040998, + "grad_norm": 0.27138932931869086, + "learning_rate": 3.6733095724630797e-06, + "loss": 0.8143, + "num_tokens": 56890529377.0, + "step": 13611 + }, + { + "epoch": 1.6175876411170529, + "grad_norm": 0.24859959267247303, + "learning_rate": 3.672303856538863e-06, + "loss": 0.8258, + "num_tokens": 56894717338.0, + "step": 13612 + }, + { + "epoch": 1.617706476530006, + "grad_norm": 0.2568101429496536, + "learning_rate": 3.6712984119840123e-06, + "loss": 0.845, + "num_tokens": 56898906379.0, + "step": 13613 + }, + { + "epoch": 1.6178253119429589, + "grad_norm": 0.2364883340769422, + "learning_rate": 3.670293238835757e-06, + "loss": 0.8044, + "num_tokens": 56903091182.0, + "step": 13614 + }, + { + "epoch": 1.617944147355912, + "grad_norm": 0.2619695112674954, + "learning_rate": 3.669288337131326e-06, + "loss": 0.8102, + "num_tokens": 56907279816.0, + "step": 13615 + }, + { + "epoch": 1.618062982768865, + "grad_norm": 0.23412488141047613, + "learning_rate": 3.6682837069079343e-06, + "loss": 0.8096, + "num_tokens": 56911468051.0, + "step": 13616 + }, + { + "epoch": 1.6181818181818182, + "grad_norm": 0.23627649667899758, + "learning_rate": 3.6672793482027836e-06, + "loss": 0.7908, + "num_tokens": 56915656352.0, + "step": 13617 + }, + { + "epoch": 1.6183006535947713, + "grad_norm": 0.23636873623314308, + "learning_rate": 3.6662752610530754e-06, + "loss": 0.7918, + "num_tokens": 56919825142.0, + "step": 13618 + }, + { + "epoch": 1.6184194890077244, + "grad_norm": 0.24210413756734714, + "learning_rate": 3.6652714454959877e-06, + "loss": 0.8157, + "num_tokens": 56924014891.0, + "step": 13619 + }, + { + "epoch": 1.6185383244206775, + "grad_norm": 0.24493700826580692, + "learning_rate": 3.6642679015686976e-06, + "loss": 0.7999, + "num_tokens": 56928204688.0, + "step": 13620 + }, + { + "epoch": 1.6186571598336306, + "grad_norm": 0.2311896758594142, + "learning_rate": 3.66326462930837e-06, + "loss": 0.8315, + "num_tokens": 56932370360.0, + "step": 13621 + }, + { + "epoch": 1.6187759952465834, + "grad_norm": 0.233449073295407, + "learning_rate": 3.6622616287521595e-06, + "loss": 0.832, + "num_tokens": 56936558272.0, + "step": 13622 + }, + { + "epoch": 1.6188948306595365, + "grad_norm": 0.24423857928606382, + "learning_rate": 3.6612588999372107e-06, + "loss": 0.8117, + "num_tokens": 56940743029.0, + "step": 13623 + }, + { + "epoch": 1.6190136660724896, + "grad_norm": 0.2294043323998696, + "learning_rate": 3.660256442900657e-06, + "loss": 0.8184, + "num_tokens": 56944907897.0, + "step": 13624 + }, + { + "epoch": 1.6191325014854425, + "grad_norm": 0.2547815089527242, + "learning_rate": 3.659254257679623e-06, + "loss": 0.8055, + "num_tokens": 56949095258.0, + "step": 13625 + }, + { + "epoch": 1.6192513368983956, + "grad_norm": 0.22187090926340783, + "learning_rate": 3.658252344311225e-06, + "loss": 0.7946, + "num_tokens": 56953284799.0, + "step": 13626 + }, + { + "epoch": 1.6193701723113487, + "grad_norm": 0.25475260187671317, + "learning_rate": 3.657250702832564e-06, + "loss": 0.8045, + "num_tokens": 56957475006.0, + "step": 13627 + }, + { + "epoch": 1.6194890077243018, + "grad_norm": 0.2540016660269872, + "learning_rate": 3.6562493332807367e-06, + "loss": 0.79, + "num_tokens": 56961664804.0, + "step": 13628 + }, + { + "epoch": 1.619607843137255, + "grad_norm": 0.22339978377040098, + "learning_rate": 3.6552482356928267e-06, + "loss": 0.8426, + "num_tokens": 56965813884.0, + "step": 13629 + }, + { + "epoch": 1.619726678550208, + "grad_norm": 0.25150454444294673, + "learning_rate": 3.6542474101059083e-06, + "loss": 0.7715, + "num_tokens": 56970002010.0, + "step": 13630 + }, + { + "epoch": 1.6198455139631611, + "grad_norm": 0.25247418631233876, + "learning_rate": 3.6532468565570438e-06, + "loss": 0.7825, + "num_tokens": 56974141200.0, + "step": 13631 + }, + { + "epoch": 1.6199643493761142, + "grad_norm": 0.2388288932490651, + "learning_rate": 3.652246575083288e-06, + "loss": 0.8177, + "num_tokens": 56978299796.0, + "step": 13632 + }, + { + "epoch": 1.620083184789067, + "grad_norm": 0.24123956511302216, + "learning_rate": 3.651246565721685e-06, + "loss": 0.8297, + "num_tokens": 56982488434.0, + "step": 13633 + }, + { + "epoch": 1.6202020202020202, + "grad_norm": 0.23433911203664612, + "learning_rate": 3.650246828509271e-06, + "loss": 0.8052, + "num_tokens": 56986677316.0, + "step": 13634 + }, + { + "epoch": 1.6203208556149733, + "grad_norm": 0.22915385215867834, + "learning_rate": 3.649247363483065e-06, + "loss": 0.8166, + "num_tokens": 56990865811.0, + "step": 13635 + }, + { + "epoch": 1.6204396910279262, + "grad_norm": 0.2292572377150314, + "learning_rate": 3.648248170680081e-06, + "loss": 0.8102, + "num_tokens": 56995055613.0, + "step": 13636 + }, + { + "epoch": 1.6205585264408793, + "grad_norm": 0.23757875874351403, + "learning_rate": 3.6472492501373245e-06, + "loss": 0.7806, + "num_tokens": 56999246112.0, + "step": 13637 + }, + { + "epoch": 1.6206773618538324, + "grad_norm": 0.23392190968999782, + "learning_rate": 3.6462506018917897e-06, + "loss": 0.8027, + "num_tokens": 57003429319.0, + "step": 13638 + }, + { + "epoch": 1.6207961972667855, + "grad_norm": 0.21738224366888098, + "learning_rate": 3.6452522259804613e-06, + "loss": 0.7695, + "num_tokens": 57007618120.0, + "step": 13639 + }, + { + "epoch": 1.6209150326797386, + "grad_norm": 0.23312501065846916, + "learning_rate": 3.644254122440307e-06, + "loss": 0.791, + "num_tokens": 57011807213.0, + "step": 13640 + }, + { + "epoch": 1.6210338680926917, + "grad_norm": 0.22982826468444068, + "learning_rate": 3.6432562913082923e-06, + "loss": 0.7952, + "num_tokens": 57015993105.0, + "step": 13641 + }, + { + "epoch": 1.6211527035056448, + "grad_norm": 0.23748031749017423, + "learning_rate": 3.64225873262137e-06, + "loss": 0.7983, + "num_tokens": 57020181635.0, + "step": 13642 + }, + { + "epoch": 1.6212715389185979, + "grad_norm": 0.24037199151234936, + "learning_rate": 3.641261446416486e-06, + "loss": 0.8246, + "num_tokens": 57024369812.0, + "step": 13643 + }, + { + "epoch": 1.6213903743315508, + "grad_norm": 0.2585545120744553, + "learning_rate": 3.6402644327305725e-06, + "loss": 0.7682, + "num_tokens": 57028558785.0, + "step": 13644 + }, + { + "epoch": 1.6215092097445039, + "grad_norm": 0.23651540930692058, + "learning_rate": 3.63926769160055e-06, + "loss": 0.8077, + "num_tokens": 57032747204.0, + "step": 13645 + }, + { + "epoch": 1.621628045157457, + "grad_norm": 0.22933215960050254, + "learning_rate": 3.63827122306333e-06, + "loss": 0.7861, + "num_tokens": 57036915890.0, + "step": 13646 + }, + { + "epoch": 1.6217468805704098, + "grad_norm": 0.24750164160171348, + "learning_rate": 3.637275027155819e-06, + "loss": 0.7847, + "num_tokens": 57041100529.0, + "step": 13647 + }, + { + "epoch": 1.621865715983363, + "grad_norm": 0.2473416164371052, + "learning_rate": 3.6362791039149077e-06, + "loss": 0.8028, + "num_tokens": 57045281236.0, + "step": 13648 + }, + { + "epoch": 1.621984551396316, + "grad_norm": 0.2585805304992151, + "learning_rate": 3.6352834533774774e-06, + "loss": 0.8166, + "num_tokens": 57049455538.0, + "step": 13649 + }, + { + "epoch": 1.6221033868092691, + "grad_norm": 0.2558953388669461, + "learning_rate": 3.6342880755804024e-06, + "loss": 0.8391, + "num_tokens": 57053644798.0, + "step": 13650 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.2538258317027519, + "learning_rate": 3.6332929705605437e-06, + "loss": 0.8129, + "num_tokens": 57057834960.0, + "step": 13651 + }, + { + "epoch": 1.6223410576351753, + "grad_norm": 0.2539767823058526, + "learning_rate": 3.632298138354754e-06, + "loss": 0.7765, + "num_tokens": 57062011682.0, + "step": 13652 + }, + { + "epoch": 1.6224598930481284, + "grad_norm": 0.24425657629383526, + "learning_rate": 3.631303578999874e-06, + "loss": 0.8196, + "num_tokens": 57066199965.0, + "step": 13653 + }, + { + "epoch": 1.6225787284610815, + "grad_norm": 0.26636876687560124, + "learning_rate": 3.6303092925327376e-06, + "loss": 0.8562, + "num_tokens": 57070358803.0, + "step": 13654 + }, + { + "epoch": 1.6226975638740344, + "grad_norm": 0.25066595033842876, + "learning_rate": 3.6293152789901653e-06, + "loss": 0.7956, + "num_tokens": 57074520693.0, + "step": 13655 + }, + { + "epoch": 1.6228163992869875, + "grad_norm": 0.26279148072038944, + "learning_rate": 3.62832153840897e-06, + "loss": 0.7851, + "num_tokens": 57078690596.0, + "step": 13656 + }, + { + "epoch": 1.6229352346999406, + "grad_norm": 0.2650883476830094, + "learning_rate": 3.6273280708259507e-06, + "loss": 0.791, + "num_tokens": 57082881235.0, + "step": 13657 + }, + { + "epoch": 1.6230540701128935, + "grad_norm": 0.2639097738411585, + "learning_rate": 3.6263348762779016e-06, + "loss": 0.8276, + "num_tokens": 57087063689.0, + "step": 13658 + }, + { + "epoch": 1.6231729055258466, + "grad_norm": 0.24809454989911858, + "learning_rate": 3.625341954801602e-06, + "loss": 0.8127, + "num_tokens": 57091252742.0, + "step": 13659 + }, + { + "epoch": 1.6232917409387997, + "grad_norm": 0.26020448045605155, + "learning_rate": 3.6243493064338275e-06, + "loss": 0.8586, + "num_tokens": 57095442682.0, + "step": 13660 + }, + { + "epoch": 1.6234105763517528, + "grad_norm": 0.261860771712915, + "learning_rate": 3.6233569312113314e-06, + "loss": 0.8162, + "num_tokens": 57099632551.0, + "step": 13661 + }, + { + "epoch": 1.6235294117647059, + "grad_norm": 0.26332395341482456, + "learning_rate": 3.622364829170871e-06, + "loss": 0.7914, + "num_tokens": 57103811225.0, + "step": 13662 + }, + { + "epoch": 1.623648247177659, + "grad_norm": 0.2527206596416318, + "learning_rate": 3.6213730003491856e-06, + "loss": 0.7838, + "num_tokens": 57107974334.0, + "step": 13663 + }, + { + "epoch": 1.623767082590612, + "grad_norm": 0.2404930641809689, + "learning_rate": 3.6203814447830054e-06, + "loss": 0.8233, + "num_tokens": 57112164266.0, + "step": 13664 + }, + { + "epoch": 1.6238859180035652, + "grad_norm": 0.2533588533175486, + "learning_rate": 3.6193901625090542e-06, + "loss": 0.7934, + "num_tokens": 57116344820.0, + "step": 13665 + }, + { + "epoch": 1.6240047534165183, + "grad_norm": 0.22851248318868178, + "learning_rate": 3.618399153564036e-06, + "loss": 0.772, + "num_tokens": 57120521989.0, + "step": 13666 + }, + { + "epoch": 1.6241235888294712, + "grad_norm": 0.25646930923259414, + "learning_rate": 3.6174084179846557e-06, + "loss": 0.8009, + "num_tokens": 57124710711.0, + "step": 13667 + }, + { + "epoch": 1.6242424242424243, + "grad_norm": 0.23356537613132586, + "learning_rate": 3.616417955807601e-06, + "loss": 0.8372, + "num_tokens": 57128899757.0, + "step": 13668 + }, + { + "epoch": 1.6243612596553771, + "grad_norm": 0.23842194457683297, + "learning_rate": 3.6154277670695564e-06, + "loss": 0.7826, + "num_tokens": 57133079475.0, + "step": 13669 + }, + { + "epoch": 1.6244800950683302, + "grad_norm": 0.22505749875015785, + "learning_rate": 3.614437851807191e-06, + "loss": 0.8196, + "num_tokens": 57137229666.0, + "step": 13670 + }, + { + "epoch": 1.6245989304812833, + "grad_norm": 0.24271104377816644, + "learning_rate": 3.613448210057161e-06, + "loss": 0.8269, + "num_tokens": 57141420394.0, + "step": 13671 + }, + { + "epoch": 1.6247177658942364, + "grad_norm": 0.258025395012838, + "learning_rate": 3.612458841856119e-06, + "loss": 0.8232, + "num_tokens": 57145610474.0, + "step": 13672 + }, + { + "epoch": 1.6248366013071895, + "grad_norm": 0.2327771319940139, + "learning_rate": 3.6114697472407023e-06, + "loss": 0.8099, + "num_tokens": 57149752165.0, + "step": 13673 + }, + { + "epoch": 1.6249554367201426, + "grad_norm": 0.23593239165294302, + "learning_rate": 3.6104809262475416e-06, + "loss": 0.8198, + "num_tokens": 57153942879.0, + "step": 13674 + }, + { + "epoch": 1.6250742721330957, + "grad_norm": 0.23613366016151657, + "learning_rate": 3.6094923789132608e-06, + "loss": 0.7668, + "num_tokens": 57158131640.0, + "step": 13675 + }, + { + "epoch": 1.6251931075460488, + "grad_norm": 0.25402801840357153, + "learning_rate": 3.6085041052744618e-06, + "loss": 0.7946, + "num_tokens": 57162303709.0, + "step": 13676 + }, + { + "epoch": 1.625311942959002, + "grad_norm": 0.22941930063929883, + "learning_rate": 3.6075161053677487e-06, + "loss": 0.7905, + "num_tokens": 57166493588.0, + "step": 13677 + }, + { + "epoch": 1.6254307783719548, + "grad_norm": 0.23175876051507271, + "learning_rate": 3.606528379229708e-06, + "loss": 0.8268, + "num_tokens": 57170664200.0, + "step": 13678 + }, + { + "epoch": 1.625549613784908, + "grad_norm": 0.260185854464627, + "learning_rate": 3.605540926896918e-06, + "loss": 0.8343, + "num_tokens": 57174829748.0, + "step": 13679 + }, + { + "epoch": 1.6256684491978608, + "grad_norm": 0.2491826181005898, + "learning_rate": 3.60455374840595e-06, + "loss": 0.7852, + "num_tokens": 57179005171.0, + "step": 13680 + }, + { + "epoch": 1.625787284610814, + "grad_norm": 0.24347746785938765, + "learning_rate": 3.6035668437933626e-06, + "loss": 0.8198, + "num_tokens": 57183194408.0, + "step": 13681 + }, + { + "epoch": 1.625906120023767, + "grad_norm": 0.23118747993686634, + "learning_rate": 3.602580213095701e-06, + "loss": 0.7985, + "num_tokens": 57187383177.0, + "step": 13682 + }, + { + "epoch": 1.62602495543672, + "grad_norm": 0.2399528446935502, + "learning_rate": 3.601593856349506e-06, + "loss": 0.7901, + "num_tokens": 57191572415.0, + "step": 13683 + }, + { + "epoch": 1.6261437908496732, + "grad_norm": 0.24049231953644903, + "learning_rate": 3.600607773591305e-06, + "loss": 0.7772, + "num_tokens": 57195745125.0, + "step": 13684 + }, + { + "epoch": 1.6262626262626263, + "grad_norm": 0.23352858162751203, + "learning_rate": 3.5996219648576156e-06, + "loss": 0.8091, + "num_tokens": 57199906972.0, + "step": 13685 + }, + { + "epoch": 1.6263814616755794, + "grad_norm": 0.2291249931628096, + "learning_rate": 3.5986364301849462e-06, + "loss": 0.8052, + "num_tokens": 57204094600.0, + "step": 13686 + }, + { + "epoch": 1.6265002970885325, + "grad_norm": 0.23122696585858046, + "learning_rate": 3.5976511696097956e-06, + "loss": 0.8094, + "num_tokens": 57208265423.0, + "step": 13687 + }, + { + "epoch": 1.6266191325014856, + "grad_norm": 0.24344571995535527, + "learning_rate": 3.596666183168649e-06, + "loss": 0.8327, + "num_tokens": 57212455711.0, + "step": 13688 + }, + { + "epoch": 1.6267379679144385, + "grad_norm": 0.2377489549662799, + "learning_rate": 3.595681470897985e-06, + "loss": 0.8092, + "num_tokens": 57216646064.0, + "step": 13689 + }, + { + "epoch": 1.6268568033273916, + "grad_norm": 0.23007347420685945, + "learning_rate": 3.594697032834271e-06, + "loss": 0.8096, + "num_tokens": 57220835067.0, + "step": 13690 + }, + { + "epoch": 1.6269756387403447, + "grad_norm": 0.22729804414521626, + "learning_rate": 3.5937128690139638e-06, + "loss": 0.7906, + "num_tokens": 57225014994.0, + "step": 13691 + }, + { + "epoch": 1.6270944741532976, + "grad_norm": 0.21926883281991508, + "learning_rate": 3.592728979473513e-06, + "loss": 0.7652, + "num_tokens": 57229174469.0, + "step": 13692 + }, + { + "epoch": 1.6272133095662507, + "grad_norm": 0.22061810846888805, + "learning_rate": 3.591745364249347e-06, + "loss": 0.8244, + "num_tokens": 57233343698.0, + "step": 13693 + }, + { + "epoch": 1.6273321449792038, + "grad_norm": 0.22655565852114293, + "learning_rate": 3.5907620233779016e-06, + "loss": 0.8161, + "num_tokens": 57237530956.0, + "step": 13694 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 0.2310400528986572, + "learning_rate": 3.589778956895589e-06, + "loss": 0.8322, + "num_tokens": 57241720379.0, + "step": 13695 + }, + { + "epoch": 1.62756981580511, + "grad_norm": 0.225517323532712, + "learning_rate": 3.588796164838817e-06, + "loss": 0.7954, + "num_tokens": 57245909983.0, + "step": 13696 + }, + { + "epoch": 1.627688651218063, + "grad_norm": 0.24690334805851402, + "learning_rate": 3.5878136472439817e-06, + "loss": 0.8141, + "num_tokens": 57250098725.0, + "step": 13697 + }, + { + "epoch": 1.6278074866310162, + "grad_norm": 0.2716927126314888, + "learning_rate": 3.586831404147467e-06, + "loss": 0.8279, + "num_tokens": 57254263597.0, + "step": 13698 + }, + { + "epoch": 1.6279263220439693, + "grad_norm": 0.22992458900357965, + "learning_rate": 3.5858494355856465e-06, + "loss": 0.8039, + "num_tokens": 57258409048.0, + "step": 13699 + }, + { + "epoch": 1.6280451574569221, + "grad_norm": 0.23134754152915699, + "learning_rate": 3.584867741594893e-06, + "loss": 0.7672, + "num_tokens": 57262595892.0, + "step": 13700 + }, + { + "epoch": 1.6281639928698752, + "grad_norm": 0.24288286939315462, + "learning_rate": 3.5838863222115573e-06, + "loss": 0.8315, + "num_tokens": 57266768513.0, + "step": 13701 + }, + { + "epoch": 1.6282828282828283, + "grad_norm": 0.2549396192320951, + "learning_rate": 3.5829051774719893e-06, + "loss": 0.786, + "num_tokens": 57270931911.0, + "step": 13702 + }, + { + "epoch": 1.6284016636957812, + "grad_norm": 0.25951241571323425, + "learning_rate": 3.5819243074125158e-06, + "loss": 0.8097, + "num_tokens": 57275120824.0, + "step": 13703 + }, + { + "epoch": 1.6285204991087343, + "grad_norm": 0.2616922703446131, + "learning_rate": 3.5809437120694678e-06, + "loss": 0.823, + "num_tokens": 57279310560.0, + "step": 13704 + }, + { + "epoch": 1.6286393345216874, + "grad_norm": 0.26696818674421474, + "learning_rate": 3.5799633914791575e-06, + "loss": 0.8474, + "num_tokens": 57283499631.0, + "step": 13705 + }, + { + "epoch": 1.6287581699346405, + "grad_norm": 0.230039442253928, + "learning_rate": 3.5789833456778898e-06, + "loss": 0.8268, + "num_tokens": 57287687605.0, + "step": 13706 + }, + { + "epoch": 1.6288770053475936, + "grad_norm": 0.2632816801524383, + "learning_rate": 3.5780035747019624e-06, + "loss": 0.7972, + "num_tokens": 57291864903.0, + "step": 13707 + }, + { + "epoch": 1.6289958407605467, + "grad_norm": 0.23553522643523786, + "learning_rate": 3.5770240785876563e-06, + "loss": 0.7952, + "num_tokens": 57296045756.0, + "step": 13708 + }, + { + "epoch": 1.6291146761734998, + "grad_norm": 0.2525810369967214, + "learning_rate": 3.5760448573712458e-06, + "loss": 0.7801, + "num_tokens": 57300212312.0, + "step": 13709 + }, + { + "epoch": 1.629233511586453, + "grad_norm": 0.24328423113979492, + "learning_rate": 3.575065911088995e-06, + "loss": 0.8203, + "num_tokens": 57304373552.0, + "step": 13710 + }, + { + "epoch": 1.6293523469994058, + "grad_norm": 0.2435404722776119, + "learning_rate": 3.574087239777159e-06, + "loss": 0.8104, + "num_tokens": 57308563485.0, + "step": 13711 + }, + { + "epoch": 1.6294711824123589, + "grad_norm": 0.23097506744328766, + "learning_rate": 3.573108843471979e-06, + "loss": 0.7938, + "num_tokens": 57312720433.0, + "step": 13712 + }, + { + "epoch": 1.629590017825312, + "grad_norm": 0.23460165590999457, + "learning_rate": 3.572130722209689e-06, + "loss": 0.7959, + "num_tokens": 57316910557.0, + "step": 13713 + }, + { + "epoch": 1.6297088532382649, + "grad_norm": 0.2525834233825154, + "learning_rate": 3.5711528760265134e-06, + "loss": 0.8145, + "num_tokens": 57321096229.0, + "step": 13714 + }, + { + "epoch": 1.629827688651218, + "grad_norm": 0.2482645234198364, + "learning_rate": 3.5701753049586646e-06, + "loss": 0.8194, + "num_tokens": 57325284063.0, + "step": 13715 + }, + { + "epoch": 1.629946524064171, + "grad_norm": 0.24150141039057602, + "learning_rate": 3.569198009042344e-06, + "loss": 0.8132, + "num_tokens": 57329473561.0, + "step": 13716 + }, + { + "epoch": 1.6300653594771242, + "grad_norm": 0.24274208802246242, + "learning_rate": 3.5682209883137464e-06, + "loss": 0.8051, + "num_tokens": 57333660147.0, + "step": 13717 + }, + { + "epoch": 1.6301841948900773, + "grad_norm": 0.26998990090921654, + "learning_rate": 3.567244242809052e-06, + "loss": 0.8085, + "num_tokens": 57337848578.0, + "step": 13718 + }, + { + "epoch": 1.6303030303030304, + "grad_norm": 0.2311373561524928, + "learning_rate": 3.5662677725644344e-06, + "loss": 0.8004, + "num_tokens": 57342019883.0, + "step": 13719 + }, + { + "epoch": 1.6304218657159835, + "grad_norm": 0.25648870350402314, + "learning_rate": 3.5652915776160558e-06, + "loss": 0.8353, + "num_tokens": 57346209228.0, + "step": 13720 + }, + { + "epoch": 1.6305407011289366, + "grad_norm": 0.24113131243474104, + "learning_rate": 3.564315658000067e-06, + "loss": 0.8393, + "num_tokens": 57350397788.0, + "step": 13721 + }, + { + "epoch": 1.6306595365418894, + "grad_norm": 0.27077538332676077, + "learning_rate": 3.563340013752611e-06, + "loss": 0.8175, + "num_tokens": 57354566861.0, + "step": 13722 + }, + { + "epoch": 1.6307783719548425, + "grad_norm": 0.26122993668373845, + "learning_rate": 3.562364644909819e-06, + "loss": 0.7646, + "num_tokens": 57358755429.0, + "step": 13723 + }, + { + "epoch": 1.6308972073677956, + "grad_norm": 0.25918416693094987, + "learning_rate": 3.5613895515078088e-06, + "loss": 0.8089, + "num_tokens": 57362928812.0, + "step": 13724 + }, + { + "epoch": 1.6310160427807485, + "grad_norm": 0.2628914953700219, + "learning_rate": 3.5604147335826932e-06, + "loss": 0.7754, + "num_tokens": 57367117515.0, + "step": 13725 + }, + { + "epoch": 1.6311348781937016, + "grad_norm": 0.24051281029787802, + "learning_rate": 3.5594401911705755e-06, + "loss": 0.7636, + "num_tokens": 57371298216.0, + "step": 13726 + }, + { + "epoch": 1.6312537136066547, + "grad_norm": 0.2773950788075029, + "learning_rate": 3.558465924307545e-06, + "loss": 0.8021, + "num_tokens": 57375487554.0, + "step": 13727 + }, + { + "epoch": 1.6313725490196078, + "grad_norm": 0.2169126334897133, + "learning_rate": 3.557491933029683e-06, + "loss": 0.7931, + "num_tokens": 57379676173.0, + "step": 13728 + }, + { + "epoch": 1.631491384432561, + "grad_norm": 0.27424654546690636, + "learning_rate": 3.556518217373057e-06, + "loss": 0.8228, + "num_tokens": 57383864160.0, + "step": 13729 + }, + { + "epoch": 1.631610219845514, + "grad_norm": 0.23578427786885398, + "learning_rate": 3.555544777373729e-06, + "loss": 0.7921, + "num_tokens": 57388052631.0, + "step": 13730 + }, + { + "epoch": 1.6317290552584671, + "grad_norm": 0.25005859837025235, + "learning_rate": 3.5545716130677453e-06, + "loss": 0.7829, + "num_tokens": 57392222497.0, + "step": 13731 + }, + { + "epoch": 1.6318478906714202, + "grad_norm": 0.24319974832111677, + "learning_rate": 3.5535987244911506e-06, + "loss": 0.8283, + "num_tokens": 57396348160.0, + "step": 13732 + }, + { + "epoch": 1.631966726084373, + "grad_norm": 0.2359925420707488, + "learning_rate": 3.5526261116799744e-06, + "loss": 0.8304, + "num_tokens": 57400527201.0, + "step": 13733 + }, + { + "epoch": 1.6320855614973262, + "grad_norm": 0.23405135263880472, + "learning_rate": 3.5516537746702314e-06, + "loss": 0.8187, + "num_tokens": 57404692871.0, + "step": 13734 + }, + { + "epoch": 1.6322043969102793, + "grad_norm": 0.23490772030075568, + "learning_rate": 3.550681713497932e-06, + "loss": 0.8278, + "num_tokens": 57408882021.0, + "step": 13735 + }, + { + "epoch": 1.6323232323232322, + "grad_norm": 0.2229904240614566, + "learning_rate": 3.5497099281990753e-06, + "loss": 0.7645, + "num_tokens": 57413071912.0, + "step": 13736 + }, + { + "epoch": 1.6324420677361853, + "grad_norm": 0.23815430099555052, + "learning_rate": 3.5487384188096507e-06, + "loss": 0.8056, + "num_tokens": 57417255668.0, + "step": 13737 + }, + { + "epoch": 1.6325609031491384, + "grad_norm": 0.234638941222363, + "learning_rate": 3.5477671853656358e-06, + "loss": 0.8215, + "num_tokens": 57421434490.0, + "step": 13738 + }, + { + "epoch": 1.6326797385620915, + "grad_norm": 0.23225553413326128, + "learning_rate": 3.546796227902998e-06, + "loss": 0.7979, + "num_tokens": 57425623427.0, + "step": 13739 + }, + { + "epoch": 1.6327985739750446, + "grad_norm": 0.22731993319333987, + "learning_rate": 3.5458255464576957e-06, + "loss": 0.7825, + "num_tokens": 57429793010.0, + "step": 13740 + }, + { + "epoch": 1.6329174093879977, + "grad_norm": 0.23882732629637166, + "learning_rate": 3.544855141065677e-06, + "loss": 0.8298, + "num_tokens": 57433953456.0, + "step": 13741 + }, + { + "epoch": 1.6330362448009508, + "grad_norm": 0.23674655827944993, + "learning_rate": 3.5438850117628785e-06, + "loss": 0.8062, + "num_tokens": 57438098428.0, + "step": 13742 + }, + { + "epoch": 1.6331550802139039, + "grad_norm": 0.22434682186917623, + "learning_rate": 3.5429151585852283e-06, + "loss": 0.7958, + "num_tokens": 57442289027.0, + "step": 13743 + }, + { + "epoch": 1.6332739156268568, + "grad_norm": 0.2484926024908276, + "learning_rate": 3.541945581568642e-06, + "loss": 0.8237, + "num_tokens": 57446479397.0, + "step": 13744 + }, + { + "epoch": 1.6333927510398099, + "grad_norm": 0.23629554975025052, + "learning_rate": 3.5409762807490287e-06, + "loss": 0.7993, + "num_tokens": 57450667760.0, + "step": 13745 + }, + { + "epoch": 1.633511586452763, + "grad_norm": 0.2283966328465477, + "learning_rate": 3.5400072561622822e-06, + "loss": 0.8072, + "num_tokens": 57454854740.0, + "step": 13746 + }, + { + "epoch": 1.6336304218657158, + "grad_norm": 0.2582427190626359, + "learning_rate": 3.539038507844289e-06, + "loss": 0.78, + "num_tokens": 57459044464.0, + "step": 13747 + }, + { + "epoch": 1.633749257278669, + "grad_norm": 0.23782536646729874, + "learning_rate": 3.5380700358309273e-06, + "loss": 0.8056, + "num_tokens": 57463234950.0, + "step": 13748 + }, + { + "epoch": 1.633868092691622, + "grad_norm": 0.25184189094621745, + "learning_rate": 3.537101840158064e-06, + "loss": 0.8196, + "num_tokens": 57467422722.0, + "step": 13749 + }, + { + "epoch": 1.6339869281045751, + "grad_norm": 0.23650237715386294, + "learning_rate": 3.5361339208615466e-06, + "loss": 0.822, + "num_tokens": 57471611557.0, + "step": 13750 + }, + { + "epoch": 1.6341057635175282, + "grad_norm": 0.24791674927093818, + "learning_rate": 3.5351662779772287e-06, + "loss": 0.8015, + "num_tokens": 57475771362.0, + "step": 13751 + }, + { + "epoch": 1.6342245989304813, + "grad_norm": 0.2563579848123924, + "learning_rate": 3.5341989115409427e-06, + "loss": 0.7963, + "num_tokens": 57479936919.0, + "step": 13752 + }, + { + "epoch": 1.6343434343434344, + "grad_norm": 0.2334496259446079, + "learning_rate": 3.533231821588513e-06, + "loss": 0.7711, + "num_tokens": 57484126737.0, + "step": 13753 + }, + { + "epoch": 1.6344622697563875, + "grad_norm": 0.25672910987747527, + "learning_rate": 3.5322650081557577e-06, + "loss": 0.7603, + "num_tokens": 57488284503.0, + "step": 13754 + }, + { + "epoch": 1.6345811051693404, + "grad_norm": 0.26126020279370604, + "learning_rate": 3.531298471278476e-06, + "loss": 0.7502, + "num_tokens": 57492474286.0, + "step": 13755 + }, + { + "epoch": 1.6346999405822935, + "grad_norm": 0.2429393278747131, + "learning_rate": 3.5303322109924643e-06, + "loss": 0.8603, + "num_tokens": 57496662877.0, + "step": 13756 + }, + { + "epoch": 1.6348187759952466, + "grad_norm": 0.2489829811744753, + "learning_rate": 3.529366227333504e-06, + "loss": 0.8445, + "num_tokens": 57500846055.0, + "step": 13757 + }, + { + "epoch": 1.6349376114081995, + "grad_norm": 0.23530426553370037, + "learning_rate": 3.528400520337373e-06, + "loss": 0.8164, + "num_tokens": 57505002641.0, + "step": 13758 + }, + { + "epoch": 1.6350564468211526, + "grad_norm": 0.24694783582722843, + "learning_rate": 3.5274350900398346e-06, + "loss": 0.8216, + "num_tokens": 57509161855.0, + "step": 13759 + }, + { + "epoch": 1.6351752822341057, + "grad_norm": 0.25285052288922977, + "learning_rate": 3.526469936476638e-06, + "loss": 0.7624, + "num_tokens": 57513348953.0, + "step": 13760 + }, + { + "epoch": 1.6352941176470588, + "grad_norm": 0.23269516682339145, + "learning_rate": 3.5255050596835283e-06, + "loss": 0.7948, + "num_tokens": 57517537425.0, + "step": 13761 + }, + { + "epoch": 1.6354129530600119, + "grad_norm": 0.2721360542046123, + "learning_rate": 3.524540459696239e-06, + "loss": 0.8275, + "num_tokens": 57521725652.0, + "step": 13762 + }, + { + "epoch": 1.635531788472965, + "grad_norm": 0.24852642467735597, + "learning_rate": 3.5235761365504883e-06, + "loss": 0.842, + "num_tokens": 57525913903.0, + "step": 13763 + }, + { + "epoch": 1.635650623885918, + "grad_norm": 0.25075628093714225, + "learning_rate": 3.522612090281997e-06, + "loss": 0.827, + "num_tokens": 57530093633.0, + "step": 13764 + }, + { + "epoch": 1.6357694592988712, + "grad_norm": 0.2645668339751143, + "learning_rate": 3.5216483209264586e-06, + "loss": 0.8095, + "num_tokens": 57534282210.0, + "step": 13765 + }, + { + "epoch": 1.6358882947118243, + "grad_norm": 0.24983170586782485, + "learning_rate": 3.5206848285195688e-06, + "loss": 0.8343, + "num_tokens": 57538447775.0, + "step": 13766 + }, + { + "epoch": 1.6360071301247772, + "grad_norm": 0.2568816553019668, + "learning_rate": 3.5197216130970076e-06, + "loss": 0.7661, + "num_tokens": 57542637820.0, + "step": 13767 + }, + { + "epoch": 1.6361259655377303, + "grad_norm": 0.2743108294349086, + "learning_rate": 3.5187586746944458e-06, + "loss": 0.8292, + "num_tokens": 57546802457.0, + "step": 13768 + }, + { + "epoch": 1.6362448009506831, + "grad_norm": 0.24316435085138247, + "learning_rate": 3.517796013347545e-06, + "loss": 0.8173, + "num_tokens": 57550991047.0, + "step": 13769 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 0.2793625404053909, + "learning_rate": 3.5168336290919576e-06, + "loss": 0.771, + "num_tokens": 57555179529.0, + "step": 13770 + }, + { + "epoch": 1.6364824717765893, + "grad_norm": 0.26204912769276084, + "learning_rate": 3.5158715219633204e-06, + "loss": 0.829, + "num_tokens": 57559368405.0, + "step": 13771 + }, + { + "epoch": 1.6366013071895424, + "grad_norm": 0.262587706851313, + "learning_rate": 3.514909691997266e-06, + "loss": 0.8065, + "num_tokens": 57563557874.0, + "step": 13772 + }, + { + "epoch": 1.6367201426024955, + "grad_norm": 0.2635899518282071, + "learning_rate": 3.513948139229412e-06, + "loss": 0.7917, + "num_tokens": 57567747419.0, + "step": 13773 + }, + { + "epoch": 1.6368389780154486, + "grad_norm": 0.2655562185771139, + "learning_rate": 3.51298686369537e-06, + "loss": 0.8199, + "num_tokens": 57571937190.0, + "step": 13774 + }, + { + "epoch": 1.6369578134284017, + "grad_norm": 0.28151931703773336, + "learning_rate": 3.512025865430741e-06, + "loss": 0.8354, + "num_tokens": 57576127331.0, + "step": 13775 + }, + { + "epoch": 1.6370766488413548, + "grad_norm": 0.26956509822406977, + "learning_rate": 3.5110651444711074e-06, + "loss": 0.7856, + "num_tokens": 57580316415.0, + "step": 13776 + }, + { + "epoch": 1.637195484254308, + "grad_norm": 0.25923668051963983, + "learning_rate": 3.510104700852055e-06, + "loss": 0.8009, + "num_tokens": 57584495489.0, + "step": 13777 + }, + { + "epoch": 1.6373143196672608, + "grad_norm": 0.24328202678499802, + "learning_rate": 3.509144534609148e-06, + "loss": 0.8021, + "num_tokens": 57588655229.0, + "step": 13778 + }, + { + "epoch": 1.637433155080214, + "grad_norm": 0.2686102608159018, + "learning_rate": 3.5081846457779466e-06, + "loss": 0.8044, + "num_tokens": 57592842444.0, + "step": 13779 + }, + { + "epoch": 1.6375519904931668, + "grad_norm": 0.25029445028302433, + "learning_rate": 3.5072250343940012e-06, + "loss": 0.7935, + "num_tokens": 57597029922.0, + "step": 13780 + }, + { + "epoch": 1.63767082590612, + "grad_norm": 0.25747467273580277, + "learning_rate": 3.5062657004928435e-06, + "loss": 0.7901, + "num_tokens": 57601202059.0, + "step": 13781 + }, + { + "epoch": 1.637789661319073, + "grad_norm": 0.22114703126301277, + "learning_rate": 3.505306644110002e-06, + "loss": 0.8215, + "num_tokens": 57605389010.0, + "step": 13782 + }, + { + "epoch": 1.637908496732026, + "grad_norm": 0.2692971981498074, + "learning_rate": 3.504347865280998e-06, + "loss": 0.8325, + "num_tokens": 57609560309.0, + "step": 13783 + }, + { + "epoch": 1.6380273321449792, + "grad_norm": 0.24843091232512102, + "learning_rate": 3.5033893640413354e-06, + "loss": 0.796, + "num_tokens": 57613749216.0, + "step": 13784 + }, + { + "epoch": 1.6381461675579323, + "grad_norm": 0.2531199019489808, + "learning_rate": 3.5024311404265133e-06, + "loss": 0.7823, + "num_tokens": 57617910290.0, + "step": 13785 + }, + { + "epoch": 1.6382650029708854, + "grad_norm": 0.2387214934487683, + "learning_rate": 3.501473194472015e-06, + "loss": 0.8225, + "num_tokens": 57622098537.0, + "step": 13786 + }, + { + "epoch": 1.6383838383838385, + "grad_norm": 0.22711155435397642, + "learning_rate": 3.5005155262133163e-06, + "loss": 0.8043, + "num_tokens": 57626261113.0, + "step": 13787 + }, + { + "epoch": 1.6385026737967916, + "grad_norm": 0.25499585429620103, + "learning_rate": 3.499558135685882e-06, + "loss": 0.7726, + "num_tokens": 57630449362.0, + "step": 13788 + }, + { + "epoch": 1.6386215092097445, + "grad_norm": 0.22741117339292946, + "learning_rate": 3.4986010229251717e-06, + "loss": 0.8215, + "num_tokens": 57634605522.0, + "step": 13789 + }, + { + "epoch": 1.6387403446226976, + "grad_norm": 0.28123986354386116, + "learning_rate": 3.4976441879666303e-06, + "loss": 0.7954, + "num_tokens": 57638795106.0, + "step": 13790 + }, + { + "epoch": 1.6388591800356507, + "grad_norm": 0.2516169606754097, + "learning_rate": 3.496687630845689e-06, + "loss": 0.8408, + "num_tokens": 57642956125.0, + "step": 13791 + }, + { + "epoch": 1.6389780154486036, + "grad_norm": 0.222050134405128, + "learning_rate": 3.495731351597774e-06, + "loss": 0.8073, + "num_tokens": 57647144837.0, + "step": 13792 + }, + { + "epoch": 1.6390968508615567, + "grad_norm": 0.24672925197511492, + "learning_rate": 3.4947753502582994e-06, + "loss": 0.8152, + "num_tokens": 57651333776.0, + "step": 13793 + }, + { + "epoch": 1.6392156862745098, + "grad_norm": 0.2489166908544088, + "learning_rate": 3.4938196268626693e-06, + "loss": 0.8565, + "num_tokens": 57655516089.0, + "step": 13794 + }, + { + "epoch": 1.6393345216874629, + "grad_norm": 0.23777050059819435, + "learning_rate": 3.492864181446277e-06, + "loss": 0.7909, + "num_tokens": 57659704506.0, + "step": 13795 + }, + { + "epoch": 1.639453357100416, + "grad_norm": 0.23451927187703525, + "learning_rate": 3.4919090140445076e-06, + "loss": 0.7984, + "num_tokens": 57663875344.0, + "step": 13796 + }, + { + "epoch": 1.639572192513369, + "grad_norm": 0.22659771158137837, + "learning_rate": 3.4909541246927325e-06, + "loss": 0.8136, + "num_tokens": 57668063221.0, + "step": 13797 + }, + { + "epoch": 1.6396910279263222, + "grad_norm": 0.24179223565778, + "learning_rate": 3.489999513426314e-06, + "loss": 0.8001, + "num_tokens": 57672253023.0, + "step": 13798 + }, + { + "epoch": 1.6398098633392753, + "grad_norm": 0.2330097197589096, + "learning_rate": 3.489045180280606e-06, + "loss": 0.8628, + "num_tokens": 57676441906.0, + "step": 13799 + }, + { + "epoch": 1.6399286987522281, + "grad_norm": 0.22850800454153422, + "learning_rate": 3.4880911252909495e-06, + "loss": 0.7935, + "num_tokens": 57680589168.0, + "step": 13800 + }, + { + "epoch": 1.6400475341651812, + "grad_norm": 0.2503619480916719, + "learning_rate": 3.4871373484926784e-06, + "loss": 0.8354, + "num_tokens": 57684770985.0, + "step": 13801 + }, + { + "epoch": 1.6401663695781343, + "grad_norm": 0.30617028975947463, + "learning_rate": 3.4861838499211114e-06, + "loss": 0.7821, + "num_tokens": 57688944075.0, + "step": 13802 + }, + { + "epoch": 1.6402852049910872, + "grad_norm": 0.2676201342718246, + "learning_rate": 3.4852306296115613e-06, + "loss": 0.7989, + "num_tokens": 57693133814.0, + "step": 13803 + }, + { + "epoch": 1.6404040404040403, + "grad_norm": 0.22902755778212772, + "learning_rate": 3.48427768759933e-06, + "loss": 0.8067, + "num_tokens": 57697324524.0, + "step": 13804 + }, + { + "epoch": 1.6405228758169934, + "grad_norm": 0.2570468530042307, + "learning_rate": 3.483325023919707e-06, + "loss": 0.7964, + "num_tokens": 57701482530.0, + "step": 13805 + }, + { + "epoch": 1.6406417112299465, + "grad_norm": 0.2533716358967383, + "learning_rate": 3.4823726386079714e-06, + "loss": 0.8223, + "num_tokens": 57705672671.0, + "step": 13806 + }, + { + "epoch": 1.6407605466428996, + "grad_norm": 0.24085713398894157, + "learning_rate": 3.4814205316993988e-06, + "loss": 0.817, + "num_tokens": 57709862401.0, + "step": 13807 + }, + { + "epoch": 1.6408793820558527, + "grad_norm": 0.26129524177658425, + "learning_rate": 3.4804687032292393e-06, + "loss": 0.7875, + "num_tokens": 57714047760.0, + "step": 13808 + }, + { + "epoch": 1.6409982174688058, + "grad_norm": 0.22139216715447668, + "learning_rate": 3.479517153232751e-06, + "loss": 0.781, + "num_tokens": 57718238134.0, + "step": 13809 + }, + { + "epoch": 1.641117052881759, + "grad_norm": 0.2374420232639756, + "learning_rate": 3.4785658817451694e-06, + "loss": 0.8103, + "num_tokens": 57722427399.0, + "step": 13810 + }, + { + "epoch": 1.6412358882947118, + "grad_norm": 0.2319819721721881, + "learning_rate": 3.4776148888017258e-06, + "loss": 0.7942, + "num_tokens": 57726609953.0, + "step": 13811 + }, + { + "epoch": 1.6413547237076649, + "grad_norm": 0.2283840375227007, + "learning_rate": 3.476664174437635e-06, + "loss": 0.8208, + "num_tokens": 57730799007.0, + "step": 13812 + }, + { + "epoch": 1.641473559120618, + "grad_norm": 0.2441655397798343, + "learning_rate": 3.4757137386881055e-06, + "loss": 0.7794, + "num_tokens": 57734957600.0, + "step": 13813 + }, + { + "epoch": 1.6415923945335709, + "grad_norm": 0.2457749797158529, + "learning_rate": 3.474763581588335e-06, + "loss": 0.8397, + "num_tokens": 57739145056.0, + "step": 13814 + }, + { + "epoch": 1.641711229946524, + "grad_norm": 0.2593678888097455, + "learning_rate": 3.4738137031735155e-06, + "loss": 0.8047, + "num_tokens": 57743305013.0, + "step": 13815 + }, + { + "epoch": 1.641830065359477, + "grad_norm": 0.23559197828944115, + "learning_rate": 3.472864103478822e-06, + "loss": 0.8175, + "num_tokens": 57747493833.0, + "step": 13816 + }, + { + "epoch": 1.6419489007724302, + "grad_norm": 0.24543425276167086, + "learning_rate": 3.4719147825394196e-06, + "loss": 0.7858, + "num_tokens": 57751657703.0, + "step": 13817 + }, + { + "epoch": 1.6420677361853833, + "grad_norm": 0.23197713907850423, + "learning_rate": 3.470965740390466e-06, + "loss": 0.8538, + "num_tokens": 57755847241.0, + "step": 13818 + }, + { + "epoch": 1.6421865715983364, + "grad_norm": 0.24918544022054437, + "learning_rate": 3.4700169770671056e-06, + "loss": 0.7715, + "num_tokens": 57760034888.0, + "step": 13819 + }, + { + "epoch": 1.6423054070112895, + "grad_norm": 0.230419678255752, + "learning_rate": 3.469068492604475e-06, + "loss": 0.813, + "num_tokens": 57764212472.0, + "step": 13820 + }, + { + "epoch": 1.6424242424242426, + "grad_norm": 0.24002513536906936, + "learning_rate": 3.4681202870377056e-06, + "loss": 0.8382, + "num_tokens": 57768350212.0, + "step": 13821 + }, + { + "epoch": 1.6425430778371954, + "grad_norm": 0.239463462753857, + "learning_rate": 3.4671723604019046e-06, + "loss": 0.8017, + "num_tokens": 57772538763.0, + "step": 13822 + }, + { + "epoch": 1.6426619132501485, + "grad_norm": 0.24896348670878551, + "learning_rate": 3.4662247127321814e-06, + "loss": 0.8181, + "num_tokens": 57776717274.0, + "step": 13823 + }, + { + "epoch": 1.6427807486631016, + "grad_norm": 0.239105169170297, + "learning_rate": 3.4652773440636288e-06, + "loss": 0.7852, + "num_tokens": 57780880324.0, + "step": 13824 + }, + { + "epoch": 1.6428995840760545, + "grad_norm": 0.25187368677304156, + "learning_rate": 3.464330254431331e-06, + "loss": 0.7995, + "num_tokens": 57785069471.0, + "step": 13825 + }, + { + "epoch": 1.6430184194890076, + "grad_norm": 0.21785797410006902, + "learning_rate": 3.463383443870364e-06, + "loss": 0.7882, + "num_tokens": 57789193367.0, + "step": 13826 + }, + { + "epoch": 1.6431372549019607, + "grad_norm": 0.2267685980067664, + "learning_rate": 3.4624369124157887e-06, + "loss": 0.7665, + "num_tokens": 57793374482.0, + "step": 13827 + }, + { + "epoch": 1.6432560903149138, + "grad_norm": 0.22461486190969665, + "learning_rate": 3.4614906601026598e-06, + "loss": 0.8331, + "num_tokens": 57797546517.0, + "step": 13828 + }, + { + "epoch": 1.643374925727867, + "grad_norm": 0.2290506060006336, + "learning_rate": 3.460544686966021e-06, + "loss": 0.7539, + "num_tokens": 57801710352.0, + "step": 13829 + }, + { + "epoch": 1.64349376114082, + "grad_norm": 0.2325149823966048, + "learning_rate": 3.4595989930409036e-06, + "loss": 0.7886, + "num_tokens": 57805875268.0, + "step": 13830 + }, + { + "epoch": 1.6436125965537731, + "grad_norm": 0.23716813172549966, + "learning_rate": 3.4586535783623298e-06, + "loss": 0.7753, + "num_tokens": 57810063634.0, + "step": 13831 + }, + { + "epoch": 1.6437314319667262, + "grad_norm": 0.23665200576499518, + "learning_rate": 3.4577084429653116e-06, + "loss": 0.7628, + "num_tokens": 57814253227.0, + "step": 13832 + }, + { + "epoch": 1.643850267379679, + "grad_norm": 0.25233181557959655, + "learning_rate": 3.4567635868848504e-06, + "loss": 0.758, + "num_tokens": 57818441869.0, + "step": 13833 + }, + { + "epoch": 1.6439691027926322, + "grad_norm": 0.2374293849136468, + "learning_rate": 3.4558190101559385e-06, + "loss": 0.8255, + "num_tokens": 57822631303.0, + "step": 13834 + }, + { + "epoch": 1.6440879382055853, + "grad_norm": 0.24174957261962227, + "learning_rate": 3.4548747128135564e-06, + "loss": 0.8177, + "num_tokens": 57826808469.0, + "step": 13835 + }, + { + "epoch": 1.6442067736185382, + "grad_norm": 0.23831476456525721, + "learning_rate": 3.4539306948926743e-06, + "loss": 0.7814, + "num_tokens": 57830997460.0, + "step": 13836 + }, + { + "epoch": 1.6443256090314913, + "grad_norm": 0.2388531024638734, + "learning_rate": 3.452986956428254e-06, + "loss": 0.8237, + "num_tokens": 57835185746.0, + "step": 13837 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 0.22931766030718867, + "learning_rate": 3.4520434974552432e-06, + "loss": 0.7868, + "num_tokens": 57839367355.0, + "step": 13838 + }, + { + "epoch": 1.6445632798573975, + "grad_norm": 0.22750842688827827, + "learning_rate": 3.4511003180085787e-06, + "loss": 0.8036, + "num_tokens": 57843556383.0, + "step": 13839 + }, + { + "epoch": 1.6446821152703506, + "grad_norm": 0.23873167894845707, + "learning_rate": 3.450157418123195e-06, + "loss": 0.7862, + "num_tokens": 57847745690.0, + "step": 13840 + }, + { + "epoch": 1.6448009506833037, + "grad_norm": 0.23035364887282403, + "learning_rate": 3.4492147978340095e-06, + "loss": 0.8247, + "num_tokens": 57851920573.0, + "step": 13841 + }, + { + "epoch": 1.6449197860962568, + "grad_norm": 0.2473170913420664, + "learning_rate": 3.4482724571759283e-06, + "loss": 0.7971, + "num_tokens": 57856110846.0, + "step": 13842 + }, + { + "epoch": 1.6450386215092099, + "grad_norm": 0.2317684641906863, + "learning_rate": 3.4473303961838546e-06, + "loss": 0.7665, + "num_tokens": 57860276685.0, + "step": 13843 + }, + { + "epoch": 1.6451574569221628, + "grad_norm": 0.230779466841372, + "learning_rate": 3.4463886148926695e-06, + "loss": 0.7683, + "num_tokens": 57864466083.0, + "step": 13844 + }, + { + "epoch": 1.6452762923351159, + "grad_norm": 0.23669863719683867, + "learning_rate": 3.4454471133372536e-06, + "loss": 0.7961, + "num_tokens": 57868616545.0, + "step": 13845 + }, + { + "epoch": 1.645395127748069, + "grad_norm": 0.2284247435753429, + "learning_rate": 3.4445058915524723e-06, + "loss": 0.8244, + "num_tokens": 57872807271.0, + "step": 13846 + }, + { + "epoch": 1.6455139631610218, + "grad_norm": 0.24479362861047754, + "learning_rate": 3.4435649495731844e-06, + "loss": 0.8459, + "num_tokens": 57876978300.0, + "step": 13847 + }, + { + "epoch": 1.645632798573975, + "grad_norm": 0.23057606872358266, + "learning_rate": 3.4426242874342374e-06, + "loss": 0.7818, + "num_tokens": 57881150949.0, + "step": 13848 + }, + { + "epoch": 1.645751633986928, + "grad_norm": 0.31042605771536697, + "learning_rate": 3.4416839051704637e-06, + "loss": 0.8214, + "num_tokens": 57885339500.0, + "step": 13849 + }, + { + "epoch": 1.6458704693998811, + "grad_norm": 0.25814303184719695, + "learning_rate": 3.440743802816691e-06, + "loss": 0.7921, + "num_tokens": 57889521058.0, + "step": 13850 + }, + { + "epoch": 1.6459893048128342, + "grad_norm": 0.2614489127524391, + "learning_rate": 3.4398039804077334e-06, + "loss": 0.7934, + "num_tokens": 57893709563.0, + "step": 13851 + }, + { + "epoch": 1.6461081402257873, + "grad_norm": 0.2381686703520503, + "learning_rate": 3.438864437978392e-06, + "loss": 0.7826, + "num_tokens": 57897900301.0, + "step": 13852 + }, + { + "epoch": 1.6462269756387404, + "grad_norm": 0.2414522803755948, + "learning_rate": 3.437925175563472e-06, + "loss": 0.8234, + "num_tokens": 57902090477.0, + "step": 13853 + }, + { + "epoch": 1.6463458110516935, + "grad_norm": 0.2543913695199874, + "learning_rate": 3.4369861931977467e-06, + "loss": 0.8107, + "num_tokens": 57906246397.0, + "step": 13854 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.2580256186836699, + "learning_rate": 3.4360474909159953e-06, + "loss": 0.8273, + "num_tokens": 57910418975.0, + "step": 13855 + }, + { + "epoch": 1.6465834818775995, + "grad_norm": 0.2514035985470156, + "learning_rate": 3.435109068752978e-06, + "loss": 0.7898, + "num_tokens": 57914609197.0, + "step": 13856 + }, + { + "epoch": 1.6467023172905526, + "grad_norm": 0.2543603032264446, + "learning_rate": 3.4341709267434513e-06, + "loss": 0.8348, + "num_tokens": 57918797114.0, + "step": 13857 + }, + { + "epoch": 1.6468211527035055, + "grad_norm": 0.24899976989764122, + "learning_rate": 3.4332330649221547e-06, + "loss": 0.8153, + "num_tokens": 57922960740.0, + "step": 13858 + }, + { + "epoch": 1.6469399881164586, + "grad_norm": 0.24903346511559005, + "learning_rate": 3.432295483323822e-06, + "loss": 0.7907, + "num_tokens": 57927150895.0, + "step": 13859 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.250479643122442, + "learning_rate": 3.431358181983174e-06, + "loss": 0.8087, + "num_tokens": 57931329539.0, + "step": 13860 + }, + { + "epoch": 1.6471776589423648, + "grad_norm": 0.232944284007157, + "learning_rate": 3.430421160934923e-06, + "loss": 0.8038, + "num_tokens": 57935518738.0, + "step": 13861 + }, + { + "epoch": 1.6472964943553179, + "grad_norm": 0.24346320070024483, + "learning_rate": 3.42948442021377e-06, + "loss": 0.7959, + "num_tokens": 57939707247.0, + "step": 13862 + }, + { + "epoch": 1.647415329768271, + "grad_norm": 0.2346684310581977, + "learning_rate": 3.4285479598544054e-06, + "loss": 0.7956, + "num_tokens": 57943896464.0, + "step": 13863 + }, + { + "epoch": 1.647534165181224, + "grad_norm": 0.2326231433600066, + "learning_rate": 3.427611779891512e-06, + "loss": 0.8224, + "num_tokens": 57948072193.0, + "step": 13864 + }, + { + "epoch": 1.6476530005941772, + "grad_norm": 0.24569243145069364, + "learning_rate": 3.426675880359751e-06, + "loss": 0.8319, + "num_tokens": 57952262652.0, + "step": 13865 + }, + { + "epoch": 1.6477718360071303, + "grad_norm": 0.23545892568772303, + "learning_rate": 3.425740261293793e-06, + "loss": 0.8362, + "num_tokens": 57956453092.0, + "step": 13866 + }, + { + "epoch": 1.6478906714200832, + "grad_norm": 0.23617766274845764, + "learning_rate": 3.4248049227282807e-06, + "loss": 0.7871, + "num_tokens": 57960642408.0, + "step": 13867 + }, + { + "epoch": 1.6480095068330363, + "grad_norm": 0.22939202342344833, + "learning_rate": 3.4238698646978542e-06, + "loss": 0.8503, + "num_tokens": 57964801021.0, + "step": 13868 + }, + { + "epoch": 1.6481283422459891, + "grad_norm": 0.2607403880763565, + "learning_rate": 3.4229350872371453e-06, + "loss": 0.7918, + "num_tokens": 57968990839.0, + "step": 13869 + }, + { + "epoch": 1.6482471776589422, + "grad_norm": 0.22737437182217735, + "learning_rate": 3.4220005903807674e-06, + "loss": 0.8352, + "num_tokens": 57973154759.0, + "step": 13870 + }, + { + "epoch": 1.6483660130718953, + "grad_norm": 0.243532468841829, + "learning_rate": 3.4210663741633253e-06, + "loss": 0.8174, + "num_tokens": 57977337246.0, + "step": 13871 + }, + { + "epoch": 1.6484848484848484, + "grad_norm": 0.23064249223066915, + "learning_rate": 3.4201324386194236e-06, + "loss": 0.8256, + "num_tokens": 57981526012.0, + "step": 13872 + }, + { + "epoch": 1.6486036838978015, + "grad_norm": 0.23409803959242476, + "learning_rate": 3.419198783783646e-06, + "loss": 0.8389, + "num_tokens": 57985715473.0, + "step": 13873 + }, + { + "epoch": 1.6487225193107546, + "grad_norm": 0.2270762741583517, + "learning_rate": 3.41826540969057e-06, + "loss": 0.8241, + "num_tokens": 57989878035.0, + "step": 13874 + }, + { + "epoch": 1.6488413547237077, + "grad_norm": 0.2446032368171022, + "learning_rate": 3.4173323163747585e-06, + "loss": 0.8322, + "num_tokens": 57994066180.0, + "step": 13875 + }, + { + "epoch": 1.6489601901366608, + "grad_norm": 0.22059902525917705, + "learning_rate": 3.4163995038707694e-06, + "loss": 0.8024, + "num_tokens": 57998256571.0, + "step": 13876 + }, + { + "epoch": 1.649079025549614, + "grad_norm": 0.22964573925223147, + "learning_rate": 3.415466972213145e-06, + "loss": 0.816, + "num_tokens": 58002445574.0, + "step": 13877 + }, + { + "epoch": 1.6491978609625668, + "grad_norm": 0.2344903306789722, + "learning_rate": 3.4145347214364246e-06, + "loss": 0.784, + "num_tokens": 58006602890.0, + "step": 13878 + }, + { + "epoch": 1.64931669637552, + "grad_norm": 0.2173939256089507, + "learning_rate": 3.4136027515751326e-06, + "loss": 0.8041, + "num_tokens": 58010792034.0, + "step": 13879 + }, + { + "epoch": 1.649435531788473, + "grad_norm": 0.22509177538898345, + "learning_rate": 3.4126710626637784e-06, + "loss": 0.8331, + "num_tokens": 58014978333.0, + "step": 13880 + }, + { + "epoch": 1.649554367201426, + "grad_norm": 0.2333915786502581, + "learning_rate": 3.41173965473687e-06, + "loss": 0.7901, + "num_tokens": 58019167259.0, + "step": 13881 + }, + { + "epoch": 1.649673202614379, + "grad_norm": 0.22390418281576846, + "learning_rate": 3.410808527828896e-06, + "loss": 0.8277, + "num_tokens": 58023354532.0, + "step": 13882 + }, + { + "epoch": 1.649792038027332, + "grad_norm": 0.22862180969340803, + "learning_rate": 3.409877681974344e-06, + "loss": 0.8079, + "num_tokens": 58027520668.0, + "step": 13883 + }, + { + "epoch": 1.6499108734402852, + "grad_norm": 0.21619308335569926, + "learning_rate": 3.4089471172076834e-06, + "loss": 0.8053, + "num_tokens": 58031710169.0, + "step": 13884 + }, + { + "epoch": 1.6500297088532383, + "grad_norm": 0.2310189796169811, + "learning_rate": 3.4080168335633766e-06, + "loss": 0.7733, + "num_tokens": 58035898614.0, + "step": 13885 + }, + { + "epoch": 1.6501485442661914, + "grad_norm": 0.2304068923638928, + "learning_rate": 3.407086831075876e-06, + "loss": 0.8282, + "num_tokens": 58040087775.0, + "step": 13886 + }, + { + "epoch": 1.6502673796791445, + "grad_norm": 0.22679637782542456, + "learning_rate": 3.406157109779623e-06, + "loss": 0.8145, + "num_tokens": 58044259201.0, + "step": 13887 + }, + { + "epoch": 1.6503862150920976, + "grad_norm": 0.23865898843446415, + "learning_rate": 3.405227669709047e-06, + "loss": 0.8354, + "num_tokens": 58048420012.0, + "step": 13888 + }, + { + "epoch": 1.6505050505050505, + "grad_norm": 0.24265133496886326, + "learning_rate": 3.404298510898568e-06, + "loss": 0.8209, + "num_tokens": 58052608416.0, + "step": 13889 + }, + { + "epoch": 1.6506238859180036, + "grad_norm": 0.23373441871677994, + "learning_rate": 3.403369633382598e-06, + "loss": 0.7985, + "num_tokens": 58056798333.0, + "step": 13890 + }, + { + "epoch": 1.6507427213309567, + "grad_norm": 0.27282557796589973, + "learning_rate": 3.4024410371955354e-06, + "loss": 0.8056, + "num_tokens": 58060972537.0, + "step": 13891 + }, + { + "epoch": 1.6508615567439096, + "grad_norm": 0.2282644988387067, + "learning_rate": 3.401512722371768e-06, + "loss": 0.7969, + "num_tokens": 58065136401.0, + "step": 13892 + }, + { + "epoch": 1.6509803921568627, + "grad_norm": 0.23539695331206026, + "learning_rate": 3.400584688945677e-06, + "loss": 0.8176, + "num_tokens": 58069325127.0, + "step": 13893 + }, + { + "epoch": 1.6510992275698158, + "grad_norm": 0.22607247967899397, + "learning_rate": 3.3996569369516287e-06, + "loss": 0.7822, + "num_tokens": 58073514872.0, + "step": 13894 + }, + { + "epoch": 1.6512180629827689, + "grad_norm": 0.24378186794766252, + "learning_rate": 3.398729466423983e-06, + "loss": 0.7955, + "num_tokens": 58077648408.0, + "step": 13895 + }, + { + "epoch": 1.651336898395722, + "grad_norm": 0.23589301852304684, + "learning_rate": 3.3978022773970844e-06, + "loss": 0.7963, + "num_tokens": 58081825124.0, + "step": 13896 + }, + { + "epoch": 1.651455733808675, + "grad_norm": 0.23966332359994108, + "learning_rate": 3.396875369905267e-06, + "loss": 0.7825, + "num_tokens": 58085989466.0, + "step": 13897 + }, + { + "epoch": 1.6515745692216282, + "grad_norm": 0.2287373673388237, + "learning_rate": 3.3959487439828655e-06, + "loss": 0.812, + "num_tokens": 58090178567.0, + "step": 13898 + }, + { + "epoch": 1.6516934046345813, + "grad_norm": 0.23734002372462706, + "learning_rate": 3.3950223996641913e-06, + "loss": 0.781, + "num_tokens": 58094368266.0, + "step": 13899 + }, + { + "epoch": 1.6518122400475341, + "grad_norm": 0.2407635818538538, + "learning_rate": 3.394096336983553e-06, + "loss": 0.7853, + "num_tokens": 58098528680.0, + "step": 13900 + }, + { + "epoch": 1.6519310754604872, + "grad_norm": 0.24059736727106384, + "learning_rate": 3.3931705559752415e-06, + "loss": 0.8298, + "num_tokens": 58102717097.0, + "step": 13901 + }, + { + "epoch": 1.6520499108734403, + "grad_norm": 0.23680913616027205, + "learning_rate": 3.392245056673543e-06, + "loss": 0.8449, + "num_tokens": 58106902489.0, + "step": 13902 + }, + { + "epoch": 1.6521687462863932, + "grad_norm": 0.24415678746118627, + "learning_rate": 3.3913198391127313e-06, + "loss": 0.7935, + "num_tokens": 58111090300.0, + "step": 13903 + }, + { + "epoch": 1.6522875816993463, + "grad_norm": 0.24686614641641746, + "learning_rate": 3.390394903327073e-06, + "loss": 0.772, + "num_tokens": 58115267464.0, + "step": 13904 + }, + { + "epoch": 1.6524064171122994, + "grad_norm": 0.23911798325948705, + "learning_rate": 3.3894702493508223e-06, + "loss": 0.7975, + "num_tokens": 58119431870.0, + "step": 13905 + }, + { + "epoch": 1.6525252525252525, + "grad_norm": 0.23404942445901233, + "learning_rate": 3.3885458772182183e-06, + "loss": 0.8061, + "num_tokens": 58123621316.0, + "step": 13906 + }, + { + "epoch": 1.6526440879382056, + "grad_norm": 0.2303729286650262, + "learning_rate": 3.387621786963494e-06, + "loss": 0.8187, + "num_tokens": 58127802810.0, + "step": 13907 + }, + { + "epoch": 1.6527629233511587, + "grad_norm": 0.24633496706034166, + "learning_rate": 3.3866979786208753e-06, + "loss": 0.7969, + "num_tokens": 58131938942.0, + "step": 13908 + }, + { + "epoch": 1.6528817587641118, + "grad_norm": 0.23743847680749602, + "learning_rate": 3.3857744522245683e-06, + "loss": 0.8287, + "num_tokens": 58136112977.0, + "step": 13909 + }, + { + "epoch": 1.653000594177065, + "grad_norm": 0.24268314998318938, + "learning_rate": 3.3848512078087815e-06, + "loss": 0.7803, + "num_tokens": 58140300086.0, + "step": 13910 + }, + { + "epoch": 1.6531194295900178, + "grad_norm": 0.23291084309113114, + "learning_rate": 3.3839282454077004e-06, + "loss": 0.8004, + "num_tokens": 58144489680.0, + "step": 13911 + }, + { + "epoch": 1.6532382650029709, + "grad_norm": 0.23514928390401002, + "learning_rate": 3.383005565055506e-06, + "loss": 0.7989, + "num_tokens": 58148679318.0, + "step": 13912 + }, + { + "epoch": 1.653357100415924, + "grad_norm": 0.2577108829159997, + "learning_rate": 3.38208316678637e-06, + "loss": 0.8001, + "num_tokens": 58152869444.0, + "step": 13913 + }, + { + "epoch": 1.6534759358288769, + "grad_norm": 0.2489687188892853, + "learning_rate": 3.3811610506344517e-06, + "loss": 0.8, + "num_tokens": 58157060115.0, + "step": 13914 + }, + { + "epoch": 1.65359477124183, + "grad_norm": 0.2517638204204615, + "learning_rate": 3.3802392166338993e-06, + "loss": 0.736, + "num_tokens": 58161250250.0, + "step": 13915 + }, + { + "epoch": 1.653713606654783, + "grad_norm": 0.26965081715232353, + "learning_rate": 3.379317664818851e-06, + "loss": 0.7979, + "num_tokens": 58165439610.0, + "step": 13916 + }, + { + "epoch": 1.6538324420677362, + "grad_norm": 0.2292972384771402, + "learning_rate": 3.3783963952234357e-06, + "loss": 0.8418, + "num_tokens": 58169629435.0, + "step": 13917 + }, + { + "epoch": 1.6539512774806893, + "grad_norm": 0.27744654954793574, + "learning_rate": 3.377475407881771e-06, + "loss": 0.8091, + "num_tokens": 58173804091.0, + "step": 13918 + }, + { + "epoch": 1.6540701128936424, + "grad_norm": 0.26056205604364013, + "learning_rate": 3.3765547028279656e-06, + "loss": 0.8187, + "num_tokens": 58177965661.0, + "step": 13919 + }, + { + "epoch": 1.6541889483065955, + "grad_norm": 0.2500346469795904, + "learning_rate": 3.375634280096114e-06, + "loss": 0.8104, + "num_tokens": 58182082000.0, + "step": 13920 + }, + { + "epoch": 1.6543077837195486, + "grad_norm": 0.25660550253758996, + "learning_rate": 3.3747141397203064e-06, + "loss": 0.829, + "num_tokens": 58186268438.0, + "step": 13921 + }, + { + "epoch": 1.6544266191325014, + "grad_norm": 0.23138379743417836, + "learning_rate": 3.37379428173461e-06, + "loss": 0.8079, + "num_tokens": 58190458513.0, + "step": 13922 + }, + { + "epoch": 1.6545454545454545, + "grad_norm": 0.24901136595174972, + "learning_rate": 3.3728747061731004e-06, + "loss": 0.8567, + "num_tokens": 58194649092.0, + "step": 13923 + }, + { + "epoch": 1.6546642899584076, + "grad_norm": 0.2674440509746529, + "learning_rate": 3.3719554130698277e-06, + "loss": 0.7944, + "num_tokens": 58198836110.0, + "step": 13924 + }, + { + "epoch": 1.6547831253713605, + "grad_norm": 0.24332982022517907, + "learning_rate": 3.3710364024588366e-06, + "loss": 0.8028, + "num_tokens": 58203026361.0, + "step": 13925 + }, + { + "epoch": 1.6549019607843136, + "grad_norm": 0.2505332090072076, + "learning_rate": 3.3701176743741638e-06, + "loss": 0.8021, + "num_tokens": 58207215307.0, + "step": 13926 + }, + { + "epoch": 1.6550207961972667, + "grad_norm": 0.22890191308550129, + "learning_rate": 3.3691992288498286e-06, + "loss": 0.8188, + "num_tokens": 58211403764.0, + "step": 13927 + }, + { + "epoch": 1.6551396316102198, + "grad_norm": 0.23895084834489194, + "learning_rate": 3.368281065919845e-06, + "loss": 0.7924, + "num_tokens": 58215592849.0, + "step": 13928 + }, + { + "epoch": 1.655258467023173, + "grad_norm": 0.23329900671744594, + "learning_rate": 3.3673631856182188e-06, + "loss": 0.8192, + "num_tokens": 58219753978.0, + "step": 13929 + }, + { + "epoch": 1.655377302436126, + "grad_norm": 0.22498672203383413, + "learning_rate": 3.3664455879789405e-06, + "loss": 0.8263, + "num_tokens": 58223909632.0, + "step": 13930 + }, + { + "epoch": 1.6554961378490791, + "grad_norm": 0.24137549608354777, + "learning_rate": 3.3655282730359927e-06, + "loss": 0.769, + "num_tokens": 58228059374.0, + "step": 13931 + }, + { + "epoch": 1.6556149732620322, + "grad_norm": 0.23966744620247016, + "learning_rate": 3.3646112408233445e-06, + "loss": 0.8022, + "num_tokens": 58232200256.0, + "step": 13932 + }, + { + "epoch": 1.655733808674985, + "grad_norm": 0.23126502342800576, + "learning_rate": 3.363694491374957e-06, + "loss": 0.8149, + "num_tokens": 58236371747.0, + "step": 13933 + }, + { + "epoch": 1.6558526440879382, + "grad_norm": 0.2340713901630476, + "learning_rate": 3.3627780247247834e-06, + "loss": 0.7934, + "num_tokens": 58240560506.0, + "step": 13934 + }, + { + "epoch": 1.6559714795008913, + "grad_norm": 0.25047220432197675, + "learning_rate": 3.361861840906758e-06, + "loss": 0.8218, + "num_tokens": 58244748955.0, + "step": 13935 + }, + { + "epoch": 1.6560903149138442, + "grad_norm": 0.23273889999368289, + "learning_rate": 3.3609459399548185e-06, + "loss": 0.8187, + "num_tokens": 58248936398.0, + "step": 13936 + }, + { + "epoch": 1.6562091503267973, + "grad_norm": 0.25447002058173135, + "learning_rate": 3.3600303219028768e-06, + "loss": 0.823, + "num_tokens": 58253102026.0, + "step": 13937 + }, + { + "epoch": 1.6563279857397504, + "grad_norm": 0.236738689615371, + "learning_rate": 3.359114986784844e-06, + "loss": 0.8169, + "num_tokens": 58257291819.0, + "step": 13938 + }, + { + "epoch": 1.6564468211527035, + "grad_norm": 0.24393015037561785, + "learning_rate": 3.3581999346346173e-06, + "loss": 0.7766, + "num_tokens": 58261480521.0, + "step": 13939 + }, + { + "epoch": 1.6565656565656566, + "grad_norm": 0.21994279684489237, + "learning_rate": 3.3572851654860837e-06, + "loss": 0.788, + "num_tokens": 58265669254.0, + "step": 13940 + }, + { + "epoch": 1.6566844919786097, + "grad_norm": 0.2370329398173303, + "learning_rate": 3.356370679373122e-06, + "loss": 0.8063, + "num_tokens": 58269858874.0, + "step": 13941 + }, + { + "epoch": 1.6568033273915628, + "grad_norm": 0.23401292471471064, + "learning_rate": 3.3554564763295983e-06, + "loss": 0.797, + "num_tokens": 58274027800.0, + "step": 13942 + }, + { + "epoch": 1.6569221628045159, + "grad_norm": 0.2350935547911483, + "learning_rate": 3.3545425563893675e-06, + "loss": 0.8141, + "num_tokens": 58278215279.0, + "step": 13943 + }, + { + "epoch": 1.6570409982174688, + "grad_norm": 0.2704449536429189, + "learning_rate": 3.353628919586276e-06, + "loss": 0.794, + "num_tokens": 58282402426.0, + "step": 13944 + }, + { + "epoch": 1.6571598336304219, + "grad_norm": 0.23750447254333418, + "learning_rate": 3.3527155659541578e-06, + "loss": 0.8037, + "num_tokens": 58286582564.0, + "step": 13945 + }, + { + "epoch": 1.657278669043375, + "grad_norm": 0.2580967306927206, + "learning_rate": 3.3518024955268398e-06, + "loss": 0.8671, + "num_tokens": 58290770557.0, + "step": 13946 + }, + { + "epoch": 1.6573975044563278, + "grad_norm": 0.22610064352690448, + "learning_rate": 3.350889708338133e-06, + "loss": 0.787, + "num_tokens": 58294960323.0, + "step": 13947 + }, + { + "epoch": 1.657516339869281, + "grad_norm": 0.22684009119396153, + "learning_rate": 3.3499772044218436e-06, + "loss": 0.8355, + "num_tokens": 58299108090.0, + "step": 13948 + }, + { + "epoch": 1.657635175282234, + "grad_norm": 0.22267494080148845, + "learning_rate": 3.349064983811763e-06, + "loss": 0.782, + "num_tokens": 58303248419.0, + "step": 13949 + }, + { + "epoch": 1.6577540106951871, + "grad_norm": 0.2411878179153169, + "learning_rate": 3.3481530465416743e-06, + "loss": 0.7967, + "num_tokens": 58307437418.0, + "step": 13950 + }, + { + "epoch": 1.6578728461081402, + "grad_norm": 0.2301531769434567, + "learning_rate": 3.3472413926453507e-06, + "loss": 0.8173, + "num_tokens": 58311626788.0, + "step": 13951 + }, + { + "epoch": 1.6579916815210933, + "grad_norm": 0.23912169486601717, + "learning_rate": 3.346330022156552e-06, + "loss": 0.7814, + "num_tokens": 58315815272.0, + "step": 13952 + }, + { + "epoch": 1.6581105169340464, + "grad_norm": 0.24412885864469805, + "learning_rate": 3.3454189351090334e-06, + "loss": 0.8209, + "num_tokens": 58319976345.0, + "step": 13953 + }, + { + "epoch": 1.6582293523469995, + "grad_norm": 0.2664253628026076, + "learning_rate": 3.344508131536527e-06, + "loss": 0.8018, + "num_tokens": 58324110060.0, + "step": 13954 + }, + { + "epoch": 1.6583481877599526, + "grad_norm": 0.2512936689104551, + "learning_rate": 3.343597611472771e-06, + "loss": 0.8266, + "num_tokens": 58328285605.0, + "step": 13955 + }, + { + "epoch": 1.6584670231729055, + "grad_norm": 0.2533437914251002, + "learning_rate": 3.342687374951482e-06, + "loss": 0.8056, + "num_tokens": 58332474694.0, + "step": 13956 + }, + { + "epoch": 1.6585858585858586, + "grad_norm": 0.227460749812973, + "learning_rate": 3.3417774220063705e-06, + "loss": 0.8122, + "num_tokens": 58336665776.0, + "step": 13957 + }, + { + "epoch": 1.6587046939988115, + "grad_norm": 0.24006791745695766, + "learning_rate": 3.3408677526711347e-06, + "loss": 0.8332, + "num_tokens": 58340840129.0, + "step": 13958 + }, + { + "epoch": 1.6588235294117646, + "grad_norm": 0.2290965360176292, + "learning_rate": 3.339958366979461e-06, + "loss": 0.7915, + "num_tokens": 58344986231.0, + "step": 13959 + }, + { + "epoch": 1.6589423648247177, + "grad_norm": 0.23729842213237692, + "learning_rate": 3.3390492649650264e-06, + "loss": 0.8049, + "num_tokens": 58349167669.0, + "step": 13960 + }, + { + "epoch": 1.6590612002376708, + "grad_norm": 0.2198130608136388, + "learning_rate": 3.338140446661502e-06, + "loss": 0.8085, + "num_tokens": 58353357121.0, + "step": 13961 + }, + { + "epoch": 1.6591800356506239, + "grad_norm": 0.23630310422900744, + "learning_rate": 3.337231912102541e-06, + "loss": 0.8107, + "num_tokens": 58357541538.0, + "step": 13962 + }, + { + "epoch": 1.659298871063577, + "grad_norm": 0.24295074919068826, + "learning_rate": 3.3363236613217936e-06, + "loss": 0.8079, + "num_tokens": 58361720789.0, + "step": 13963 + }, + { + "epoch": 1.65941770647653, + "grad_norm": 0.24607184245343222, + "learning_rate": 3.3354156943528903e-06, + "loss": 0.822, + "num_tokens": 58365909952.0, + "step": 13964 + }, + { + "epoch": 1.6595365418894832, + "grad_norm": 0.2290664764555526, + "learning_rate": 3.334508011229459e-06, + "loss": 0.7916, + "num_tokens": 58370079291.0, + "step": 13965 + }, + { + "epoch": 1.6596553773024363, + "grad_norm": 0.2373578363861794, + "learning_rate": 3.333600611985113e-06, + "loss": 0.8395, + "num_tokens": 58374267933.0, + "step": 13966 + }, + { + "epoch": 1.6597742127153892, + "grad_norm": 0.23272447480703665, + "learning_rate": 3.332693496653455e-06, + "loss": 0.8381, + "num_tokens": 58378422807.0, + "step": 13967 + }, + { + "epoch": 1.6598930481283423, + "grad_norm": 0.2259832694500859, + "learning_rate": 3.331786665268085e-06, + "loss": 0.7723, + "num_tokens": 58382609963.0, + "step": 13968 + }, + { + "epoch": 1.6600118835412951, + "grad_norm": 0.24601170003317863, + "learning_rate": 3.3308801178625793e-06, + "loss": 0.8141, + "num_tokens": 58386776071.0, + "step": 13969 + }, + { + "epoch": 1.6601307189542482, + "grad_norm": 0.2354767411842598, + "learning_rate": 3.329973854470513e-06, + "loss": 0.8045, + "num_tokens": 58390944325.0, + "step": 13970 + }, + { + "epoch": 1.6602495543672013, + "grad_norm": 0.2275669238030999, + "learning_rate": 3.3290678751254478e-06, + "loss": 0.8143, + "num_tokens": 58395133137.0, + "step": 13971 + }, + { + "epoch": 1.6603683897801544, + "grad_norm": 0.21767913582146473, + "learning_rate": 3.328162179860935e-06, + "loss": 0.7921, + "num_tokens": 58399284845.0, + "step": 13972 + }, + { + "epoch": 1.6604872251931075, + "grad_norm": 0.2188961664566685, + "learning_rate": 3.3272567687105156e-06, + "loss": 0.7968, + "num_tokens": 58403474203.0, + "step": 13973 + }, + { + "epoch": 1.6606060606060606, + "grad_norm": 0.22756490111797043, + "learning_rate": 3.3263516417077213e-06, + "loss": 0.7914, + "num_tokens": 58407664559.0, + "step": 13974 + }, + { + "epoch": 1.6607248960190137, + "grad_norm": 0.21800690557245542, + "learning_rate": 3.3254467988860694e-06, + "loss": 0.7826, + "num_tokens": 58411841635.0, + "step": 13975 + }, + { + "epoch": 1.6608437314319668, + "grad_norm": 0.2539927238303912, + "learning_rate": 3.3245422402790715e-06, + "loss": 0.7848, + "num_tokens": 58416031034.0, + "step": 13976 + }, + { + "epoch": 1.66096256684492, + "grad_norm": 0.22467954903972603, + "learning_rate": 3.323637965920226e-06, + "loss": 0.7794, + "num_tokens": 58420220462.0, + "step": 13977 + }, + { + "epoch": 1.6610814022578728, + "grad_norm": 0.23402818991635205, + "learning_rate": 3.3227339758430215e-06, + "loss": 0.8128, + "num_tokens": 58424390711.0, + "step": 13978 + }, + { + "epoch": 1.661200237670826, + "grad_norm": 0.23878125917593557, + "learning_rate": 3.3218302700809344e-06, + "loss": 0.7739, + "num_tokens": 58428580691.0, + "step": 13979 + }, + { + "epoch": 1.661319073083779, + "grad_norm": 0.23215209185348046, + "learning_rate": 3.320926848667433e-06, + "loss": 0.8187, + "num_tokens": 58432758058.0, + "step": 13980 + }, + { + "epoch": 1.661437908496732, + "grad_norm": 0.217073326053617, + "learning_rate": 3.3200237116359745e-06, + "loss": 0.8119, + "num_tokens": 58436911807.0, + "step": 13981 + }, + { + "epoch": 1.661556743909685, + "grad_norm": 0.2369893288309445, + "learning_rate": 3.319120859020004e-06, + "loss": 0.8368, + "num_tokens": 58441101218.0, + "step": 13982 + }, + { + "epoch": 1.661675579322638, + "grad_norm": 0.23130839549905316, + "learning_rate": 3.3182182908529583e-06, + "loss": 0.827, + "num_tokens": 58445266678.0, + "step": 13983 + }, + { + "epoch": 1.6617944147355912, + "grad_norm": 0.23660490670679962, + "learning_rate": 3.317316007168264e-06, + "loss": 0.798, + "num_tokens": 58449448261.0, + "step": 13984 + }, + { + "epoch": 1.6619132501485443, + "grad_norm": 0.23475193313627643, + "learning_rate": 3.3164140079993314e-06, + "loss": 0.8018, + "num_tokens": 58453629418.0, + "step": 13985 + }, + { + "epoch": 1.6620320855614974, + "grad_norm": 0.23812363649413737, + "learning_rate": 3.3155122933795646e-06, + "loss": 0.7937, + "num_tokens": 58457818231.0, + "step": 13986 + }, + { + "epoch": 1.6621509209744505, + "grad_norm": 0.24476517178827725, + "learning_rate": 3.3146108633423624e-06, + "loss": 0.817, + "num_tokens": 58461974438.0, + "step": 13987 + }, + { + "epoch": 1.6622697563874036, + "grad_norm": 0.2535121755194117, + "learning_rate": 3.313709717921103e-06, + "loss": 0.8363, + "num_tokens": 58466154873.0, + "step": 13988 + }, + { + "epoch": 1.6623885918003565, + "grad_norm": 0.23862248334385192, + "learning_rate": 3.3128088571491647e-06, + "loss": 0.8098, + "num_tokens": 58470343241.0, + "step": 13989 + }, + { + "epoch": 1.6625074272133096, + "grad_norm": 0.2634629116919496, + "learning_rate": 3.3119082810599025e-06, + "loss": 0.8272, + "num_tokens": 58474499432.0, + "step": 13990 + }, + { + "epoch": 1.6626262626262627, + "grad_norm": 0.24130829830524866, + "learning_rate": 3.3110079896866708e-06, + "loss": 0.7991, + "num_tokens": 58478688824.0, + "step": 13991 + }, + { + "epoch": 1.6627450980392156, + "grad_norm": 0.24632443052321942, + "learning_rate": 3.3101079830628093e-06, + "loss": 0.8088, + "num_tokens": 58482865398.0, + "step": 13992 + }, + { + "epoch": 1.6628639334521687, + "grad_norm": 0.22118754041948635, + "learning_rate": 3.30920826122165e-06, + "loss": 0.7973, + "num_tokens": 58487055128.0, + "step": 13993 + }, + { + "epoch": 1.6629827688651218, + "grad_norm": 0.24729225159024515, + "learning_rate": 3.3083088241965146e-06, + "loss": 0.8038, + "num_tokens": 58491244695.0, + "step": 13994 + }, + { + "epoch": 1.6631016042780749, + "grad_norm": 0.2381700183582615, + "learning_rate": 3.3074096720207076e-06, + "loss": 0.7965, + "num_tokens": 58495434312.0, + "step": 13995 + }, + { + "epoch": 1.663220439691028, + "grad_norm": 0.2441969511965596, + "learning_rate": 3.306510804727529e-06, + "loss": 0.7845, + "num_tokens": 58499624545.0, + "step": 13996 + }, + { + "epoch": 1.663339275103981, + "grad_norm": 0.23713885158017114, + "learning_rate": 3.3056122223502683e-06, + "loss": 0.8182, + "num_tokens": 58503801123.0, + "step": 13997 + }, + { + "epoch": 1.6634581105169342, + "grad_norm": 0.23757426903749468, + "learning_rate": 3.3047139249222e-06, + "loss": 0.7838, + "num_tokens": 58507989296.0, + "step": 13998 + }, + { + "epoch": 1.6635769459298873, + "grad_norm": 0.23546034053513928, + "learning_rate": 3.3038159124765976e-06, + "loss": 0.8159, + "num_tokens": 58512178674.0, + "step": 13999 + }, + { + "epoch": 1.6636957813428401, + "grad_norm": 0.23649186605005135, + "learning_rate": 3.3029181850467126e-06, + "loss": 0.8345, + "num_tokens": 58516322093.0, + "step": 14000 + }, + { + "epoch": 1.6638146167557932, + "grad_norm": 0.24683694863965225, + "learning_rate": 3.3020207426657903e-06, + "loss": 0.8053, + "num_tokens": 58520511032.0, + "step": 14001 + }, + { + "epoch": 1.6639334521687463, + "grad_norm": 0.2287324005046835, + "learning_rate": 3.3011235853670676e-06, + "loss": 0.8359, + "num_tokens": 58524680214.0, + "step": 14002 + }, + { + "epoch": 1.6640522875816992, + "grad_norm": 0.25782261668764506, + "learning_rate": 3.3002267131837685e-06, + "loss": 0.7951, + "num_tokens": 58528869578.0, + "step": 14003 + }, + { + "epoch": 1.6641711229946523, + "grad_norm": 0.2337679774957048, + "learning_rate": 3.2993301261491084e-06, + "loss": 0.8219, + "num_tokens": 58533048648.0, + "step": 14004 + }, + { + "epoch": 1.6642899584076054, + "grad_norm": 0.2520327910365632, + "learning_rate": 3.29843382429629e-06, + "loss": 0.8407, + "num_tokens": 58537236528.0, + "step": 14005 + }, + { + "epoch": 1.6644087938205585, + "grad_norm": 0.2273755182700143, + "learning_rate": 3.2975378076585074e-06, + "loss": 0.8014, + "num_tokens": 58541403562.0, + "step": 14006 + }, + { + "epoch": 1.6645276292335116, + "grad_norm": 0.23837919902327454, + "learning_rate": 3.2966420762689415e-06, + "loss": 0.8111, + "num_tokens": 58545593555.0, + "step": 14007 + }, + { + "epoch": 1.6646464646464647, + "grad_norm": 0.24000351138412845, + "learning_rate": 3.2957466301607664e-06, + "loss": 0.7704, + "num_tokens": 58549758815.0, + "step": 14008 + }, + { + "epoch": 1.6647653000594178, + "grad_norm": 0.22874257889370136, + "learning_rate": 3.2948514693671405e-06, + "loss": 0.8158, + "num_tokens": 58553892231.0, + "step": 14009 + }, + { + "epoch": 1.664884135472371, + "grad_norm": 0.24210572026188265, + "learning_rate": 3.2939565939212205e-06, + "loss": 0.818, + "num_tokens": 58558050382.0, + "step": 14010 + }, + { + "epoch": 1.6650029708853238, + "grad_norm": 0.2281646214055108, + "learning_rate": 3.2930620038561375e-06, + "loss": 0.8165, + "num_tokens": 58562219587.0, + "step": 14011 + }, + { + "epoch": 1.6651218062982769, + "grad_norm": 0.23385875200085554, + "learning_rate": 3.292167699205028e-06, + "loss": 0.8164, + "num_tokens": 58566408011.0, + "step": 14012 + }, + { + "epoch": 1.66524064171123, + "grad_norm": 0.21971106215703756, + "learning_rate": 3.291273680001009e-06, + "loss": 0.8063, + "num_tokens": 58570572935.0, + "step": 14013 + }, + { + "epoch": 1.6653594771241829, + "grad_norm": 0.2387576577043689, + "learning_rate": 3.29037994627719e-06, + "loss": 0.8213, + "num_tokens": 58574728624.0, + "step": 14014 + }, + { + "epoch": 1.665478312537136, + "grad_norm": 0.23081888669847514, + "learning_rate": 3.289486498066672e-06, + "loss": 0.8154, + "num_tokens": 58578918379.0, + "step": 14015 + }, + { + "epoch": 1.665597147950089, + "grad_norm": 0.2262236310980227, + "learning_rate": 3.2885933354025353e-06, + "loss": 0.7586, + "num_tokens": 58583108386.0, + "step": 14016 + }, + { + "epoch": 1.6657159833630422, + "grad_norm": 0.23046891808941042, + "learning_rate": 3.287700458317858e-06, + "loss": 0.7883, + "num_tokens": 58587296780.0, + "step": 14017 + }, + { + "epoch": 1.6658348187759953, + "grad_norm": 0.22951656000817358, + "learning_rate": 3.2868078668457115e-06, + "loss": 0.8141, + "num_tokens": 58591475410.0, + "step": 14018 + }, + { + "epoch": 1.6659536541889484, + "grad_norm": 0.22343316922605913, + "learning_rate": 3.2859155610191474e-06, + "loss": 0.813, + "num_tokens": 58595641565.0, + "step": 14019 + }, + { + "epoch": 1.6660724896019015, + "grad_norm": 0.214067576940821, + "learning_rate": 3.285023540871215e-06, + "loss": 0.8429, + "num_tokens": 58599832092.0, + "step": 14020 + }, + { + "epoch": 1.6661913250148546, + "grad_norm": 0.23494189262627152, + "learning_rate": 3.284131806434944e-06, + "loss": 0.8056, + "num_tokens": 58603994523.0, + "step": 14021 + }, + { + "epoch": 1.6663101604278074, + "grad_norm": 0.224379834737497, + "learning_rate": 3.283240357743359e-06, + "loss": 0.8381, + "num_tokens": 58608184151.0, + "step": 14022 + }, + { + "epoch": 1.6664289958407605, + "grad_norm": 0.2319027856231532, + "learning_rate": 3.282349194829476e-06, + "loss": 0.8155, + "num_tokens": 58612348750.0, + "step": 14023 + }, + { + "epoch": 1.6665478312537136, + "grad_norm": 0.23016477019339185, + "learning_rate": 3.2814583177262928e-06, + "loss": 0.817, + "num_tokens": 58616537784.0, + "step": 14024 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.22876586643431446, + "learning_rate": 3.28056772646681e-06, + "loss": 0.8285, + "num_tokens": 58620701911.0, + "step": 14025 + }, + { + "epoch": 1.6667855020796196, + "grad_norm": 0.23797366347726903, + "learning_rate": 3.2796774210840022e-06, + "loss": 0.8159, + "num_tokens": 58624890925.0, + "step": 14026 + }, + { + "epoch": 1.6669043374925727, + "grad_norm": 0.22763197417793996, + "learning_rate": 3.2787874016108427e-06, + "loss": 0.8003, + "num_tokens": 58629065408.0, + "step": 14027 + }, + { + "epoch": 1.6670231729055258, + "grad_norm": 0.23526220769260262, + "learning_rate": 3.2778976680802915e-06, + "loss": 0.7979, + "num_tokens": 58633254920.0, + "step": 14028 + }, + { + "epoch": 1.667142008318479, + "grad_norm": 0.24391510215655782, + "learning_rate": 3.2770082205252984e-06, + "loss": 0.7973, + "num_tokens": 58637444117.0, + "step": 14029 + }, + { + "epoch": 1.667260843731432, + "grad_norm": 0.2384786169489129, + "learning_rate": 3.276119058978804e-06, + "loss": 0.8036, + "num_tokens": 58641633976.0, + "step": 14030 + }, + { + "epoch": 1.6673796791443851, + "grad_norm": 0.24367501313073325, + "learning_rate": 3.2752301834737344e-06, + "loss": 0.7916, + "num_tokens": 58645821250.0, + "step": 14031 + }, + { + "epoch": 1.6674985145573382, + "grad_norm": 0.23205697084787716, + "learning_rate": 3.274341594043011e-06, + "loss": 0.8166, + "num_tokens": 58650010890.0, + "step": 14032 + }, + { + "epoch": 1.667617349970291, + "grad_norm": 0.24419744210839586, + "learning_rate": 3.2734532907195382e-06, + "loss": 0.8229, + "num_tokens": 58654199465.0, + "step": 14033 + }, + { + "epoch": 1.6677361853832442, + "grad_norm": 0.23577533600288997, + "learning_rate": 3.2725652735362146e-06, + "loss": 0.7877, + "num_tokens": 58658370207.0, + "step": 14034 + }, + { + "epoch": 1.6678550207961973, + "grad_norm": 0.23511341418689805, + "learning_rate": 3.271677542525926e-06, + "loss": 0.7878, + "num_tokens": 58662559297.0, + "step": 14035 + }, + { + "epoch": 1.6679738562091502, + "grad_norm": 0.2383395829376421, + "learning_rate": 3.2707900977215468e-06, + "loss": 0.8444, + "num_tokens": 58666748160.0, + "step": 14036 + }, + { + "epoch": 1.6680926916221033, + "grad_norm": 0.2343087485451983, + "learning_rate": 3.269902939155945e-06, + "loss": 0.8107, + "num_tokens": 58670938462.0, + "step": 14037 + }, + { + "epoch": 1.6682115270350564, + "grad_norm": 0.2426474295247452, + "learning_rate": 3.2690160668619725e-06, + "loss": 0.8231, + "num_tokens": 58675127925.0, + "step": 14038 + }, + { + "epoch": 1.6683303624480095, + "grad_norm": 0.23456966044049876, + "learning_rate": 3.268129480872475e-06, + "loss": 0.8297, + "num_tokens": 58679286731.0, + "step": 14039 + }, + { + "epoch": 1.6684491978609626, + "grad_norm": 0.2513565679552808, + "learning_rate": 3.267243181220284e-06, + "loss": 0.7866, + "num_tokens": 58683475389.0, + "step": 14040 + }, + { + "epoch": 1.6685680332739157, + "grad_norm": 0.2337001364227421, + "learning_rate": 3.266357167938226e-06, + "loss": 0.7464, + "num_tokens": 58687665977.0, + "step": 14041 + }, + { + "epoch": 1.6686868686868688, + "grad_norm": 0.24542670376303796, + "learning_rate": 3.265471441059107e-06, + "loss": 0.8157, + "num_tokens": 58691856012.0, + "step": 14042 + }, + { + "epoch": 1.6688057040998219, + "grad_norm": 0.23859921400119866, + "learning_rate": 3.2645860006157308e-06, + "loss": 0.7772, + "num_tokens": 58696044580.0, + "step": 14043 + }, + { + "epoch": 1.6689245395127748, + "grad_norm": 0.2629084894572736, + "learning_rate": 3.26370084664089e-06, + "loss": 0.8245, + "num_tokens": 58700232131.0, + "step": 14044 + }, + { + "epoch": 1.6690433749257279, + "grad_norm": 0.24951202657721006, + "learning_rate": 3.2628159791673644e-06, + "loss": 0.8219, + "num_tokens": 58704409579.0, + "step": 14045 + }, + { + "epoch": 1.669162210338681, + "grad_norm": 0.24651332582735527, + "learning_rate": 3.2619313982279245e-06, + "loss": 0.8201, + "num_tokens": 58708558631.0, + "step": 14046 + }, + { + "epoch": 1.6692810457516338, + "grad_norm": 0.2518944414098619, + "learning_rate": 3.261047103855327e-06, + "loss": 0.7982, + "num_tokens": 58712748210.0, + "step": 14047 + }, + { + "epoch": 1.669399881164587, + "grad_norm": 0.23836732986597217, + "learning_rate": 3.260163096082321e-06, + "loss": 0.8055, + "num_tokens": 58716938756.0, + "step": 14048 + }, + { + "epoch": 1.66951871657754, + "grad_norm": 0.2328865818134259, + "learning_rate": 3.2592793749416417e-06, + "loss": 0.8028, + "num_tokens": 58721099385.0, + "step": 14049 + }, + { + "epoch": 1.6696375519904931, + "grad_norm": 0.23613580216807026, + "learning_rate": 3.258395940466022e-06, + "loss": 0.8046, + "num_tokens": 58725290086.0, + "step": 14050 + }, + { + "epoch": 1.6697563874034462, + "grad_norm": 0.24097783304119516, + "learning_rate": 3.257512792688178e-06, + "loss": 0.7994, + "num_tokens": 58729479811.0, + "step": 14051 + }, + { + "epoch": 1.6698752228163993, + "grad_norm": 0.2721570064898002, + "learning_rate": 3.25662993164081e-06, + "loss": 0.8302, + "num_tokens": 58733646826.0, + "step": 14052 + }, + { + "epoch": 1.6699940582293524, + "grad_norm": 0.22132900651008675, + "learning_rate": 3.255747357356618e-06, + "loss": 0.7991, + "num_tokens": 58737814709.0, + "step": 14053 + }, + { + "epoch": 1.6701128936423055, + "grad_norm": 0.23935475804738227, + "learning_rate": 3.254865069868284e-06, + "loss": 0.8024, + "num_tokens": 58742003416.0, + "step": 14054 + }, + { + "epoch": 1.6702317290552586, + "grad_norm": 0.22429904682896273, + "learning_rate": 3.2539830692084844e-06, + "loss": 0.8111, + "num_tokens": 58746172991.0, + "step": 14055 + }, + { + "epoch": 1.6703505644682115, + "grad_norm": 0.22743631858523242, + "learning_rate": 3.2531013554098807e-06, + "loss": 0.7918, + "num_tokens": 58750362313.0, + "step": 14056 + }, + { + "epoch": 1.6704693998811646, + "grad_norm": 0.22903990272482538, + "learning_rate": 3.2522199285051275e-06, + "loss": 0.8166, + "num_tokens": 58754550970.0, + "step": 14057 + }, + { + "epoch": 1.6705882352941175, + "grad_norm": 0.22796826773758275, + "learning_rate": 3.251338788526866e-06, + "loss": 0.8389, + "num_tokens": 58758727428.0, + "step": 14058 + }, + { + "epoch": 1.6707070707070706, + "grad_norm": 0.23494254095223716, + "learning_rate": 3.250457935507728e-06, + "loss": 0.8217, + "num_tokens": 58762906085.0, + "step": 14059 + }, + { + "epoch": 1.6708259061200237, + "grad_norm": 0.25190712905901236, + "learning_rate": 3.249577369480334e-06, + "loss": 0.8065, + "num_tokens": 58767094062.0, + "step": 14060 + }, + { + "epoch": 1.6709447415329768, + "grad_norm": 0.22867081980694917, + "learning_rate": 3.2486970904772948e-06, + "loss": 0.7777, + "num_tokens": 58771284968.0, + "step": 14061 + }, + { + "epoch": 1.6710635769459299, + "grad_norm": 0.23645544769326077, + "learning_rate": 3.247817098531211e-06, + "loss": 0.7513, + "num_tokens": 58775435024.0, + "step": 14062 + }, + { + "epoch": 1.671182412358883, + "grad_norm": 0.2389727888741112, + "learning_rate": 3.246937393674671e-06, + "loss": 0.7946, + "num_tokens": 58779624023.0, + "step": 14063 + }, + { + "epoch": 1.671301247771836, + "grad_norm": 0.23360258048450191, + "learning_rate": 3.246057975940251e-06, + "loss": 0.8255, + "num_tokens": 58783815355.0, + "step": 14064 + }, + { + "epoch": 1.6714200831847892, + "grad_norm": 0.2303004402643869, + "learning_rate": 3.2451788453605227e-06, + "loss": 0.8204, + "num_tokens": 58787953214.0, + "step": 14065 + }, + { + "epoch": 1.6715389185977423, + "grad_norm": 0.24547028506554222, + "learning_rate": 3.2443000019680404e-06, + "loss": 0.8203, + "num_tokens": 58792137837.0, + "step": 14066 + }, + { + "epoch": 1.6716577540106952, + "grad_norm": 0.2357922678543033, + "learning_rate": 3.2434214457953524e-06, + "loss": 0.8065, + "num_tokens": 58796308029.0, + "step": 14067 + }, + { + "epoch": 1.6717765894236483, + "grad_norm": 0.2538030809486898, + "learning_rate": 3.242543176874994e-06, + "loss": 0.8474, + "num_tokens": 58800498439.0, + "step": 14068 + }, + { + "epoch": 1.6718954248366014, + "grad_norm": 0.25188732760073224, + "learning_rate": 3.24166519523949e-06, + "loss": 0.7885, + "num_tokens": 58804680149.0, + "step": 14069 + }, + { + "epoch": 1.6720142602495542, + "grad_norm": 0.23362989032612563, + "learning_rate": 3.2407875009213545e-06, + "loss": 0.7998, + "num_tokens": 58808833977.0, + "step": 14070 + }, + { + "epoch": 1.6721330956625073, + "grad_norm": 0.2513524191707613, + "learning_rate": 3.2399100939530944e-06, + "loss": 0.7857, + "num_tokens": 58812972107.0, + "step": 14071 + }, + { + "epoch": 1.6722519310754604, + "grad_norm": 0.2459188319617388, + "learning_rate": 3.2390329743671996e-06, + "loss": 0.8011, + "num_tokens": 58817159888.0, + "step": 14072 + }, + { + "epoch": 1.6723707664884135, + "grad_norm": 0.2367971465140331, + "learning_rate": 3.238156142196156e-06, + "loss": 0.7946, + "num_tokens": 58821331159.0, + "step": 14073 + }, + { + "epoch": 1.6724896019013666, + "grad_norm": 0.23219283396398285, + "learning_rate": 3.2372795974724325e-06, + "loss": 0.8426, + "num_tokens": 58825506414.0, + "step": 14074 + }, + { + "epoch": 1.6726084373143197, + "grad_norm": 0.25029917599845075, + "learning_rate": 3.2364033402284896e-06, + "loss": 0.7933, + "num_tokens": 58829685775.0, + "step": 14075 + }, + { + "epoch": 1.6727272727272728, + "grad_norm": 0.22515555194970716, + "learning_rate": 3.235527370496782e-06, + "loss": 0.7922, + "num_tokens": 58833875771.0, + "step": 14076 + }, + { + "epoch": 1.672846108140226, + "grad_norm": 0.24585683283198997, + "learning_rate": 3.2346516883097496e-06, + "loss": 0.8459, + "num_tokens": 58838062856.0, + "step": 14077 + }, + { + "epoch": 1.6729649435531788, + "grad_norm": 0.24725984816533012, + "learning_rate": 3.233776293699821e-06, + "loss": 0.832, + "num_tokens": 58842252115.0, + "step": 14078 + }, + { + "epoch": 1.673083778966132, + "grad_norm": 0.29397772945249606, + "learning_rate": 3.2329011866994127e-06, + "loss": 0.76, + "num_tokens": 58846423719.0, + "step": 14079 + }, + { + "epoch": 1.673202614379085, + "grad_norm": 0.25710245437919765, + "learning_rate": 3.2320263673409348e-06, + "loss": 0.7835, + "num_tokens": 58850588645.0, + "step": 14080 + }, + { + "epoch": 1.673321449792038, + "grad_norm": 0.264783507187646, + "learning_rate": 3.231151835656783e-06, + "loss": 0.8199, + "num_tokens": 58854747469.0, + "step": 14081 + }, + { + "epoch": 1.673440285204991, + "grad_norm": 0.2500853999244349, + "learning_rate": 3.230277591679348e-06, + "loss": 0.8185, + "num_tokens": 58858936579.0, + "step": 14082 + }, + { + "epoch": 1.673559120617944, + "grad_norm": 0.23499995367476484, + "learning_rate": 3.2294036354410053e-06, + "loss": 0.7984, + "num_tokens": 58863125891.0, + "step": 14083 + }, + { + "epoch": 1.6736779560308972, + "grad_norm": 0.27187924752250836, + "learning_rate": 3.228529966974116e-06, + "loss": 0.7949, + "num_tokens": 58867297728.0, + "step": 14084 + }, + { + "epoch": 1.6737967914438503, + "grad_norm": 0.223660460079589, + "learning_rate": 3.2276565863110397e-06, + "loss": 0.8219, + "num_tokens": 58871486610.0, + "step": 14085 + }, + { + "epoch": 1.6739156268568034, + "grad_norm": 0.24649883525022526, + "learning_rate": 3.226783493484118e-06, + "loss": 0.8098, + "num_tokens": 58875675479.0, + "step": 14086 + }, + { + "epoch": 1.6740344622697565, + "grad_norm": 0.24172846022065927, + "learning_rate": 3.2259106885256836e-06, + "loss": 0.826, + "num_tokens": 58879864878.0, + "step": 14087 + }, + { + "epoch": 1.6741532976827096, + "grad_norm": 0.2375357729694787, + "learning_rate": 3.2250381714680654e-06, + "loss": 0.7892, + "num_tokens": 58884021213.0, + "step": 14088 + }, + { + "epoch": 1.6742721330956625, + "grad_norm": 0.2512897829211436, + "learning_rate": 3.22416594234357e-06, + "loss": 0.8134, + "num_tokens": 58888184118.0, + "step": 14089 + }, + { + "epoch": 1.6743909685086156, + "grad_norm": 0.20775189798342722, + "learning_rate": 3.2232940011844993e-06, + "loss": 0.741, + "num_tokens": 58892373099.0, + "step": 14090 + }, + { + "epoch": 1.6745098039215687, + "grad_norm": 0.24174642530647147, + "learning_rate": 3.222422348023146e-06, + "loss": 0.8211, + "num_tokens": 58896561572.0, + "step": 14091 + }, + { + "epoch": 1.6746286393345216, + "grad_norm": 0.2266137204084177, + "learning_rate": 3.2215509828917894e-06, + "loss": 0.8541, + "num_tokens": 58900749737.0, + "step": 14092 + }, + { + "epoch": 1.6747474747474747, + "grad_norm": 0.23207375303638086, + "learning_rate": 3.2206799058227004e-06, + "loss": 0.8342, + "num_tokens": 58904936590.0, + "step": 14093 + }, + { + "epoch": 1.6748663101604278, + "grad_norm": 0.22730613252680845, + "learning_rate": 3.2198091168481367e-06, + "loss": 0.7986, + "num_tokens": 58909099825.0, + "step": 14094 + }, + { + "epoch": 1.6749851455733809, + "grad_norm": 0.2353356488364873, + "learning_rate": 3.2189386160003465e-06, + "loss": 0.8319, + "num_tokens": 58913258431.0, + "step": 14095 + }, + { + "epoch": 1.675103980986334, + "grad_norm": 0.23215471601223725, + "learning_rate": 3.218068403311567e-06, + "loss": 0.8222, + "num_tokens": 58917412733.0, + "step": 14096 + }, + { + "epoch": 1.675222816399287, + "grad_norm": 0.24264591316928713, + "learning_rate": 3.2171984788140263e-06, + "loss": 0.8075, + "num_tokens": 58921601621.0, + "step": 14097 + }, + { + "epoch": 1.6753416518122402, + "grad_norm": 0.22726934678868263, + "learning_rate": 3.2163288425399414e-06, + "loss": 0.8079, + "num_tokens": 58925789501.0, + "step": 14098 + }, + { + "epoch": 1.6754604872251933, + "grad_norm": 0.2225234518367539, + "learning_rate": 3.215459494521517e-06, + "loss": 0.7881, + "num_tokens": 58929970489.0, + "step": 14099 + }, + { + "epoch": 1.6755793226381461, + "grad_norm": 0.22074129365576114, + "learning_rate": 3.2145904347909453e-06, + "loss": 0.7745, + "num_tokens": 58934130661.0, + "step": 14100 + }, + { + "epoch": 1.6756981580510992, + "grad_norm": 0.2166270713525023, + "learning_rate": 3.2137216633804145e-06, + "loss": 0.8259, + "num_tokens": 58938302381.0, + "step": 14101 + }, + { + "epoch": 1.6758169934640523, + "grad_norm": 0.23186306629741496, + "learning_rate": 3.2128531803220963e-06, + "loss": 0.7878, + "num_tokens": 58942465197.0, + "step": 14102 + }, + { + "epoch": 1.6759358288770052, + "grad_norm": 0.21845578808224425, + "learning_rate": 3.2119849856481534e-06, + "loss": 0.8174, + "num_tokens": 58946649048.0, + "step": 14103 + }, + { + "epoch": 1.6760546642899583, + "grad_norm": 0.226676188001056, + "learning_rate": 3.211117079390741e-06, + "loss": 0.8215, + "num_tokens": 58950837630.0, + "step": 14104 + }, + { + "epoch": 1.6761734997029114, + "grad_norm": 0.23561739670810009, + "learning_rate": 3.210249461581995e-06, + "loss": 0.7552, + "num_tokens": 58955027439.0, + "step": 14105 + }, + { + "epoch": 1.6762923351158645, + "grad_norm": 0.21802200393083904, + "learning_rate": 3.209382132254048e-06, + "loss": 0.794, + "num_tokens": 58959216009.0, + "step": 14106 + }, + { + "epoch": 1.6764111705288176, + "grad_norm": 0.22699679762961264, + "learning_rate": 3.2085150914390223e-06, + "loss": 0.7935, + "num_tokens": 58963404999.0, + "step": 14107 + }, + { + "epoch": 1.6765300059417707, + "grad_norm": 0.22708904079477713, + "learning_rate": 3.2076483391690277e-06, + "loss": 0.8359, + "num_tokens": 58967591811.0, + "step": 14108 + }, + { + "epoch": 1.6766488413547238, + "grad_norm": 0.22380622666869526, + "learning_rate": 3.2067818754761633e-06, + "loss": 0.8197, + "num_tokens": 58971772311.0, + "step": 14109 + }, + { + "epoch": 1.676767676767677, + "grad_norm": 0.21855844003941133, + "learning_rate": 3.205915700392514e-06, + "loss": 0.7959, + "num_tokens": 58975960860.0, + "step": 14110 + }, + { + "epoch": 1.6768865121806298, + "grad_norm": 0.23333380645533405, + "learning_rate": 3.205049813950158e-06, + "loss": 0.8296, + "num_tokens": 58980150335.0, + "step": 14111 + }, + { + "epoch": 1.6770053475935829, + "grad_norm": 0.2356889536952373, + "learning_rate": 3.2041842161811626e-06, + "loss": 0.7981, + "num_tokens": 58984302782.0, + "step": 14112 + }, + { + "epoch": 1.677124183006536, + "grad_norm": 0.218717731981197, + "learning_rate": 3.2033189071175825e-06, + "loss": 0.8182, + "num_tokens": 58988463427.0, + "step": 14113 + }, + { + "epoch": 1.6772430184194889, + "grad_norm": 0.22341195140887896, + "learning_rate": 3.2024538867914676e-06, + "loss": 0.8239, + "num_tokens": 58992651513.0, + "step": 14114 + }, + { + "epoch": 1.677361853832442, + "grad_norm": 0.23629793511032676, + "learning_rate": 3.201589155234849e-06, + "loss": 0.778, + "num_tokens": 58996841182.0, + "step": 14115 + }, + { + "epoch": 1.677480689245395, + "grad_norm": 0.23014545772464726, + "learning_rate": 3.2007247124797493e-06, + "loss": 0.8105, + "num_tokens": 59001025242.0, + "step": 14116 + }, + { + "epoch": 1.6775995246583482, + "grad_norm": 0.23516386334588152, + "learning_rate": 3.199860558558185e-06, + "loss": 0.8147, + "num_tokens": 59005194449.0, + "step": 14117 + }, + { + "epoch": 1.6777183600713013, + "grad_norm": 0.24104204441685667, + "learning_rate": 3.198996693502156e-06, + "loss": 0.8075, + "num_tokens": 59009384726.0, + "step": 14118 + }, + { + "epoch": 1.6778371954842544, + "grad_norm": 0.2359946317065624, + "learning_rate": 3.1981331173436558e-06, + "loss": 0.8037, + "num_tokens": 59013574330.0, + "step": 14119 + }, + { + "epoch": 1.6779560308972075, + "grad_norm": 0.2514820756659144, + "learning_rate": 3.1972698301146644e-06, + "loss": 0.8088, + "num_tokens": 59017753310.0, + "step": 14120 + }, + { + "epoch": 1.6780748663101606, + "grad_norm": 0.2499949498002487, + "learning_rate": 3.1964068318471536e-06, + "loss": 0.8175, + "num_tokens": 59021927309.0, + "step": 14121 + }, + { + "epoch": 1.6781937017231134, + "grad_norm": 0.23571467167524446, + "learning_rate": 3.195544122573082e-06, + "loss": 0.7756, + "num_tokens": 59026117415.0, + "step": 14122 + }, + { + "epoch": 1.6783125371360665, + "grad_norm": 0.2196541300698484, + "learning_rate": 3.1946817023243996e-06, + "loss": 0.8019, + "num_tokens": 59030306799.0, + "step": 14123 + }, + { + "epoch": 1.6784313725490196, + "grad_norm": 0.23556782809632396, + "learning_rate": 3.1938195711330437e-06, + "loss": 0.7726, + "num_tokens": 59034495746.0, + "step": 14124 + }, + { + "epoch": 1.6785502079619725, + "grad_norm": 0.23934374040648076, + "learning_rate": 3.1929577290309443e-06, + "loss": 0.8133, + "num_tokens": 59038678269.0, + "step": 14125 + }, + { + "epoch": 1.6786690433749256, + "grad_norm": 0.2346486962084237, + "learning_rate": 3.1920961760500126e-06, + "loss": 0.794, + "num_tokens": 59042866362.0, + "step": 14126 + }, + { + "epoch": 1.6787878787878787, + "grad_norm": 0.22757960152662454, + "learning_rate": 3.1912349122221607e-06, + "loss": 0.7813, + "num_tokens": 59047056498.0, + "step": 14127 + }, + { + "epoch": 1.6789067142008318, + "grad_norm": 0.22920165566418282, + "learning_rate": 3.190373937579282e-06, + "loss": 0.7938, + "num_tokens": 59051215657.0, + "step": 14128 + }, + { + "epoch": 1.679025549613785, + "grad_norm": 0.25075434000166535, + "learning_rate": 3.189513252153261e-06, + "loss": 0.7762, + "num_tokens": 59055405619.0, + "step": 14129 + }, + { + "epoch": 1.679144385026738, + "grad_norm": 0.23234983560538983, + "learning_rate": 3.1886528559759745e-06, + "loss": 0.841, + "num_tokens": 59059592824.0, + "step": 14130 + }, + { + "epoch": 1.6792632204396911, + "grad_norm": 0.2322544754526809, + "learning_rate": 3.1877927490792815e-06, + "loss": 0.8157, + "num_tokens": 59063781503.0, + "step": 14131 + }, + { + "epoch": 1.6793820558526442, + "grad_norm": 0.25033381176533964, + "learning_rate": 3.186932931495034e-06, + "loss": 0.7993, + "num_tokens": 59067970267.0, + "step": 14132 + }, + { + "epoch": 1.679500891265597, + "grad_norm": 0.22394909083738926, + "learning_rate": 3.1860734032550794e-06, + "loss": 0.8416, + "num_tokens": 59072109030.0, + "step": 14133 + }, + { + "epoch": 1.6796197266785502, + "grad_norm": 0.2379737203364285, + "learning_rate": 3.185214164391247e-06, + "loss": 0.8123, + "num_tokens": 59076296150.0, + "step": 14134 + }, + { + "epoch": 1.6797385620915033, + "grad_norm": 0.2166036190154, + "learning_rate": 3.1843552149353572e-06, + "loss": 0.7913, + "num_tokens": 59080485051.0, + "step": 14135 + }, + { + "epoch": 1.6798573975044562, + "grad_norm": 0.25089721131424847, + "learning_rate": 3.1834965549192176e-06, + "loss": 0.7762, + "num_tokens": 59084662535.0, + "step": 14136 + }, + { + "epoch": 1.6799762329174093, + "grad_norm": 0.21409830767263616, + "learning_rate": 3.1826381843746285e-06, + "loss": 0.7841, + "num_tokens": 59088822759.0, + "step": 14137 + }, + { + "epoch": 1.6800950683303624, + "grad_norm": 0.22233917153851346, + "learning_rate": 3.1817801033333766e-06, + "loss": 0.7842, + "num_tokens": 59093009220.0, + "step": 14138 + }, + { + "epoch": 1.6802139037433155, + "grad_norm": 0.22389235081690784, + "learning_rate": 3.1809223118272438e-06, + "loss": 0.7874, + "num_tokens": 59097189486.0, + "step": 14139 + }, + { + "epoch": 1.6803327391562686, + "grad_norm": 0.23799222487957664, + "learning_rate": 3.180064809887997e-06, + "loss": 0.8315, + "num_tokens": 59101378766.0, + "step": 14140 + }, + { + "epoch": 1.6804515745692217, + "grad_norm": 0.23648857178302893, + "learning_rate": 3.1792075975473878e-06, + "loss": 0.7699, + "num_tokens": 59105517623.0, + "step": 14141 + }, + { + "epoch": 1.6805704099821748, + "grad_norm": 0.23068846686852343, + "learning_rate": 3.1783506748371633e-06, + "loss": 0.7952, + "num_tokens": 59109696275.0, + "step": 14142 + }, + { + "epoch": 1.6806892453951279, + "grad_norm": 0.22944104846187485, + "learning_rate": 3.17749404178906e-06, + "loss": 0.7604, + "num_tokens": 59113886577.0, + "step": 14143 + }, + { + "epoch": 1.680808080808081, + "grad_norm": 0.22997221426970924, + "learning_rate": 3.1766376984348006e-06, + "loss": 0.7853, + "num_tokens": 59118074753.0, + "step": 14144 + }, + { + "epoch": 1.6809269162210339, + "grad_norm": 0.23378586334345133, + "learning_rate": 3.1757816448060984e-06, + "loss": 0.778, + "num_tokens": 59122264390.0, + "step": 14145 + }, + { + "epoch": 1.681045751633987, + "grad_norm": 0.22508498132387425, + "learning_rate": 3.174925880934656e-06, + "loss": 0.8172, + "num_tokens": 59126453480.0, + "step": 14146 + }, + { + "epoch": 1.6811645870469398, + "grad_norm": 0.23915283892402847, + "learning_rate": 3.1740704068521662e-06, + "loss": 0.833, + "num_tokens": 59130640655.0, + "step": 14147 + }, + { + "epoch": 1.681283422459893, + "grad_norm": 0.23821894829321827, + "learning_rate": 3.173215222590309e-06, + "loss": 0.8238, + "num_tokens": 59134830084.0, + "step": 14148 + }, + { + "epoch": 1.681402257872846, + "grad_norm": 0.2421489208530218, + "learning_rate": 3.1723603281807547e-06, + "loss": 0.7462, + "num_tokens": 59139019052.0, + "step": 14149 + }, + { + "epoch": 1.6815210932857991, + "grad_norm": 0.23902247170787663, + "learning_rate": 3.171505723655163e-06, + "loss": 0.8267, + "num_tokens": 59143182532.0, + "step": 14150 + }, + { + "epoch": 1.6816399286987522, + "grad_norm": 0.2607933701078449, + "learning_rate": 3.1706514090451843e-06, + "loss": 0.817, + "num_tokens": 59147358847.0, + "step": 14151 + }, + { + "epoch": 1.6817587641117053, + "grad_norm": 0.23589817109236383, + "learning_rate": 3.169797384382455e-06, + "loss": 0.8101, + "num_tokens": 59151548466.0, + "step": 14152 + }, + { + "epoch": 1.6818775995246584, + "grad_norm": 0.24635277166298203, + "learning_rate": 3.1689436496986024e-06, + "loss": 0.7907, + "num_tokens": 59155732471.0, + "step": 14153 + }, + { + "epoch": 1.6819964349376115, + "grad_norm": 0.2176838170335228, + "learning_rate": 3.1680902050252447e-06, + "loss": 0.8256, + "num_tokens": 59159904517.0, + "step": 14154 + }, + { + "epoch": 1.6821152703505646, + "grad_norm": 0.2274721079836709, + "learning_rate": 3.167237050393987e-06, + "loss": 0.8165, + "num_tokens": 59164094598.0, + "step": 14155 + }, + { + "epoch": 1.6822341057635175, + "grad_norm": 0.2511588133241586, + "learning_rate": 3.1663841858364264e-06, + "loss": 0.8249, + "num_tokens": 59168269466.0, + "step": 14156 + }, + { + "epoch": 1.6823529411764706, + "grad_norm": 0.21964370725429325, + "learning_rate": 3.1655316113841413e-06, + "loss": 0.7861, + "num_tokens": 59172458077.0, + "step": 14157 + }, + { + "epoch": 1.6824717765894235, + "grad_norm": 0.23012461699089043, + "learning_rate": 3.164679327068712e-06, + "loss": 0.7833, + "num_tokens": 59176646927.0, + "step": 14158 + }, + { + "epoch": 1.6825906120023766, + "grad_norm": 0.24195586266644623, + "learning_rate": 3.163827332921699e-06, + "loss": 0.815, + "num_tokens": 59180835254.0, + "step": 14159 + }, + { + "epoch": 1.6827094474153297, + "grad_norm": 0.256668053488972, + "learning_rate": 3.162975628974654e-06, + "loss": 0.8157, + "num_tokens": 59185023781.0, + "step": 14160 + }, + { + "epoch": 1.6828282828282828, + "grad_norm": 0.2209097194497057, + "learning_rate": 3.1621242152591214e-06, + "loss": 0.7977, + "num_tokens": 59189183834.0, + "step": 14161 + }, + { + "epoch": 1.6829471182412359, + "grad_norm": 0.2299517872740446, + "learning_rate": 3.1612730918066282e-06, + "loss": 0.8037, + "num_tokens": 59193343902.0, + "step": 14162 + }, + { + "epoch": 1.683065953654189, + "grad_norm": 0.24595376543133748, + "learning_rate": 3.160422258648695e-06, + "loss": 0.7998, + "num_tokens": 59197533955.0, + "step": 14163 + }, + { + "epoch": 1.683184789067142, + "grad_norm": 0.22139101403987924, + "learning_rate": 3.15957171581683e-06, + "loss": 0.8199, + "num_tokens": 59201721598.0, + "step": 14164 + }, + { + "epoch": 1.6833036244800952, + "grad_norm": 0.2234686747139048, + "learning_rate": 3.1587214633425354e-06, + "loss": 0.8013, + "num_tokens": 59205882780.0, + "step": 14165 + }, + { + "epoch": 1.6834224598930483, + "grad_norm": 0.24825540157505654, + "learning_rate": 3.157871501257299e-06, + "loss": 0.7878, + "num_tokens": 59210048832.0, + "step": 14166 + }, + { + "epoch": 1.6835412953060012, + "grad_norm": 0.23049825066331306, + "learning_rate": 3.1570218295925935e-06, + "loss": 0.8305, + "num_tokens": 59214229781.0, + "step": 14167 + }, + { + "epoch": 1.6836601307189543, + "grad_norm": 0.22275718711474551, + "learning_rate": 3.1561724483798883e-06, + "loss": 0.8275, + "num_tokens": 59218392592.0, + "step": 14168 + }, + { + "epoch": 1.6837789661319074, + "grad_norm": 0.22720033360546502, + "learning_rate": 3.1553233576506378e-06, + "loss": 0.8064, + "num_tokens": 59222581674.0, + "step": 14169 + }, + { + "epoch": 1.6838978015448602, + "grad_norm": 0.22642109392401835, + "learning_rate": 3.1544745574362838e-06, + "loss": 0.8217, + "num_tokens": 59226763365.0, + "step": 14170 + }, + { + "epoch": 1.6840166369578133, + "grad_norm": 0.2100252235067669, + "learning_rate": 3.1536260477682685e-06, + "loss": 0.7964, + "num_tokens": 59230904058.0, + "step": 14171 + }, + { + "epoch": 1.6841354723707664, + "grad_norm": 0.22936700364818888, + "learning_rate": 3.1527778286780077e-06, + "loss": 0.8393, + "num_tokens": 59235093187.0, + "step": 14172 + }, + { + "epoch": 1.6842543077837195, + "grad_norm": 0.22049586261022222, + "learning_rate": 3.151929900196915e-06, + "loss": 0.8041, + "num_tokens": 59239278459.0, + "step": 14173 + }, + { + "epoch": 1.6843731431966726, + "grad_norm": 0.22358214035788507, + "learning_rate": 3.151082262356394e-06, + "loss": 0.8098, + "num_tokens": 59243466220.0, + "step": 14174 + }, + { + "epoch": 1.6844919786096257, + "grad_norm": 0.23614736730046668, + "learning_rate": 3.1502349151878347e-06, + "loss": 0.8189, + "num_tokens": 59247655763.0, + "step": 14175 + }, + { + "epoch": 1.6846108140225788, + "grad_norm": 0.23412203657529798, + "learning_rate": 3.1493878587226157e-06, + "loss": 0.7948, + "num_tokens": 59251844068.0, + "step": 14176 + }, + { + "epoch": 1.684729649435532, + "grad_norm": 0.22788430200638074, + "learning_rate": 3.1485410929921094e-06, + "loss": 0.7945, + "num_tokens": 59256011328.0, + "step": 14177 + }, + { + "epoch": 1.6848484848484848, + "grad_norm": 0.23057675161827149, + "learning_rate": 3.1476946180276723e-06, + "loss": 0.7807, + "num_tokens": 59260200457.0, + "step": 14178 + }, + { + "epoch": 1.684967320261438, + "grad_norm": 0.22689395475380905, + "learning_rate": 3.1468484338606536e-06, + "loss": 0.8147, + "num_tokens": 59264389949.0, + "step": 14179 + }, + { + "epoch": 1.685086155674391, + "grad_norm": 0.22685373938680536, + "learning_rate": 3.146002540522389e-06, + "loss": 0.816, + "num_tokens": 59268578677.0, + "step": 14180 + }, + { + "epoch": 1.685204991087344, + "grad_norm": 0.24210537212959793, + "learning_rate": 3.1451569380442048e-06, + "loss": 0.8198, + "num_tokens": 59272765324.0, + "step": 14181 + }, + { + "epoch": 1.685323826500297, + "grad_norm": 0.23625002061702835, + "learning_rate": 3.144311626457417e-06, + "loss": 0.808, + "num_tokens": 59276952912.0, + "step": 14182 + }, + { + "epoch": 1.68544266191325, + "grad_norm": 0.22786509916942307, + "learning_rate": 3.1434666057933306e-06, + "loss": 0.8391, + "num_tokens": 59281141964.0, + "step": 14183 + }, + { + "epoch": 1.6855614973262032, + "grad_norm": 0.23121000651174095, + "learning_rate": 3.1426218760832396e-06, + "loss": 0.8103, + "num_tokens": 59285305802.0, + "step": 14184 + }, + { + "epoch": 1.6856803327391563, + "grad_norm": 0.23522519001456496, + "learning_rate": 3.1417774373584266e-06, + "loss": 0.8013, + "num_tokens": 59289445566.0, + "step": 14185 + }, + { + "epoch": 1.6857991681521094, + "grad_norm": 0.2328763930407666, + "learning_rate": 3.1409332896501654e-06, + "loss": 0.8122, + "num_tokens": 59293608226.0, + "step": 14186 + }, + { + "epoch": 1.6859180035650625, + "grad_norm": 0.24512486178285586, + "learning_rate": 3.1400894329897157e-06, + "loss": 0.7932, + "num_tokens": 59297795167.0, + "step": 14187 + }, + { + "epoch": 1.6860368389780156, + "grad_norm": 0.22461068858119676, + "learning_rate": 3.139245867408331e-06, + "loss": 0.8239, + "num_tokens": 59301970142.0, + "step": 14188 + }, + { + "epoch": 1.6861556743909685, + "grad_norm": 0.2453083459732051, + "learning_rate": 3.138402592937246e-06, + "loss": 0.8073, + "num_tokens": 59306158949.0, + "step": 14189 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 0.2243640337274897, + "learning_rate": 3.1375596096076955e-06, + "loss": 0.8152, + "num_tokens": 59310349032.0, + "step": 14190 + }, + { + "epoch": 1.6863933452168747, + "grad_norm": 0.24549181141466725, + "learning_rate": 3.136716917450896e-06, + "loss": 0.8104, + "num_tokens": 59314528396.0, + "step": 14191 + }, + { + "epoch": 1.6865121806298276, + "grad_norm": 0.23565331906789605, + "learning_rate": 3.135874516498058e-06, + "loss": 0.7994, + "num_tokens": 59318693946.0, + "step": 14192 + }, + { + "epoch": 1.6866310160427807, + "grad_norm": 0.23408980316610495, + "learning_rate": 3.135032406780374e-06, + "loss": 0.8136, + "num_tokens": 59322883597.0, + "step": 14193 + }, + { + "epoch": 1.6867498514557338, + "grad_norm": 0.22478289758664596, + "learning_rate": 3.1341905883290326e-06, + "loss": 0.8368, + "num_tokens": 59327072533.0, + "step": 14194 + }, + { + "epoch": 1.6868686868686869, + "grad_norm": 0.21953580432656972, + "learning_rate": 3.1333490611752055e-06, + "loss": 0.8613, + "num_tokens": 59331247243.0, + "step": 14195 + }, + { + "epoch": 1.68698752228164, + "grad_norm": 0.24415496529714684, + "learning_rate": 3.132507825350062e-06, + "loss": 0.7789, + "num_tokens": 59335411218.0, + "step": 14196 + }, + { + "epoch": 1.687106357694593, + "grad_norm": 0.23971116655313895, + "learning_rate": 3.1316668808847583e-06, + "loss": 0.802, + "num_tokens": 59339598569.0, + "step": 14197 + }, + { + "epoch": 1.6872251931075462, + "grad_norm": 0.22723287117803787, + "learning_rate": 3.1308262278104302e-06, + "loss": 0.8065, + "num_tokens": 59343760974.0, + "step": 14198 + }, + { + "epoch": 1.6873440285204993, + "grad_norm": 0.24670639145910417, + "learning_rate": 3.1299858661582133e-06, + "loss": 0.8173, + "num_tokens": 59347949448.0, + "step": 14199 + }, + { + "epoch": 1.6874628639334521, + "grad_norm": 0.24372295499090343, + "learning_rate": 3.1291457959592293e-06, + "loss": 0.8059, + "num_tokens": 59352094251.0, + "step": 14200 + }, + { + "epoch": 1.6875816993464052, + "grad_norm": 0.24448542963947423, + "learning_rate": 3.128306017244588e-06, + "loss": 0.7873, + "num_tokens": 59356282020.0, + "step": 14201 + }, + { + "epoch": 1.6877005347593583, + "grad_norm": 0.23213267214099445, + "learning_rate": 3.12746653004539e-06, + "loss": 0.7879, + "num_tokens": 59360460998.0, + "step": 14202 + }, + { + "epoch": 1.6878193701723112, + "grad_norm": 0.22493843047433332, + "learning_rate": 3.1266273343927235e-06, + "loss": 0.7918, + "num_tokens": 59364650498.0, + "step": 14203 + }, + { + "epoch": 1.6879382055852643, + "grad_norm": 0.23407004034197204, + "learning_rate": 3.125788430317668e-06, + "loss": 0.8003, + "num_tokens": 59368818312.0, + "step": 14204 + }, + { + "epoch": 1.6880570409982174, + "grad_norm": 0.24279353365285222, + "learning_rate": 3.12494981785129e-06, + "loss": 0.7926, + "num_tokens": 59372990038.0, + "step": 14205 + }, + { + "epoch": 1.6881758764111705, + "grad_norm": 0.21603066970991475, + "learning_rate": 3.1241114970246457e-06, + "loss": 0.7999, + "num_tokens": 59377159125.0, + "step": 14206 + }, + { + "epoch": 1.6882947118241236, + "grad_norm": 0.2312165020192884, + "learning_rate": 3.123273467868782e-06, + "loss": 0.8257, + "num_tokens": 59381322593.0, + "step": 14207 + }, + { + "epoch": 1.6884135472370767, + "grad_norm": 0.23019409056248855, + "learning_rate": 3.1224357304147346e-06, + "loss": 0.7902, + "num_tokens": 59385513378.0, + "step": 14208 + }, + { + "epoch": 1.6885323826500298, + "grad_norm": 0.2228759822443104, + "learning_rate": 3.121598284693526e-06, + "loss": 0.825, + "num_tokens": 59389702320.0, + "step": 14209 + }, + { + "epoch": 1.688651218062983, + "grad_norm": 0.24278155667310353, + "learning_rate": 3.1207611307361712e-06, + "loss": 0.8494, + "num_tokens": 59393863000.0, + "step": 14210 + }, + { + "epoch": 1.6887700534759358, + "grad_norm": 0.21899060030251413, + "learning_rate": 3.119924268573672e-06, + "loss": 0.7837, + "num_tokens": 59398035614.0, + "step": 14211 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 0.22649105415782006, + "learning_rate": 3.1190876982370184e-06, + "loss": 0.796, + "num_tokens": 59402218411.0, + "step": 14212 + }, + { + "epoch": 1.689007724301842, + "grad_norm": 0.22516395242018383, + "learning_rate": 3.118251419757195e-06, + "loss": 0.8005, + "num_tokens": 59406407103.0, + "step": 14213 + }, + { + "epoch": 1.6891265597147949, + "grad_norm": 0.2306953008618927, + "learning_rate": 3.1174154331651722e-06, + "loss": 0.8022, + "num_tokens": 59410595499.0, + "step": 14214 + }, + { + "epoch": 1.689245395127748, + "grad_norm": 0.23996776390723842, + "learning_rate": 3.116579738491903e-06, + "loss": 0.8047, + "num_tokens": 59414784506.0, + "step": 14215 + }, + { + "epoch": 1.689364230540701, + "grad_norm": 0.2281788747464749, + "learning_rate": 3.1157443357683432e-06, + "loss": 0.7912, + "num_tokens": 59418974733.0, + "step": 14216 + }, + { + "epoch": 1.6894830659536542, + "grad_norm": 0.22695430824966986, + "learning_rate": 3.1149092250254274e-06, + "loss": 0.8611, + "num_tokens": 59423131932.0, + "step": 14217 + }, + { + "epoch": 1.6896019013666073, + "grad_norm": 0.22801068666057328, + "learning_rate": 3.114074406294084e-06, + "loss": 0.7871, + "num_tokens": 59427295832.0, + "step": 14218 + }, + { + "epoch": 1.6897207367795604, + "grad_norm": 0.22745476596146236, + "learning_rate": 3.11323987960523e-06, + "loss": 0.8174, + "num_tokens": 59431485225.0, + "step": 14219 + }, + { + "epoch": 1.6898395721925135, + "grad_norm": 0.2129443036487527, + "learning_rate": 3.1124056449897667e-06, + "loss": 0.8116, + "num_tokens": 59435674114.0, + "step": 14220 + }, + { + "epoch": 1.6899584076054666, + "grad_norm": 0.23561000767930873, + "learning_rate": 3.1115717024785885e-06, + "loss": 0.7921, + "num_tokens": 59439863526.0, + "step": 14221 + }, + { + "epoch": 1.6900772430184194, + "grad_norm": 0.2287358378641536, + "learning_rate": 3.1107380521025843e-06, + "loss": 0.7481, + "num_tokens": 59444053197.0, + "step": 14222 + }, + { + "epoch": 1.6901960784313725, + "grad_norm": 0.23603253036271407, + "learning_rate": 3.109904693892624e-06, + "loss": 0.7748, + "num_tokens": 59448241789.0, + "step": 14223 + }, + { + "epoch": 1.6903149138443256, + "grad_norm": 0.23010824913347755, + "learning_rate": 3.109071627879572e-06, + "loss": 0.7812, + "num_tokens": 59452431773.0, + "step": 14224 + }, + { + "epoch": 1.6904337492572785, + "grad_norm": 0.22438679231920478, + "learning_rate": 3.1082388540942755e-06, + "loss": 0.7795, + "num_tokens": 59456622991.0, + "step": 14225 + }, + { + "epoch": 1.6905525846702316, + "grad_norm": 0.21843161265616778, + "learning_rate": 3.107406372567577e-06, + "loss": 0.8141, + "num_tokens": 59460810920.0, + "step": 14226 + }, + { + "epoch": 1.6906714200831847, + "grad_norm": 0.232086176493314, + "learning_rate": 3.1065741833303036e-06, + "loss": 0.814, + "num_tokens": 59464998583.0, + "step": 14227 + }, + { + "epoch": 1.6907902554961378, + "grad_norm": 0.2208374585638611, + "learning_rate": 3.1057422864132793e-06, + "loss": 0.7849, + "num_tokens": 59469189626.0, + "step": 14228 + }, + { + "epoch": 1.690909090909091, + "grad_norm": 0.22421351081281624, + "learning_rate": 3.10491068184731e-06, + "loss": 0.8013, + "num_tokens": 59473380013.0, + "step": 14229 + }, + { + "epoch": 1.691027926322044, + "grad_norm": 0.229734866855992, + "learning_rate": 3.1040793696631917e-06, + "loss": 0.7967, + "num_tokens": 59477569005.0, + "step": 14230 + }, + { + "epoch": 1.6911467617349971, + "grad_norm": 0.228952507630442, + "learning_rate": 3.10324834989171e-06, + "loss": 0.8225, + "num_tokens": 59481759048.0, + "step": 14231 + }, + { + "epoch": 1.6912655971479502, + "grad_norm": 0.24008986372380503, + "learning_rate": 3.1024176225636415e-06, + "loss": 0.8127, + "num_tokens": 59485896635.0, + "step": 14232 + }, + { + "epoch": 1.691384432560903, + "grad_norm": 0.2344885738724956, + "learning_rate": 3.1015871877097504e-06, + "loss": 0.8086, + "num_tokens": 59490069609.0, + "step": 14233 + }, + { + "epoch": 1.6915032679738562, + "grad_norm": 0.23390150701399112, + "learning_rate": 3.100757045360791e-06, + "loss": 0.8075, + "num_tokens": 59494258152.0, + "step": 14234 + }, + { + "epoch": 1.6916221033868093, + "grad_norm": 0.22348524968194258, + "learning_rate": 3.0999271955475067e-06, + "loss": 0.7582, + "num_tokens": 59498443850.0, + "step": 14235 + }, + { + "epoch": 1.6917409387997622, + "grad_norm": 0.22377587125171625, + "learning_rate": 3.0990976383006296e-06, + "loss": 0.8352, + "num_tokens": 59502621019.0, + "step": 14236 + }, + { + "epoch": 1.6918597742127153, + "grad_norm": 0.22560936267903692, + "learning_rate": 3.09826837365088e-06, + "loss": 0.7898, + "num_tokens": 59506811122.0, + "step": 14237 + }, + { + "epoch": 1.6919786096256684, + "grad_norm": 0.2226810872180947, + "learning_rate": 3.097439401628969e-06, + "loss": 0.8213, + "num_tokens": 59511000321.0, + "step": 14238 + }, + { + "epoch": 1.6920974450386215, + "grad_norm": 0.24724723915986196, + "learning_rate": 3.0966107222655955e-06, + "loss": 0.7812, + "num_tokens": 59515181778.0, + "step": 14239 + }, + { + "epoch": 1.6922162804515746, + "grad_norm": 0.2284088692339139, + "learning_rate": 3.09578233559145e-06, + "loss": 0.7851, + "num_tokens": 59519371048.0, + "step": 14240 + }, + { + "epoch": 1.6923351158645277, + "grad_norm": 0.23362174103972316, + "learning_rate": 3.0949542416372093e-06, + "loss": 0.7943, + "num_tokens": 59523560149.0, + "step": 14241 + }, + { + "epoch": 1.6924539512774808, + "grad_norm": 0.22105561046302447, + "learning_rate": 3.0941264404335393e-06, + "loss": 0.7759, + "num_tokens": 59527707146.0, + "step": 14242 + }, + { + "epoch": 1.6925727866904339, + "grad_norm": 0.23963412247236324, + "learning_rate": 3.0932989320110996e-06, + "loss": 0.8057, + "num_tokens": 59531827653.0, + "step": 14243 + }, + { + "epoch": 1.692691622103387, + "grad_norm": 0.22832373018400698, + "learning_rate": 3.0924717164005317e-06, + "loss": 0.8061, + "num_tokens": 59536016254.0, + "step": 14244 + }, + { + "epoch": 1.6928104575163399, + "grad_norm": 0.23297143854819366, + "learning_rate": 3.0916447936324756e-06, + "loss": 0.7783, + "num_tokens": 59540205580.0, + "step": 14245 + }, + { + "epoch": 1.692929292929293, + "grad_norm": 0.22356172239052624, + "learning_rate": 3.0908181637375464e-06, + "loss": 0.8079, + "num_tokens": 59544379624.0, + "step": 14246 + }, + { + "epoch": 1.6930481283422458, + "grad_norm": 0.23580377766762534, + "learning_rate": 3.089991826746365e-06, + "loss": 0.7967, + "num_tokens": 59548521012.0, + "step": 14247 + }, + { + "epoch": 1.693166963755199, + "grad_norm": 0.23023478687327548, + "learning_rate": 3.0891657826895303e-06, + "loss": 0.7959, + "num_tokens": 59552701344.0, + "step": 14248 + }, + { + "epoch": 1.693285799168152, + "grad_norm": 0.22872237923133498, + "learning_rate": 3.0883400315976336e-06, + "loss": 0.807, + "num_tokens": 59556883087.0, + "step": 14249 + }, + { + "epoch": 1.6934046345811051, + "grad_norm": 0.24183038658550388, + "learning_rate": 3.0875145735012583e-06, + "loss": 0.816, + "num_tokens": 59561070979.0, + "step": 14250 + }, + { + "epoch": 1.6935234699940582, + "grad_norm": 0.2289334224446041, + "learning_rate": 3.0866894084309684e-06, + "loss": 0.7823, + "num_tokens": 59565259210.0, + "step": 14251 + }, + { + "epoch": 1.6936423054070113, + "grad_norm": 0.23302966098876923, + "learning_rate": 3.0858645364173256e-06, + "loss": 0.8492, + "num_tokens": 59569445490.0, + "step": 14252 + }, + { + "epoch": 1.6937611408199644, + "grad_norm": 0.22862849516467823, + "learning_rate": 3.085039957490875e-06, + "loss": 0.8317, + "num_tokens": 59573627026.0, + "step": 14253 + }, + { + "epoch": 1.6938799762329175, + "grad_norm": 0.247273906719444, + "learning_rate": 3.0842156716821582e-06, + "loss": 0.8182, + "num_tokens": 59577795656.0, + "step": 14254 + }, + { + "epoch": 1.6939988116458706, + "grad_norm": 0.23447257688568235, + "learning_rate": 3.0833916790217e-06, + "loss": 0.8298, + "num_tokens": 59581969586.0, + "step": 14255 + }, + { + "epoch": 1.6941176470588235, + "grad_norm": 0.2271890225051785, + "learning_rate": 3.082567979540014e-06, + "loss": 0.7668, + "num_tokens": 59586158061.0, + "step": 14256 + }, + { + "epoch": 1.6942364824717766, + "grad_norm": 0.22588907336643996, + "learning_rate": 3.0817445732676043e-06, + "loss": 0.7732, + "num_tokens": 59590348618.0, + "step": 14257 + }, + { + "epoch": 1.6943553178847295, + "grad_norm": 0.2362292826347493, + "learning_rate": 3.0809214602349664e-06, + "loss": 0.8169, + "num_tokens": 59594537850.0, + "step": 14258 + }, + { + "epoch": 1.6944741532976826, + "grad_norm": 0.2302927165115029, + "learning_rate": 3.0800986404725797e-06, + "loss": 0.8055, + "num_tokens": 59598694559.0, + "step": 14259 + }, + { + "epoch": 1.6945929887106357, + "grad_norm": 0.23738661797904453, + "learning_rate": 3.079276114010921e-06, + "loss": 0.813, + "num_tokens": 59602845664.0, + "step": 14260 + }, + { + "epoch": 1.6947118241235888, + "grad_norm": 0.22670050231564168, + "learning_rate": 3.0784538808804465e-06, + "loss": 0.7889, + "num_tokens": 59607034598.0, + "step": 14261 + }, + { + "epoch": 1.6948306595365419, + "grad_norm": 0.2312752704208434, + "learning_rate": 3.0776319411116094e-06, + "loss": 0.8408, + "num_tokens": 59611226038.0, + "step": 14262 + }, + { + "epoch": 1.694949494949495, + "grad_norm": 0.2231290064582561, + "learning_rate": 3.076810294734846e-06, + "loss": 0.7518, + "num_tokens": 59615404173.0, + "step": 14263 + }, + { + "epoch": 1.695068330362448, + "grad_norm": 0.25124188167638184, + "learning_rate": 3.0759889417805877e-06, + "loss": 0.797, + "num_tokens": 59619592234.0, + "step": 14264 + }, + { + "epoch": 1.6951871657754012, + "grad_norm": 0.25175835140935005, + "learning_rate": 3.07516788227925e-06, + "loss": 0.801, + "num_tokens": 59623781090.0, + "step": 14265 + }, + { + "epoch": 1.6953060011883543, + "grad_norm": 0.24881421392116634, + "learning_rate": 3.0743471162612404e-06, + "loss": 0.8309, + "num_tokens": 59627970001.0, + "step": 14266 + }, + { + "epoch": 1.6954248366013072, + "grad_norm": 0.23921391680411538, + "learning_rate": 3.0735266437569545e-06, + "loss": 0.7817, + "num_tokens": 59632128920.0, + "step": 14267 + }, + { + "epoch": 1.6955436720142603, + "grad_norm": 0.24742198322289086, + "learning_rate": 3.0727064647967763e-06, + "loss": 0.8125, + "num_tokens": 59636318365.0, + "step": 14268 + }, + { + "epoch": 1.6956625074272134, + "grad_norm": 0.24698526931337206, + "learning_rate": 3.0718865794110806e-06, + "loss": 0.7963, + "num_tokens": 59640506860.0, + "step": 14269 + }, + { + "epoch": 1.6957813428401662, + "grad_norm": 0.22832945784288738, + "learning_rate": 3.0710669876302314e-06, + "loss": 0.8164, + "num_tokens": 59644677465.0, + "step": 14270 + }, + { + "epoch": 1.6959001782531193, + "grad_norm": 0.2540448416077137, + "learning_rate": 3.070247689484581e-06, + "loss": 0.8071, + "num_tokens": 59648822374.0, + "step": 14271 + }, + { + "epoch": 1.6960190136660724, + "grad_norm": 0.2533644943473855, + "learning_rate": 3.0694286850044664e-06, + "loss": 0.8566, + "num_tokens": 59653011942.0, + "step": 14272 + }, + { + "epoch": 1.6961378490790255, + "grad_norm": 0.22820000960438436, + "learning_rate": 3.0686099742202236e-06, + "loss": 0.7874, + "num_tokens": 59657202807.0, + "step": 14273 + }, + { + "epoch": 1.6962566844919786, + "grad_norm": 0.24978369186558647, + "learning_rate": 3.06779155716217e-06, + "loss": 0.7864, + "num_tokens": 59661392359.0, + "step": 14274 + }, + { + "epoch": 1.6963755199049317, + "grad_norm": 0.23881497344450642, + "learning_rate": 3.066973433860614e-06, + "loss": 0.814, + "num_tokens": 59665543766.0, + "step": 14275 + }, + { + "epoch": 1.6964943553178848, + "grad_norm": 0.2275367210354127, + "learning_rate": 3.066155604345856e-06, + "loss": 0.7684, + "num_tokens": 59669732239.0, + "step": 14276 + }, + { + "epoch": 1.696613190730838, + "grad_norm": 0.2556992960270156, + "learning_rate": 3.06533806864818e-06, + "loss": 0.8318, + "num_tokens": 59673921719.0, + "step": 14277 + }, + { + "epoch": 1.6967320261437908, + "grad_norm": 0.2266191843348843, + "learning_rate": 3.0645208267978597e-06, + "loss": 0.8003, + "num_tokens": 59678112213.0, + "step": 14278 + }, + { + "epoch": 1.696850861556744, + "grad_norm": 0.2340616756369219, + "learning_rate": 3.0637038788251667e-06, + "loss": 0.813, + "num_tokens": 59682301648.0, + "step": 14279 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 0.24218895101281032, + "learning_rate": 3.0628872247603516e-06, + "loss": 0.8306, + "num_tokens": 59686490973.0, + "step": 14280 + }, + { + "epoch": 1.69708853238265, + "grad_norm": 0.24131029130973464, + "learning_rate": 3.0620708646336607e-06, + "loss": 0.7795, + "num_tokens": 59690651488.0, + "step": 14281 + }, + { + "epoch": 1.697207367795603, + "grad_norm": 0.232614772525682, + "learning_rate": 3.0612547984753227e-06, + "loss": 0.7981, + "num_tokens": 59694841292.0, + "step": 14282 + }, + { + "epoch": 1.697326203208556, + "grad_norm": 0.23471338932194766, + "learning_rate": 3.0604390263155603e-06, + "loss": 0.8058, + "num_tokens": 59699007818.0, + "step": 14283 + }, + { + "epoch": 1.6974450386215092, + "grad_norm": 0.23159431094485688, + "learning_rate": 3.059623548184584e-06, + "loss": 0.8341, + "num_tokens": 59703198040.0, + "step": 14284 + }, + { + "epoch": 1.6975638740344623, + "grad_norm": 0.22474553750294948, + "learning_rate": 3.0588083641125958e-06, + "loss": 0.819, + "num_tokens": 59707363834.0, + "step": 14285 + }, + { + "epoch": 1.6976827094474154, + "grad_norm": 0.23946578030849858, + "learning_rate": 3.0579934741297867e-06, + "loss": 0.8058, + "num_tokens": 59711553838.0, + "step": 14286 + }, + { + "epoch": 1.6978015448603685, + "grad_norm": 0.23516329069447323, + "learning_rate": 3.0571788782663296e-06, + "loss": 0.8183, + "num_tokens": 59715741892.0, + "step": 14287 + }, + { + "epoch": 1.6979203802733216, + "grad_norm": 0.21637784312171804, + "learning_rate": 3.0563645765523937e-06, + "loss": 0.8033, + "num_tokens": 59719930880.0, + "step": 14288 + }, + { + "epoch": 1.6980392156862745, + "grad_norm": 0.23398445880436244, + "learning_rate": 3.055550569018136e-06, + "loss": 0.7983, + "num_tokens": 59724100783.0, + "step": 14289 + }, + { + "epoch": 1.6981580510992276, + "grad_norm": 0.22639116978545276, + "learning_rate": 3.0547368556937006e-06, + "loss": 0.8299, + "num_tokens": 59728289027.0, + "step": 14290 + }, + { + "epoch": 1.6982768865121807, + "grad_norm": 0.2311584184358533, + "learning_rate": 3.0539234366092234e-06, + "loss": 0.8063, + "num_tokens": 59732454734.0, + "step": 14291 + }, + { + "epoch": 1.6983957219251336, + "grad_norm": 0.2401414844083273, + "learning_rate": 3.053110311794828e-06, + "loss": 0.7965, + "num_tokens": 59736634556.0, + "step": 14292 + }, + { + "epoch": 1.6985145573380867, + "grad_norm": 0.2234642761753001, + "learning_rate": 3.0522974812806267e-06, + "loss": 0.8305, + "num_tokens": 59740824815.0, + "step": 14293 + }, + { + "epoch": 1.6986333927510398, + "grad_norm": 0.2323923730638068, + "learning_rate": 3.0514849450967217e-06, + "loss": 0.8092, + "num_tokens": 59745013963.0, + "step": 14294 + }, + { + "epoch": 1.6987522281639929, + "grad_norm": 0.2289031076547629, + "learning_rate": 3.050672703273203e-06, + "loss": 0.818, + "num_tokens": 59749177221.0, + "step": 14295 + }, + { + "epoch": 1.698871063576946, + "grad_norm": 0.2647050915021388, + "learning_rate": 3.0498607558401523e-06, + "loss": 0.8341, + "num_tokens": 59753364730.0, + "step": 14296 + }, + { + "epoch": 1.698989898989899, + "grad_norm": 0.22715268037660427, + "learning_rate": 3.049049102827638e-06, + "loss": 0.8627, + "num_tokens": 59757527763.0, + "step": 14297 + }, + { + "epoch": 1.6991087344028521, + "grad_norm": 0.25144967846421423, + "learning_rate": 3.048237744265718e-06, + "loss": 0.7845, + "num_tokens": 59761717277.0, + "step": 14298 + }, + { + "epoch": 1.6992275698158052, + "grad_norm": 0.24903439019688442, + "learning_rate": 3.0474266801844394e-06, + "loss": 0.804, + "num_tokens": 59765906073.0, + "step": 14299 + }, + { + "epoch": 1.6993464052287581, + "grad_norm": 0.22348777379083037, + "learning_rate": 3.046615910613839e-06, + "loss": 0.757, + "num_tokens": 59770095491.0, + "step": 14300 + }, + { + "epoch": 1.6994652406417112, + "grad_norm": 0.23442873369305467, + "learning_rate": 3.045805435583943e-06, + "loss": 0.7664, + "num_tokens": 59774285643.0, + "step": 14301 + }, + { + "epoch": 1.6995840760546643, + "grad_norm": 0.2543077220597582, + "learning_rate": 3.0449952551247674e-06, + "loss": 0.8275, + "num_tokens": 59778474319.0, + "step": 14302 + }, + { + "epoch": 1.6997029114676172, + "grad_norm": 0.2400449220604038, + "learning_rate": 3.0441853692663107e-06, + "loss": 0.7858, + "num_tokens": 59782649145.0, + "step": 14303 + }, + { + "epoch": 1.6998217468805703, + "grad_norm": 0.2487051349909738, + "learning_rate": 3.0433757780385688e-06, + "loss": 0.8253, + "num_tokens": 59786818399.0, + "step": 14304 + }, + { + "epoch": 1.6999405822935234, + "grad_norm": 0.22831906971314575, + "learning_rate": 3.0425664814715256e-06, + "loss": 0.7819, + "num_tokens": 59790999321.0, + "step": 14305 + }, + { + "epoch": 1.7000594177064765, + "grad_norm": 0.23472592639864592, + "learning_rate": 3.0417574795951498e-06, + "loss": 0.8081, + "num_tokens": 59795185816.0, + "step": 14306 + }, + { + "epoch": 1.7001782531194296, + "grad_norm": 0.23278211073722113, + "learning_rate": 3.040948772439404e-06, + "loss": 0.8126, + "num_tokens": 59799375000.0, + "step": 14307 + }, + { + "epoch": 1.7002970885323827, + "grad_norm": 0.2227239001545608, + "learning_rate": 3.0401403600342335e-06, + "loss": 0.7939, + "num_tokens": 59803564796.0, + "step": 14308 + }, + { + "epoch": 1.7004159239453358, + "grad_norm": 0.22191473041356663, + "learning_rate": 3.039332242409579e-06, + "loss": 0.7482, + "num_tokens": 59807755024.0, + "step": 14309 + }, + { + "epoch": 1.700534759358289, + "grad_norm": 0.23789257682068357, + "learning_rate": 3.0385244195953638e-06, + "loss": 0.7854, + "num_tokens": 59811943751.0, + "step": 14310 + }, + { + "epoch": 1.7006535947712418, + "grad_norm": 0.24832001268569381, + "learning_rate": 3.0377168916215105e-06, + "loss": 0.8359, + "num_tokens": 59816132208.0, + "step": 14311 + }, + { + "epoch": 1.7007724301841949, + "grad_norm": 0.23235262407291468, + "learning_rate": 3.036909658517924e-06, + "loss": 0.8, + "num_tokens": 59820319967.0, + "step": 14312 + }, + { + "epoch": 1.700891265597148, + "grad_norm": 0.25469677379681105, + "learning_rate": 3.0361027203144954e-06, + "loss": 0.8164, + "num_tokens": 59824489902.0, + "step": 14313 + }, + { + "epoch": 1.7010101010101009, + "grad_norm": 0.22647288231830476, + "learning_rate": 3.0352960770411087e-06, + "loss": 0.8113, + "num_tokens": 59828675317.0, + "step": 14314 + }, + { + "epoch": 1.701128936423054, + "grad_norm": 0.23487374410026934, + "learning_rate": 3.0344897287276376e-06, + "loss": 0.8001, + "num_tokens": 59832863365.0, + "step": 14315 + }, + { + "epoch": 1.701247771836007, + "grad_norm": 0.22835921510561236, + "learning_rate": 3.0336836754039416e-06, + "loss": 0.8152, + "num_tokens": 59837045322.0, + "step": 14316 + }, + { + "epoch": 1.7013666072489602, + "grad_norm": 0.223638161980932, + "learning_rate": 3.032877917099878e-06, + "loss": 0.7656, + "num_tokens": 59841217523.0, + "step": 14317 + }, + { + "epoch": 1.7014854426619133, + "grad_norm": 0.22125067688795438, + "learning_rate": 3.0320724538452814e-06, + "loss": 0.8397, + "num_tokens": 59845405836.0, + "step": 14318 + }, + { + "epoch": 1.7016042780748664, + "grad_norm": 0.210731803724964, + "learning_rate": 3.031267285669981e-06, + "loss": 0.8242, + "num_tokens": 59849594690.0, + "step": 14319 + }, + { + "epoch": 1.7017231134878195, + "grad_norm": 0.22415208519247873, + "learning_rate": 3.030462412603796e-06, + "loss": 0.7879, + "num_tokens": 59853764234.0, + "step": 14320 + }, + { + "epoch": 1.7018419489007726, + "grad_norm": 0.214438878062379, + "learning_rate": 3.029657834676534e-06, + "loss": 0.7724, + "num_tokens": 59857952162.0, + "step": 14321 + }, + { + "epoch": 1.7019607843137254, + "grad_norm": 0.21857995056599125, + "learning_rate": 3.0288535519179907e-06, + "loss": 0.7823, + "num_tokens": 59862143180.0, + "step": 14322 + }, + { + "epoch": 1.7020796197266785, + "grad_norm": 0.23168926220086294, + "learning_rate": 3.028049564357951e-06, + "loss": 0.8255, + "num_tokens": 59866333861.0, + "step": 14323 + }, + { + "epoch": 1.7021984551396316, + "grad_norm": 0.2168552279917475, + "learning_rate": 3.027245872026191e-06, + "loss": 0.7982, + "num_tokens": 59870523253.0, + "step": 14324 + }, + { + "epoch": 1.7023172905525845, + "grad_norm": 0.22584663515130682, + "learning_rate": 3.0264424749524713e-06, + "loss": 0.7791, + "num_tokens": 59874701378.0, + "step": 14325 + }, + { + "epoch": 1.7024361259655376, + "grad_norm": 0.2290976713770901, + "learning_rate": 3.025639373166547e-06, + "loss": 0.8121, + "num_tokens": 59878873706.0, + "step": 14326 + }, + { + "epoch": 1.7025549613784907, + "grad_norm": 0.220388459699598, + "learning_rate": 3.0248365666981577e-06, + "loss": 0.8224, + "num_tokens": 59883058664.0, + "step": 14327 + }, + { + "epoch": 1.7026737967914438, + "grad_norm": 0.23380221836672282, + "learning_rate": 3.0240340555770363e-06, + "loss": 0.8052, + "num_tokens": 59887249069.0, + "step": 14328 + }, + { + "epoch": 1.702792632204397, + "grad_norm": 0.22529456708860507, + "learning_rate": 3.0232318398329003e-06, + "loss": 0.7978, + "num_tokens": 59891409208.0, + "step": 14329 + }, + { + "epoch": 1.70291146761735, + "grad_norm": 0.2214881732718272, + "learning_rate": 3.02242991949546e-06, + "loss": 0.8147, + "num_tokens": 59895594114.0, + "step": 14330 + }, + { + "epoch": 1.7030303030303031, + "grad_norm": 0.227920435674472, + "learning_rate": 3.021628294594412e-06, + "loss": 0.8079, + "num_tokens": 59899776432.0, + "step": 14331 + }, + { + "epoch": 1.7031491384432562, + "grad_norm": 0.22711073846421057, + "learning_rate": 3.0208269651594436e-06, + "loss": 0.8036, + "num_tokens": 59903966132.0, + "step": 14332 + }, + { + "epoch": 1.7032679738562093, + "grad_norm": 0.22969004109831187, + "learning_rate": 3.020025931220231e-06, + "loss": 0.7853, + "num_tokens": 59908154914.0, + "step": 14333 + }, + { + "epoch": 1.7033868092691622, + "grad_norm": 0.22497582929550491, + "learning_rate": 3.019225192806442e-06, + "loss": 0.8086, + "num_tokens": 59912315219.0, + "step": 14334 + }, + { + "epoch": 1.7035056446821153, + "grad_norm": 0.23672368645843042, + "learning_rate": 3.018424749947723e-06, + "loss": 0.7902, + "num_tokens": 59916461901.0, + "step": 14335 + }, + { + "epoch": 1.7036244800950682, + "grad_norm": 0.22212338640186918, + "learning_rate": 3.0176246026737235e-06, + "loss": 0.8635, + "num_tokens": 59920638740.0, + "step": 14336 + }, + { + "epoch": 1.7037433155080213, + "grad_norm": 0.24239955506483357, + "learning_rate": 3.0168247510140753e-06, + "loss": 0.8183, + "num_tokens": 59924828242.0, + "step": 14337 + }, + { + "epoch": 1.7038621509209744, + "grad_norm": 0.22495800808950564, + "learning_rate": 3.0160251949983974e-06, + "loss": 0.7951, + "num_tokens": 59928998810.0, + "step": 14338 + }, + { + "epoch": 1.7039809863339275, + "grad_norm": 0.2533190763461733, + "learning_rate": 3.015225934656303e-06, + "loss": 0.8423, + "num_tokens": 59933187318.0, + "step": 14339 + }, + { + "epoch": 1.7040998217468806, + "grad_norm": 0.25609260551130786, + "learning_rate": 3.014426970017388e-06, + "loss": 0.8115, + "num_tokens": 59937369048.0, + "step": 14340 + }, + { + "epoch": 1.7042186571598337, + "grad_norm": 0.2288903132134777, + "learning_rate": 3.0136283011112403e-06, + "loss": 0.8092, + "num_tokens": 59941558992.0, + "step": 14341 + }, + { + "epoch": 1.7043374925727868, + "grad_norm": 0.2519067974956074, + "learning_rate": 3.0128299279674387e-06, + "loss": 0.7887, + "num_tokens": 59945747633.0, + "step": 14342 + }, + { + "epoch": 1.7044563279857399, + "grad_norm": 0.23907035294950538, + "learning_rate": 3.0120318506155527e-06, + "loss": 0.8251, + "num_tokens": 59949936431.0, + "step": 14343 + }, + { + "epoch": 1.704575163398693, + "grad_norm": 0.2269839978135148, + "learning_rate": 3.011234069085136e-06, + "loss": 0.7879, + "num_tokens": 59954122920.0, + "step": 14344 + }, + { + "epoch": 1.7046939988116458, + "grad_norm": 0.23784914769929164, + "learning_rate": 3.0104365834057307e-06, + "loss": 0.7639, + "num_tokens": 59958303131.0, + "step": 14345 + }, + { + "epoch": 1.704812834224599, + "grad_norm": 0.25476007981271986, + "learning_rate": 3.0096393936068724e-06, + "loss": 0.8315, + "num_tokens": 59962492775.0, + "step": 14346 + }, + { + "epoch": 1.7049316696375518, + "grad_norm": 0.2429124525602757, + "learning_rate": 3.008842499718083e-06, + "loss": 0.8071, + "num_tokens": 59966682066.0, + "step": 14347 + }, + { + "epoch": 1.705050505050505, + "grad_norm": 0.2525669134027923, + "learning_rate": 3.0080459017688736e-06, + "loss": 0.8119, + "num_tokens": 59970870159.0, + "step": 14348 + }, + { + "epoch": 1.705169340463458, + "grad_norm": 0.24367532884179682, + "learning_rate": 3.0072495997887493e-06, + "loss": 0.8122, + "num_tokens": 59975045297.0, + "step": 14349 + }, + { + "epoch": 1.7052881758764111, + "grad_norm": 0.2322064458302489, + "learning_rate": 3.0064535938071936e-06, + "loss": 0.81, + "num_tokens": 59979234728.0, + "step": 14350 + }, + { + "epoch": 1.7054070112893642, + "grad_norm": 0.245801945934389, + "learning_rate": 3.0056578838536886e-06, + "loss": 0.8082, + "num_tokens": 59983424527.0, + "step": 14351 + }, + { + "epoch": 1.7055258467023173, + "grad_norm": 0.22849456443761057, + "learning_rate": 3.0048624699577024e-06, + "loss": 0.8233, + "num_tokens": 59987561366.0, + "step": 14352 + }, + { + "epoch": 1.7056446821152704, + "grad_norm": 0.25042265725790636, + "learning_rate": 3.00406735214869e-06, + "loss": 0.8282, + "num_tokens": 59991707238.0, + "step": 14353 + }, + { + "epoch": 1.7057635175282235, + "grad_norm": 0.24516775342155497, + "learning_rate": 3.0032725304561e-06, + "loss": 0.7982, + "num_tokens": 59995875706.0, + "step": 14354 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.2346751343015147, + "learning_rate": 3.0024780049093643e-06, + "loss": 0.7843, + "num_tokens": 60000045681.0, + "step": 14355 + }, + { + "epoch": 1.7060011883541295, + "grad_norm": 0.21727504947993437, + "learning_rate": 3.0016837755379092e-06, + "loss": 0.7707, + "num_tokens": 60004181060.0, + "step": 14356 + }, + { + "epoch": 1.7061200237670826, + "grad_norm": 0.24153558923927498, + "learning_rate": 3.0008898423711467e-06, + "loss": 0.8292, + "num_tokens": 60008370004.0, + "step": 14357 + }, + { + "epoch": 1.7062388591800357, + "grad_norm": 0.2313224144361431, + "learning_rate": 3.0000962054384784e-06, + "loss": 0.8205, + "num_tokens": 60012559124.0, + "step": 14358 + }, + { + "epoch": 1.7063576945929886, + "grad_norm": 0.22970296670471876, + "learning_rate": 2.999302864769296e-06, + "loss": 0.8348, + "num_tokens": 60016749223.0, + "step": 14359 + }, + { + "epoch": 1.7064765300059417, + "grad_norm": 0.24126699425265025, + "learning_rate": 2.998509820392982e-06, + "loss": 0.7922, + "num_tokens": 60020938707.0, + "step": 14360 + }, + { + "epoch": 1.7065953654188948, + "grad_norm": 0.23444392218628962, + "learning_rate": 2.9977170723388994e-06, + "loss": 0.809, + "num_tokens": 60025127423.0, + "step": 14361 + }, + { + "epoch": 1.7067142008318479, + "grad_norm": 0.21461222061423255, + "learning_rate": 2.996924620636411e-06, + "loss": 0.8122, + "num_tokens": 60029314957.0, + "step": 14362 + }, + { + "epoch": 1.706833036244801, + "grad_norm": 0.23320154470513962, + "learning_rate": 2.9961324653148637e-06, + "loss": 0.8259, + "num_tokens": 60033481220.0, + "step": 14363 + }, + { + "epoch": 1.706951871657754, + "grad_norm": 0.23096133417890805, + "learning_rate": 2.9953406064035926e-06, + "loss": 0.796, + "num_tokens": 60037647610.0, + "step": 14364 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 0.23653663206088857, + "learning_rate": 2.9945490439319247e-06, + "loss": 0.787, + "num_tokens": 60041836737.0, + "step": 14365 + }, + { + "epoch": 1.7071895424836603, + "grad_norm": 0.25362782122870176, + "learning_rate": 2.9937577779291715e-06, + "loss": 0.8132, + "num_tokens": 60046022254.0, + "step": 14366 + }, + { + "epoch": 1.7073083778966132, + "grad_norm": 0.22987867165657574, + "learning_rate": 2.9929668084246357e-06, + "loss": 0.7679, + "num_tokens": 60050197278.0, + "step": 14367 + }, + { + "epoch": 1.7074272133095663, + "grad_norm": 0.24679430516477704, + "learning_rate": 2.9921761354476127e-06, + "loss": 0.8206, + "num_tokens": 60054364793.0, + "step": 14368 + }, + { + "epoch": 1.7075460487225194, + "grad_norm": 0.24206568844930862, + "learning_rate": 2.9913857590273833e-06, + "loss": 0.8222, + "num_tokens": 60058554766.0, + "step": 14369 + }, + { + "epoch": 1.7076648841354722, + "grad_norm": 0.2213988919455734, + "learning_rate": 2.990595679193218e-06, + "loss": 0.7956, + "num_tokens": 60062744559.0, + "step": 14370 + }, + { + "epoch": 1.7077837195484253, + "grad_norm": 0.2251943915166463, + "learning_rate": 2.989805895974374e-06, + "loss": 0.8128, + "num_tokens": 60066913829.0, + "step": 14371 + }, + { + "epoch": 1.7079025549613784, + "grad_norm": 0.23777398303563801, + "learning_rate": 2.9890164094001005e-06, + "loss": 0.7576, + "num_tokens": 60071049885.0, + "step": 14372 + }, + { + "epoch": 1.7080213903743315, + "grad_norm": 0.207067460721074, + "learning_rate": 2.988227219499635e-06, + "loss": 0.8269, + "num_tokens": 60075238773.0, + "step": 14373 + }, + { + "epoch": 1.7081402257872846, + "grad_norm": 0.2606107351894122, + "learning_rate": 2.987438326302202e-06, + "loss": 0.7899, + "num_tokens": 60079429321.0, + "step": 14374 + }, + { + "epoch": 1.7082590612002377, + "grad_norm": 0.24011452416747445, + "learning_rate": 2.9866497298370233e-06, + "loss": 0.806, + "num_tokens": 60083616542.0, + "step": 14375 + }, + { + "epoch": 1.7083778966131908, + "grad_norm": 0.23344008369956057, + "learning_rate": 2.985861430133296e-06, + "loss": 0.8091, + "num_tokens": 60087781608.0, + "step": 14376 + }, + { + "epoch": 1.708496732026144, + "grad_norm": 0.2513638103870826, + "learning_rate": 2.9850734272202174e-06, + "loss": 0.8212, + "num_tokens": 60091919626.0, + "step": 14377 + }, + { + "epoch": 1.7086155674390968, + "grad_norm": 0.24602844552863054, + "learning_rate": 2.9842857211269683e-06, + "loss": 0.8376, + "num_tokens": 60096108869.0, + "step": 14378 + }, + { + "epoch": 1.70873440285205, + "grad_norm": 0.22304052047557624, + "learning_rate": 2.983498311882721e-06, + "loss": 0.8076, + "num_tokens": 60100282628.0, + "step": 14379 + }, + { + "epoch": 1.708853238265003, + "grad_norm": 0.2500057783299023, + "learning_rate": 2.982711199516636e-06, + "loss": 0.8138, + "num_tokens": 60104470172.0, + "step": 14380 + }, + { + "epoch": 1.708972073677956, + "grad_norm": 0.2423503977101885, + "learning_rate": 2.981924384057861e-06, + "loss": 0.8045, + "num_tokens": 60108637606.0, + "step": 14381 + }, + { + "epoch": 1.709090909090909, + "grad_norm": 0.22779707448538464, + "learning_rate": 2.981137865535537e-06, + "loss": 0.7823, + "num_tokens": 60112808167.0, + "step": 14382 + }, + { + "epoch": 1.709209744503862, + "grad_norm": 0.24042191333912596, + "learning_rate": 2.98035164397879e-06, + "loss": 0.8109, + "num_tokens": 60116998441.0, + "step": 14383 + }, + { + "epoch": 1.7093285799168152, + "grad_norm": 0.22719700772398363, + "learning_rate": 2.979565719416737e-06, + "loss": 0.7724, + "num_tokens": 60121185654.0, + "step": 14384 + }, + { + "epoch": 1.7094474153297683, + "grad_norm": 0.2260442839966296, + "learning_rate": 2.9787800918784825e-06, + "loss": 0.793, + "num_tokens": 60125338445.0, + "step": 14385 + }, + { + "epoch": 1.7095662507427214, + "grad_norm": 0.23216121595370046, + "learning_rate": 2.9779947613931225e-06, + "loss": 0.7779, + "num_tokens": 60129526598.0, + "step": 14386 + }, + { + "epoch": 1.7096850861556745, + "grad_norm": 0.22996796444238587, + "learning_rate": 2.9772097279897392e-06, + "loss": 0.7954, + "num_tokens": 60133716149.0, + "step": 14387 + }, + { + "epoch": 1.7098039215686276, + "grad_norm": 0.22754636546244716, + "learning_rate": 2.9764249916974047e-06, + "loss": 0.8135, + "num_tokens": 60137905726.0, + "step": 14388 + }, + { + "epoch": 1.7099227569815805, + "grad_norm": 0.22044165383200712, + "learning_rate": 2.9756405525451814e-06, + "loss": 0.8072, + "num_tokens": 60142074430.0, + "step": 14389 + }, + { + "epoch": 1.7100415923945336, + "grad_norm": 0.2447527169009251, + "learning_rate": 2.974856410562119e-06, + "loss": 0.8268, + "num_tokens": 60146244819.0, + "step": 14390 + }, + { + "epoch": 1.7101604278074867, + "grad_norm": 0.24241895985147777, + "learning_rate": 2.9740725657772607e-06, + "loss": 0.8268, + "num_tokens": 60150402610.0, + "step": 14391 + }, + { + "epoch": 1.7102792632204395, + "grad_norm": 0.21630333958267486, + "learning_rate": 2.973289018219629e-06, + "loss": 0.7918, + "num_tokens": 60154592686.0, + "step": 14392 + }, + { + "epoch": 1.7103980986333926, + "grad_norm": 0.23624416904225834, + "learning_rate": 2.972505767918242e-06, + "loss": 0.8094, + "num_tokens": 60158781383.0, + "step": 14393 + }, + { + "epoch": 1.7105169340463457, + "grad_norm": 0.2184058818526069, + "learning_rate": 2.971722814902109e-06, + "loss": 0.7977, + "num_tokens": 60162957943.0, + "step": 14394 + }, + { + "epoch": 1.7106357694592988, + "grad_norm": 0.23354351145291005, + "learning_rate": 2.9709401592002253e-06, + "loss": 0.7623, + "num_tokens": 60167146874.0, + "step": 14395 + }, + { + "epoch": 1.710754604872252, + "grad_norm": 0.230399154172444, + "learning_rate": 2.970157800841576e-06, + "loss": 0.7949, + "num_tokens": 60171335645.0, + "step": 14396 + }, + { + "epoch": 1.710873440285205, + "grad_norm": 0.2315990326310714, + "learning_rate": 2.9693757398551315e-06, + "loss": 0.7735, + "num_tokens": 60175494123.0, + "step": 14397 + }, + { + "epoch": 1.7109922756981581, + "grad_norm": 0.23109611828496673, + "learning_rate": 2.968593976269855e-06, + "loss": 0.7814, + "num_tokens": 60179683192.0, + "step": 14398 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.22682937032263178, + "learning_rate": 2.967812510114697e-06, + "loss": 0.7875, + "num_tokens": 60183872662.0, + "step": 14399 + }, + { + "epoch": 1.7112299465240641, + "grad_norm": 0.24549823442612162, + "learning_rate": 2.967031341418601e-06, + "loss": 0.7803, + "num_tokens": 60188043510.0, + "step": 14400 + }, + { + "epoch": 1.7113487819370172, + "grad_norm": 0.23737782161513438, + "learning_rate": 2.966250470210496e-06, + "loss": 0.8047, + "num_tokens": 60192218082.0, + "step": 14401 + }, + { + "epoch": 1.7114676173499703, + "grad_norm": 0.23827464967641113, + "learning_rate": 2.9654698965192963e-06, + "loss": 0.8229, + "num_tokens": 60196397420.0, + "step": 14402 + }, + { + "epoch": 1.7115864527629232, + "grad_norm": 0.2410741876551795, + "learning_rate": 2.9646896203739127e-06, + "loss": 0.7861, + "num_tokens": 60200587358.0, + "step": 14403 + }, + { + "epoch": 1.7117052881758763, + "grad_norm": 0.22388821374754245, + "learning_rate": 2.9639096418032408e-06, + "loss": 0.7907, + "num_tokens": 60204757787.0, + "step": 14404 + }, + { + "epoch": 1.7118241235888294, + "grad_norm": 0.2388930163735374, + "learning_rate": 2.963129960836163e-06, + "loss": 0.8202, + "num_tokens": 60208941471.0, + "step": 14405 + }, + { + "epoch": 1.7119429590017825, + "grad_norm": 0.21925598785149814, + "learning_rate": 2.9623505775015604e-06, + "loss": 0.7971, + "num_tokens": 60213120700.0, + "step": 14406 + }, + { + "epoch": 1.7120617944147356, + "grad_norm": 0.23193204174808313, + "learning_rate": 2.96157149182829e-06, + "loss": 0.8305, + "num_tokens": 60217309640.0, + "step": 14407 + }, + { + "epoch": 1.7121806298276887, + "grad_norm": 0.22542304382271505, + "learning_rate": 2.9607927038452046e-06, + "loss": 0.7593, + "num_tokens": 60221500419.0, + "step": 14408 + }, + { + "epoch": 1.7122994652406418, + "grad_norm": 0.21669531531164682, + "learning_rate": 2.960014213581148e-06, + "loss": 0.7972, + "num_tokens": 60225647146.0, + "step": 14409 + }, + { + "epoch": 1.712418300653595, + "grad_norm": 0.22400908181380255, + "learning_rate": 2.9592360210649485e-06, + "loss": 0.803, + "num_tokens": 60229829450.0, + "step": 14410 + }, + { + "epoch": 1.7125371360665478, + "grad_norm": 0.22834605804513508, + "learning_rate": 2.9584581263254255e-06, + "loss": 0.807, + "num_tokens": 60234019093.0, + "step": 14411 + }, + { + "epoch": 1.7126559714795009, + "grad_norm": 0.22300287918402759, + "learning_rate": 2.9576805293913865e-06, + "loss": 0.8464, + "num_tokens": 60238209499.0, + "step": 14412 + }, + { + "epoch": 1.712774806892454, + "grad_norm": 0.2424371135557215, + "learning_rate": 2.9569032302916297e-06, + "loss": 0.7922, + "num_tokens": 60242399261.0, + "step": 14413 + }, + { + "epoch": 1.7128936423054069, + "grad_norm": 0.2317300525217859, + "learning_rate": 2.956126229054941e-06, + "loss": 0.816, + "num_tokens": 60246589124.0, + "step": 14414 + }, + { + "epoch": 1.71301247771836, + "grad_norm": 0.23502187648993922, + "learning_rate": 2.955349525710095e-06, + "loss": 0.8417, + "num_tokens": 60250772559.0, + "step": 14415 + }, + { + "epoch": 1.713131313131313, + "grad_norm": 0.2319028201392965, + "learning_rate": 2.9545731202858558e-06, + "loss": 0.8118, + "num_tokens": 60254930327.0, + "step": 14416 + }, + { + "epoch": 1.7132501485442662, + "grad_norm": 0.2507372420708197, + "learning_rate": 2.9537970128109766e-06, + "loss": 0.7992, + "num_tokens": 60259119696.0, + "step": 14417 + }, + { + "epoch": 1.7133689839572193, + "grad_norm": 0.22867322000273935, + "learning_rate": 2.953021203314197e-06, + "loss": 0.8229, + "num_tokens": 60263309133.0, + "step": 14418 + }, + { + "epoch": 1.7134878193701724, + "grad_norm": 0.22936004867948662, + "learning_rate": 2.9522456918242504e-06, + "loss": 0.7813, + "num_tokens": 60267499249.0, + "step": 14419 + }, + { + "epoch": 1.7136066547831255, + "grad_norm": 0.2332670376430616, + "learning_rate": 2.951470478369857e-06, + "loss": 0.7897, + "num_tokens": 60271689223.0, + "step": 14420 + }, + { + "epoch": 1.7137254901960786, + "grad_norm": 0.2270947415063501, + "learning_rate": 2.9506955629797234e-06, + "loss": 0.8046, + "num_tokens": 60275877439.0, + "step": 14421 + }, + { + "epoch": 1.7138443256090314, + "grad_norm": 0.22253511639817528, + "learning_rate": 2.9499209456825506e-06, + "loss": 0.8014, + "num_tokens": 60280065802.0, + "step": 14422 + }, + { + "epoch": 1.7139631610219845, + "grad_norm": 0.21957336642820327, + "learning_rate": 2.949146626507021e-06, + "loss": 0.7585, + "num_tokens": 60284244099.0, + "step": 14423 + }, + { + "epoch": 1.7140819964349376, + "grad_norm": 0.23924859127887016, + "learning_rate": 2.948372605481812e-06, + "loss": 0.7728, + "num_tokens": 60288403834.0, + "step": 14424 + }, + { + "epoch": 1.7142008318478905, + "grad_norm": 0.2202754035763323, + "learning_rate": 2.9475988826355893e-06, + "loss": 0.7818, + "num_tokens": 60292575765.0, + "step": 14425 + }, + { + "epoch": 1.7143196672608436, + "grad_norm": 0.22550438508488432, + "learning_rate": 2.9468254579970067e-06, + "loss": 0.8102, + "num_tokens": 60296762453.0, + "step": 14426 + }, + { + "epoch": 1.7144385026737967, + "grad_norm": 0.22426514055866942, + "learning_rate": 2.9460523315947065e-06, + "loss": 0.779, + "num_tokens": 60300950518.0, + "step": 14427 + }, + { + "epoch": 1.7145573380867498, + "grad_norm": 0.2243124581975354, + "learning_rate": 2.945279503457318e-06, + "loss": 0.7999, + "num_tokens": 60305116603.0, + "step": 14428 + }, + { + "epoch": 1.714676173499703, + "grad_norm": 0.23036573511654093, + "learning_rate": 2.9445069736134625e-06, + "loss": 0.7932, + "num_tokens": 60309293892.0, + "step": 14429 + }, + { + "epoch": 1.714795008912656, + "grad_norm": 0.2120550946545824, + "learning_rate": 2.943734742091751e-06, + "loss": 0.8077, + "num_tokens": 60313484275.0, + "step": 14430 + }, + { + "epoch": 1.7149138443256091, + "grad_norm": 0.22359472268621328, + "learning_rate": 2.942962808920779e-06, + "loss": 0.8241, + "num_tokens": 60317674187.0, + "step": 14431 + }, + { + "epoch": 1.7150326797385622, + "grad_norm": 0.22086827973508988, + "learning_rate": 2.94219117412914e-06, + "loss": 0.8115, + "num_tokens": 60321863676.0, + "step": 14432 + }, + { + "epoch": 1.7151515151515153, + "grad_norm": 0.22330559747516007, + "learning_rate": 2.9414198377454027e-06, + "loss": 0.8415, + "num_tokens": 60326021663.0, + "step": 14433 + }, + { + "epoch": 1.7152703505644682, + "grad_norm": 0.23617444162972484, + "learning_rate": 2.940648799798136e-06, + "loss": 0.7967, + "num_tokens": 60330210119.0, + "step": 14434 + }, + { + "epoch": 1.7153891859774213, + "grad_norm": 0.2176194981650115, + "learning_rate": 2.9398780603158934e-06, + "loss": 0.8404, + "num_tokens": 60334399362.0, + "step": 14435 + }, + { + "epoch": 1.7155080213903742, + "grad_norm": 0.22806467288339743, + "learning_rate": 2.9391076193272187e-06, + "loss": 0.8386, + "num_tokens": 60338573340.0, + "step": 14436 + }, + { + "epoch": 1.7156268568033273, + "grad_norm": 0.23476446295778428, + "learning_rate": 2.9383374768606434e-06, + "loss": 0.8036, + "num_tokens": 60342763963.0, + "step": 14437 + }, + { + "epoch": 1.7157456922162804, + "grad_norm": 0.22552485943683923, + "learning_rate": 2.9375676329446877e-06, + "loss": 0.7866, + "num_tokens": 60346953830.0, + "step": 14438 + }, + { + "epoch": 1.7158645276292335, + "grad_norm": 0.23155584946216667, + "learning_rate": 2.936798087607863e-06, + "loss": 0.788, + "num_tokens": 60351123267.0, + "step": 14439 + }, + { + "epoch": 1.7159833630421866, + "grad_norm": 0.23853527042027378, + "learning_rate": 2.936028840878666e-06, + "loss": 0.8118, + "num_tokens": 60355312200.0, + "step": 14440 + }, + { + "epoch": 1.7161021984551397, + "grad_norm": 0.24272691019995954, + "learning_rate": 2.935259892785588e-06, + "loss": 0.7859, + "num_tokens": 60359500150.0, + "step": 14441 + }, + { + "epoch": 1.7162210338680928, + "grad_norm": 0.24015125376414448, + "learning_rate": 2.934491243357103e-06, + "loss": 0.806, + "num_tokens": 60363679009.0, + "step": 14442 + }, + { + "epoch": 1.7163398692810459, + "grad_norm": 0.24229615334709223, + "learning_rate": 2.9337228926216777e-06, + "loss": 0.8029, + "num_tokens": 60367840804.0, + "step": 14443 + }, + { + "epoch": 1.716458704693999, + "grad_norm": 0.2200091761118249, + "learning_rate": 2.9329548406077646e-06, + "loss": 0.7711, + "num_tokens": 60372029882.0, + "step": 14444 + }, + { + "epoch": 1.7165775401069518, + "grad_norm": 0.22950109110368908, + "learning_rate": 2.9321870873438106e-06, + "loss": 0.8073, + "num_tokens": 60376219125.0, + "step": 14445 + }, + { + "epoch": 1.716696375519905, + "grad_norm": 0.23883944780022667, + "learning_rate": 2.9314196328582473e-06, + "loss": 0.8115, + "num_tokens": 60380409271.0, + "step": 14446 + }, + { + "epoch": 1.7168152109328578, + "grad_norm": 0.2184453031245096, + "learning_rate": 2.930652477179495e-06, + "loss": 0.8146, + "num_tokens": 60384597220.0, + "step": 14447 + }, + { + "epoch": 1.716934046345811, + "grad_norm": 0.22884478027457306, + "learning_rate": 2.929885620335964e-06, + "loss": 0.8156, + "num_tokens": 60388761818.0, + "step": 14448 + }, + { + "epoch": 1.717052881758764, + "grad_norm": 0.2517913337106646, + "learning_rate": 2.9291190623560558e-06, + "loss": 0.8249, + "num_tokens": 60392950363.0, + "step": 14449 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 0.24081262091630368, + "learning_rate": 2.9283528032681535e-06, + "loss": 0.824, + "num_tokens": 60397113089.0, + "step": 14450 + }, + { + "epoch": 1.7172905525846702, + "grad_norm": 0.23695545473870497, + "learning_rate": 2.9275868431006406e-06, + "loss": 0.8073, + "num_tokens": 60401302768.0, + "step": 14451 + }, + { + "epoch": 1.7174093879976233, + "grad_norm": 0.23534052350899343, + "learning_rate": 2.926821181881879e-06, + "loss": 0.7892, + "num_tokens": 60405484105.0, + "step": 14452 + }, + { + "epoch": 1.7175282234105764, + "grad_norm": 0.22489864678358668, + "learning_rate": 2.9260558196402254e-06, + "loss": 0.801, + "num_tokens": 60409642984.0, + "step": 14453 + }, + { + "epoch": 1.7176470588235295, + "grad_norm": 0.2484160767648355, + "learning_rate": 2.9252907564040268e-06, + "loss": 0.8194, + "num_tokens": 60413832999.0, + "step": 14454 + }, + { + "epoch": 1.7177658942364826, + "grad_norm": 0.23571475237842104, + "learning_rate": 2.92452599220161e-06, + "loss": 0.8196, + "num_tokens": 60418001936.0, + "step": 14455 + }, + { + "epoch": 1.7178847296494355, + "grad_norm": 0.26281714654803406, + "learning_rate": 2.9237615270612974e-06, + "loss": 0.794, + "num_tokens": 60422183601.0, + "step": 14456 + }, + { + "epoch": 1.7180035650623886, + "grad_norm": 0.2404784688478273, + "learning_rate": 2.922997361011403e-06, + "loss": 0.8264, + "num_tokens": 60426341053.0, + "step": 14457 + }, + { + "epoch": 1.7181224004753417, + "grad_norm": 0.25712830804862663, + "learning_rate": 2.922233494080226e-06, + "loss": 0.8309, + "num_tokens": 60430501266.0, + "step": 14458 + }, + { + "epoch": 1.7182412358882946, + "grad_norm": 0.2542264332576408, + "learning_rate": 2.9214699262960574e-06, + "loss": 0.7988, + "num_tokens": 60434677291.0, + "step": 14459 + }, + { + "epoch": 1.7183600713012477, + "grad_norm": 0.24836258734271388, + "learning_rate": 2.9207066576871675e-06, + "loss": 0.8096, + "num_tokens": 60438841435.0, + "step": 14460 + }, + { + "epoch": 1.7184789067142008, + "grad_norm": 0.24980887724639902, + "learning_rate": 2.9199436882818276e-06, + "loss": 0.8406, + "num_tokens": 60443031572.0, + "step": 14461 + }, + { + "epoch": 1.7185977421271539, + "grad_norm": 0.24796636909474706, + "learning_rate": 2.9191810181082935e-06, + "loss": 0.8243, + "num_tokens": 60447181441.0, + "step": 14462 + }, + { + "epoch": 1.718716577540107, + "grad_norm": 0.2382068768943965, + "learning_rate": 2.918418647194805e-06, + "loss": 0.8152, + "num_tokens": 60451343422.0, + "step": 14463 + }, + { + "epoch": 1.71883541295306, + "grad_norm": 0.23043592095601978, + "learning_rate": 2.9176565755696027e-06, + "loss": 0.7758, + "num_tokens": 60455533041.0, + "step": 14464 + }, + { + "epoch": 1.7189542483660132, + "grad_norm": 0.22590180828527315, + "learning_rate": 2.9168948032609017e-06, + "loss": 0.8504, + "num_tokens": 60459721553.0, + "step": 14465 + }, + { + "epoch": 1.7190730837789663, + "grad_norm": 0.2353458291156931, + "learning_rate": 2.9161333302969176e-06, + "loss": 0.8385, + "num_tokens": 60463880745.0, + "step": 14466 + }, + { + "epoch": 1.7191919191919192, + "grad_norm": 0.22061864033285714, + "learning_rate": 2.915372156705847e-06, + "loss": 0.7934, + "num_tokens": 60468071003.0, + "step": 14467 + }, + { + "epoch": 1.7193107546048723, + "grad_norm": 0.24097286293781237, + "learning_rate": 2.914611282515881e-06, + "loss": 0.7981, + "num_tokens": 60472189704.0, + "step": 14468 + }, + { + "epoch": 1.7194295900178254, + "grad_norm": 0.22911800067308197, + "learning_rate": 2.9138507077551953e-06, + "loss": 0.8049, + "num_tokens": 60476380427.0, + "step": 14469 + }, + { + "epoch": 1.7195484254307782, + "grad_norm": 0.24329734278353568, + "learning_rate": 2.913090432451959e-06, + "loss": 0.8084, + "num_tokens": 60480571857.0, + "step": 14470 + }, + { + "epoch": 1.7196672608437313, + "grad_norm": 0.23650914526781622, + "learning_rate": 2.9123304566343264e-06, + "loss": 0.8151, + "num_tokens": 60484761463.0, + "step": 14471 + }, + { + "epoch": 1.7197860962566844, + "grad_norm": 0.2421962804544552, + "learning_rate": 2.9115707803304416e-06, + "loss": 0.7896, + "num_tokens": 60488917667.0, + "step": 14472 + }, + { + "epoch": 1.7199049316696375, + "grad_norm": 0.24418785763564663, + "learning_rate": 2.910811403568439e-06, + "loss": 0.8284, + "num_tokens": 60493080499.0, + "step": 14473 + }, + { + "epoch": 1.7200237670825906, + "grad_norm": 0.2148584276973762, + "learning_rate": 2.9100523263764396e-06, + "loss": 0.8356, + "num_tokens": 60497268652.0, + "step": 14474 + }, + { + "epoch": 1.7201426024955437, + "grad_norm": 0.23450042230708193, + "learning_rate": 2.9092935487825557e-06, + "loss": 0.7925, + "num_tokens": 60501458212.0, + "step": 14475 + }, + { + "epoch": 1.7202614379084968, + "grad_norm": 0.2217729282382532, + "learning_rate": 2.9085350708148873e-06, + "loss": 0.7721, + "num_tokens": 60505637103.0, + "step": 14476 + }, + { + "epoch": 1.72038027332145, + "grad_norm": 0.256088467919176, + "learning_rate": 2.9077768925015227e-06, + "loss": 0.8366, + "num_tokens": 60509776438.0, + "step": 14477 + }, + { + "epoch": 1.7204991087344028, + "grad_norm": 0.2333491202004662, + "learning_rate": 2.90701901387054e-06, + "loss": 0.805, + "num_tokens": 60513965002.0, + "step": 14478 + }, + { + "epoch": 1.720617944147356, + "grad_norm": 0.23326799892698427, + "learning_rate": 2.9062614349500073e-06, + "loss": 0.8054, + "num_tokens": 60518153182.0, + "step": 14479 + }, + { + "epoch": 1.720736779560309, + "grad_norm": 0.22239980584042854, + "learning_rate": 2.9055041557679797e-06, + "loss": 0.7799, + "num_tokens": 60522342888.0, + "step": 14480 + }, + { + "epoch": 1.720855614973262, + "grad_norm": 0.23297676676112522, + "learning_rate": 2.9047471763525003e-06, + "loss": 0.7922, + "num_tokens": 60526501398.0, + "step": 14481 + }, + { + "epoch": 1.720974450386215, + "grad_norm": 0.22311683302122481, + "learning_rate": 2.9039904967316013e-06, + "loss": 0.8277, + "num_tokens": 60530686808.0, + "step": 14482 + }, + { + "epoch": 1.721093285799168, + "grad_norm": 0.23053093675171935, + "learning_rate": 2.9032341169333093e-06, + "loss": 0.789, + "num_tokens": 60534869646.0, + "step": 14483 + }, + { + "epoch": 1.7212121212121212, + "grad_norm": 0.24021344701836223, + "learning_rate": 2.9024780369856343e-06, + "loss": 0.8203, + "num_tokens": 60539058295.0, + "step": 14484 + }, + { + "epoch": 1.7213309566250743, + "grad_norm": 0.23098956075433227, + "learning_rate": 2.901722256916578e-06, + "loss": 0.8525, + "num_tokens": 60543247682.0, + "step": 14485 + }, + { + "epoch": 1.7214497920380274, + "grad_norm": 0.27546266619918686, + "learning_rate": 2.9009667767541257e-06, + "loss": 0.813, + "num_tokens": 60547436943.0, + "step": 14486 + }, + { + "epoch": 1.7215686274509805, + "grad_norm": 0.27519728348701233, + "learning_rate": 2.900211596526257e-06, + "loss": 0.836, + "num_tokens": 60551626116.0, + "step": 14487 + }, + { + "epoch": 1.7216874628639336, + "grad_norm": 0.25622877492988977, + "learning_rate": 2.899456716260936e-06, + "loss": 0.8424, + "num_tokens": 60555816717.0, + "step": 14488 + }, + { + "epoch": 1.7218062982768865, + "grad_norm": 0.2620354261615198, + "learning_rate": 2.8987021359861243e-06, + "loss": 0.8216, + "num_tokens": 60560004930.0, + "step": 14489 + }, + { + "epoch": 1.7219251336898396, + "grad_norm": 0.22788772171816454, + "learning_rate": 2.8979478557297653e-06, + "loss": 0.8015, + "num_tokens": 60564185161.0, + "step": 14490 + }, + { + "epoch": 1.7220439691027927, + "grad_norm": 0.2422656225265525, + "learning_rate": 2.897193875519789e-06, + "loss": 0.7985, + "num_tokens": 60568349917.0, + "step": 14491 + }, + { + "epoch": 1.7221628045157455, + "grad_norm": 0.2426480600677288, + "learning_rate": 2.8964401953841186e-06, + "loss": 0.7982, + "num_tokens": 60572540162.0, + "step": 14492 + }, + { + "epoch": 1.7222816399286986, + "grad_norm": 0.21967007997820545, + "learning_rate": 2.895686815350669e-06, + "loss": 0.8578, + "num_tokens": 60576727387.0, + "step": 14493 + }, + { + "epoch": 1.7224004753416517, + "grad_norm": 0.26632287143827454, + "learning_rate": 2.8949337354473343e-06, + "loss": 0.8097, + "num_tokens": 60580917301.0, + "step": 14494 + }, + { + "epoch": 1.7225193107546048, + "grad_norm": 0.23166411024472172, + "learning_rate": 2.8941809557020105e-06, + "loss": 0.7905, + "num_tokens": 60585102236.0, + "step": 14495 + }, + { + "epoch": 1.722638146167558, + "grad_norm": 0.244430826694355, + "learning_rate": 2.893428476142572e-06, + "loss": 0.7553, + "num_tokens": 60589289574.0, + "step": 14496 + }, + { + "epoch": 1.722756981580511, + "grad_norm": 0.2539284490113564, + "learning_rate": 2.892676296796885e-06, + "loss": 0.7612, + "num_tokens": 60593477191.0, + "step": 14497 + }, + { + "epoch": 1.7228758169934641, + "grad_norm": 0.23330471137266787, + "learning_rate": 2.891924417692807e-06, + "loss": 0.778, + "num_tokens": 60597667153.0, + "step": 14498 + }, + { + "epoch": 1.7229946524064172, + "grad_norm": 0.22440873802976788, + "learning_rate": 2.8911728388581806e-06, + "loss": 0.8503, + "num_tokens": 60601855342.0, + "step": 14499 + }, + { + "epoch": 1.7231134878193701, + "grad_norm": 0.2550952562233564, + "learning_rate": 2.890421560320842e-06, + "loss": 0.8022, + "num_tokens": 60606044307.0, + "step": 14500 + }, + { + "epoch": 1.7232323232323232, + "grad_norm": 0.22394383405413662, + "learning_rate": 2.889670582108612e-06, + "loss": 0.8592, + "num_tokens": 60610185827.0, + "step": 14501 + }, + { + "epoch": 1.7233511586452763, + "grad_norm": 0.22505614771182883, + "learning_rate": 2.888919904249301e-06, + "loss": 0.8125, + "num_tokens": 60614355769.0, + "step": 14502 + }, + { + "epoch": 1.7234699940582292, + "grad_norm": 0.22982600663796168, + "learning_rate": 2.888169526770711e-06, + "loss": 0.7642, + "num_tokens": 60618517869.0, + "step": 14503 + }, + { + "epoch": 1.7235888294711823, + "grad_norm": 0.23395998878621424, + "learning_rate": 2.88741944970063e-06, + "loss": 0.8338, + "num_tokens": 60622707739.0, + "step": 14504 + }, + { + "epoch": 1.7237076648841354, + "grad_norm": 0.2256092967373064, + "learning_rate": 2.8866696730668362e-06, + "loss": 0.8198, + "num_tokens": 60626897339.0, + "step": 14505 + }, + { + "epoch": 1.7238265002970885, + "grad_norm": 0.2428856855077664, + "learning_rate": 2.8859201968970974e-06, + "loss": 0.8003, + "num_tokens": 60631057567.0, + "step": 14506 + }, + { + "epoch": 1.7239453357100416, + "grad_norm": 0.23215419851859903, + "learning_rate": 2.8851710212191653e-06, + "loss": 0.8355, + "num_tokens": 60635247027.0, + "step": 14507 + }, + { + "epoch": 1.7240641711229947, + "grad_norm": 0.2372840318411981, + "learning_rate": 2.884422146060788e-06, + "loss": 0.8294, + "num_tokens": 60639393926.0, + "step": 14508 + }, + { + "epoch": 1.7241830065359478, + "grad_norm": 0.2357911554337545, + "learning_rate": 2.8836735714496987e-06, + "loss": 0.8079, + "num_tokens": 60643554075.0, + "step": 14509 + }, + { + "epoch": 1.724301841948901, + "grad_norm": 0.24093309194686682, + "learning_rate": 2.882925297413619e-06, + "loss": 0.8033, + "num_tokens": 60647722177.0, + "step": 14510 + }, + { + "epoch": 1.7244206773618538, + "grad_norm": 0.23138419310715405, + "learning_rate": 2.882177323980261e-06, + "loss": 0.7966, + "num_tokens": 60651912087.0, + "step": 14511 + }, + { + "epoch": 1.7245395127748069, + "grad_norm": 0.22781900184041204, + "learning_rate": 2.881429651177324e-06, + "loss": 0.7583, + "num_tokens": 60656101404.0, + "step": 14512 + }, + { + "epoch": 1.72465834818776, + "grad_norm": 0.22000631228315037, + "learning_rate": 2.8806822790324934e-06, + "loss": 0.8243, + "num_tokens": 60660274741.0, + "step": 14513 + }, + { + "epoch": 1.7247771836007129, + "grad_norm": 0.23649164777784704, + "learning_rate": 2.879935207573451e-06, + "loss": 0.7888, + "num_tokens": 60664441590.0, + "step": 14514 + }, + { + "epoch": 1.724896019013666, + "grad_norm": 0.21461595003405998, + "learning_rate": 2.879188436827864e-06, + "loss": 0.789, + "num_tokens": 60668631295.0, + "step": 14515 + }, + { + "epoch": 1.725014854426619, + "grad_norm": 0.2234294770475271, + "learning_rate": 2.878441966823387e-06, + "loss": 0.7619, + "num_tokens": 60672821408.0, + "step": 14516 + }, + { + "epoch": 1.7251336898395722, + "grad_norm": 0.2287272538546613, + "learning_rate": 2.877695797587662e-06, + "loss": 0.8173, + "num_tokens": 60677009137.0, + "step": 14517 + }, + { + "epoch": 1.7252525252525253, + "grad_norm": 0.2168791038221276, + "learning_rate": 2.8769499291483245e-06, + "loss": 0.8153, + "num_tokens": 60681187034.0, + "step": 14518 + }, + { + "epoch": 1.7253713606654784, + "grad_norm": 0.24866164460849502, + "learning_rate": 2.8762043615329948e-06, + "loss": 0.815, + "num_tokens": 60685375524.0, + "step": 14519 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 0.22184851360166336, + "learning_rate": 2.875459094769284e-06, + "loss": 0.8101, + "num_tokens": 60689564379.0, + "step": 14520 + }, + { + "epoch": 1.7256090314913846, + "grad_norm": 0.2366726088767761, + "learning_rate": 2.8747141288847956e-06, + "loss": 0.8257, + "num_tokens": 60693753447.0, + "step": 14521 + }, + { + "epoch": 1.7257278669043374, + "grad_norm": 0.23344916121003714, + "learning_rate": 2.8739694639071136e-06, + "loss": 0.8184, + "num_tokens": 60697929073.0, + "step": 14522 + }, + { + "epoch": 1.7258467023172905, + "grad_norm": 0.24884726734244578, + "learning_rate": 2.873225099863817e-06, + "loss": 0.8121, + "num_tokens": 60702086367.0, + "step": 14523 + }, + { + "epoch": 1.7259655377302436, + "grad_norm": 0.2278402302017409, + "learning_rate": 2.8724810367824728e-06, + "loss": 0.8129, + "num_tokens": 60706251819.0, + "step": 14524 + }, + { + "epoch": 1.7260843731431965, + "grad_norm": 0.24213126630400808, + "learning_rate": 2.8717372746906345e-06, + "loss": 0.8412, + "num_tokens": 60710441430.0, + "step": 14525 + }, + { + "epoch": 1.7262032085561496, + "grad_norm": 0.24393914104037442, + "learning_rate": 2.8709938136158485e-06, + "loss": 0.8483, + "num_tokens": 60714629428.0, + "step": 14526 + }, + { + "epoch": 1.7263220439691027, + "grad_norm": 0.2316833307204354, + "learning_rate": 2.8702506535856444e-06, + "loss": 0.8214, + "num_tokens": 60718818327.0, + "step": 14527 + }, + { + "epoch": 1.7264408793820558, + "grad_norm": 0.2266504423543868, + "learning_rate": 2.8695077946275476e-06, + "loss": 0.8077, + "num_tokens": 60723008002.0, + "step": 14528 + }, + { + "epoch": 1.726559714795009, + "grad_norm": 0.23461278032096622, + "learning_rate": 2.8687652367690666e-06, + "loss": 0.8014, + "num_tokens": 60727173038.0, + "step": 14529 + }, + { + "epoch": 1.726678550207962, + "grad_norm": 0.22403632532081605, + "learning_rate": 2.8680229800377025e-06, + "loss": 0.8238, + "num_tokens": 60731355019.0, + "step": 14530 + }, + { + "epoch": 1.7267973856209151, + "grad_norm": 0.24204443354885377, + "learning_rate": 2.8672810244609408e-06, + "loss": 0.8092, + "num_tokens": 60735504963.0, + "step": 14531 + }, + { + "epoch": 1.7269162210338682, + "grad_norm": 0.2254989749432414, + "learning_rate": 2.8665393700662616e-06, + "loss": 0.7917, + "num_tokens": 60739694409.0, + "step": 14532 + }, + { + "epoch": 1.7270350564468213, + "grad_norm": 0.22916460264427335, + "learning_rate": 2.8657980168811266e-06, + "loss": 0.7944, + "num_tokens": 60743882279.0, + "step": 14533 + }, + { + "epoch": 1.7271538918597742, + "grad_norm": 0.2301664104713607, + "learning_rate": 2.865056964932995e-06, + "loss": 0.7748, + "num_tokens": 60748061650.0, + "step": 14534 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 0.23636869577230452, + "learning_rate": 2.8643162142493093e-06, + "loss": 0.8515, + "num_tokens": 60752249703.0, + "step": 14535 + }, + { + "epoch": 1.7273915626856802, + "grad_norm": 0.23238001329046792, + "learning_rate": 2.863575764857502e-06, + "loss": 0.8369, + "num_tokens": 60756439474.0, + "step": 14536 + }, + { + "epoch": 1.7275103980986333, + "grad_norm": 0.24156593225132372, + "learning_rate": 2.8628356167849945e-06, + "loss": 0.7845, + "num_tokens": 60760608515.0, + "step": 14537 + }, + { + "epoch": 1.7276292335115864, + "grad_norm": 0.23159179634416718, + "learning_rate": 2.862095770059195e-06, + "loss": 0.7741, + "num_tokens": 60764774220.0, + "step": 14538 + }, + { + "epoch": 1.7277480689245395, + "grad_norm": 0.2343162397949315, + "learning_rate": 2.861356224707503e-06, + "loss": 0.8683, + "num_tokens": 60768962388.0, + "step": 14539 + }, + { + "epoch": 1.7278669043374926, + "grad_norm": 0.2465054644898325, + "learning_rate": 2.8606169807573092e-06, + "loss": 0.8151, + "num_tokens": 60773150999.0, + "step": 14540 + }, + { + "epoch": 1.7279857397504457, + "grad_norm": 0.22858745392844645, + "learning_rate": 2.859878038235988e-06, + "loss": 0.8078, + "num_tokens": 60777338663.0, + "step": 14541 + }, + { + "epoch": 1.7281045751633988, + "grad_norm": 0.3009657366315986, + "learning_rate": 2.859139397170907e-06, + "loss": 0.8142, + "num_tokens": 60781527883.0, + "step": 14542 + }, + { + "epoch": 1.7282234105763519, + "grad_norm": 0.2271130920207119, + "learning_rate": 2.858401057589417e-06, + "loss": 0.8135, + "num_tokens": 60785717770.0, + "step": 14543 + }, + { + "epoch": 1.728342245989305, + "grad_norm": 0.22968886472949132, + "learning_rate": 2.857663019518865e-06, + "loss": 0.7766, + "num_tokens": 60789907886.0, + "step": 14544 + }, + { + "epoch": 1.7284610814022578, + "grad_norm": 0.2309009191339306, + "learning_rate": 2.856925282986578e-06, + "loss": 0.7978, + "num_tokens": 60794093792.0, + "step": 14545 + }, + { + "epoch": 1.728579916815211, + "grad_norm": 0.23009267684479046, + "learning_rate": 2.856187848019882e-06, + "loss": 0.7905, + "num_tokens": 60798281443.0, + "step": 14546 + }, + { + "epoch": 1.728698752228164, + "grad_norm": 0.2352363516666588, + "learning_rate": 2.8554507146460864e-06, + "loss": 0.8497, + "num_tokens": 60802471121.0, + "step": 14547 + }, + { + "epoch": 1.728817587641117, + "grad_norm": 0.2235341812891871, + "learning_rate": 2.854713882892487e-06, + "loss": 0.8144, + "num_tokens": 60806638478.0, + "step": 14548 + }, + { + "epoch": 1.72893642305407, + "grad_norm": 0.24152125576483066, + "learning_rate": 2.8539773527863724e-06, + "loss": 0.8414, + "num_tokens": 60810790816.0, + "step": 14549 + }, + { + "epoch": 1.7290552584670231, + "grad_norm": 0.24833479325311378, + "learning_rate": 2.8532411243550183e-06, + "loss": 0.8173, + "num_tokens": 60814937802.0, + "step": 14550 + }, + { + "epoch": 1.7291740938799762, + "grad_norm": 0.24329267175109331, + "learning_rate": 2.8525051976256917e-06, + "loss": 0.789, + "num_tokens": 60819095678.0, + "step": 14551 + }, + { + "epoch": 1.7292929292929293, + "grad_norm": 0.23312906051587315, + "learning_rate": 2.8517695726256432e-06, + "loss": 0.8025, + "num_tokens": 60823283665.0, + "step": 14552 + }, + { + "epoch": 1.7294117647058824, + "grad_norm": 0.2473295289904375, + "learning_rate": 2.8510342493821185e-06, + "loss": 0.7927, + "num_tokens": 60827467960.0, + "step": 14553 + }, + { + "epoch": 1.7295306001188355, + "grad_norm": 0.245794698251591, + "learning_rate": 2.8502992279223464e-06, + "loss": 0.8041, + "num_tokens": 60831656932.0, + "step": 14554 + }, + { + "epoch": 1.7296494355317886, + "grad_norm": 0.23302628019502145, + "learning_rate": 2.84956450827355e-06, + "loss": 0.8155, + "num_tokens": 60835845949.0, + "step": 14555 + }, + { + "epoch": 1.7297682709447415, + "grad_norm": 0.2664599260637497, + "learning_rate": 2.8488300904629368e-06, + "loss": 0.7714, + "num_tokens": 60840035320.0, + "step": 14556 + }, + { + "epoch": 1.7298871063576946, + "grad_norm": 0.24114694810099915, + "learning_rate": 2.8480959745177043e-06, + "loss": 0.7838, + "num_tokens": 60844224926.0, + "step": 14557 + }, + { + "epoch": 1.7300059417706477, + "grad_norm": 0.24039392654019462, + "learning_rate": 2.847362160465041e-06, + "loss": 0.7561, + "num_tokens": 60848413861.0, + "step": 14558 + }, + { + "epoch": 1.7301247771836006, + "grad_norm": 0.2159946047630325, + "learning_rate": 2.8466286483321213e-06, + "loss": 0.8089, + "num_tokens": 60852521834.0, + "step": 14559 + }, + { + "epoch": 1.7302436125965537, + "grad_norm": 0.23650545702849166, + "learning_rate": 2.8458954381461105e-06, + "loss": 0.8149, + "num_tokens": 60856678118.0, + "step": 14560 + }, + { + "epoch": 1.7303624480095068, + "grad_norm": 0.23768662148724645, + "learning_rate": 2.8451625299341607e-06, + "loss": 0.8102, + "num_tokens": 60860866299.0, + "step": 14561 + }, + { + "epoch": 1.7304812834224599, + "grad_norm": 0.22844571125293753, + "learning_rate": 2.8444299237234145e-06, + "loss": 0.7949, + "num_tokens": 60865027562.0, + "step": 14562 + }, + { + "epoch": 1.730600118835413, + "grad_norm": 0.23640084187453933, + "learning_rate": 2.8436976195410033e-06, + "loss": 0.7858, + "num_tokens": 60869185190.0, + "step": 14563 + }, + { + "epoch": 1.730718954248366, + "grad_norm": 0.22620602815675295, + "learning_rate": 2.8429656174140465e-06, + "loss": 0.8305, + "num_tokens": 60873373427.0, + "step": 14564 + }, + { + "epoch": 1.7308377896613192, + "grad_norm": 0.23441207689198493, + "learning_rate": 2.842233917369652e-06, + "loss": 0.8008, + "num_tokens": 60877561863.0, + "step": 14565 + }, + { + "epoch": 1.7309566250742723, + "grad_norm": 0.24723725298037585, + "learning_rate": 2.8415025194349173e-06, + "loss": 0.8277, + "num_tokens": 60881752293.0, + "step": 14566 + }, + { + "epoch": 1.7310754604872252, + "grad_norm": 0.22762171156031868, + "learning_rate": 2.84077142363693e-06, + "loss": 0.7964, + "num_tokens": 60885917814.0, + "step": 14567 + }, + { + "epoch": 1.7311942959001783, + "grad_norm": 0.25839790012597896, + "learning_rate": 2.8400406300027627e-06, + "loss": 0.8354, + "num_tokens": 60890095004.0, + "step": 14568 + }, + { + "epoch": 1.7313131313131314, + "grad_norm": 0.23372239629224795, + "learning_rate": 2.8393101385594833e-06, + "loss": 0.7909, + "num_tokens": 60894254482.0, + "step": 14569 + }, + { + "epoch": 1.7314319667260842, + "grad_norm": 0.22730201535478545, + "learning_rate": 2.8385799493341382e-06, + "loss": 0.8034, + "num_tokens": 60898442063.0, + "step": 14570 + }, + { + "epoch": 1.7315508021390373, + "grad_norm": 0.2480565420716033, + "learning_rate": 2.837850062353771e-06, + "loss": 0.8114, + "num_tokens": 60902630732.0, + "step": 14571 + }, + { + "epoch": 1.7316696375519904, + "grad_norm": 0.2285481499846159, + "learning_rate": 2.8371204776454154e-06, + "loss": 0.8302, + "num_tokens": 60906819350.0, + "step": 14572 + }, + { + "epoch": 1.7317884729649435, + "grad_norm": 0.2627377710319368, + "learning_rate": 2.8363911952360863e-06, + "loss": 0.7918, + "num_tokens": 60911008968.0, + "step": 14573 + }, + { + "epoch": 1.7319073083778966, + "grad_norm": 0.22609892937776446, + "learning_rate": 2.835662215152796e-06, + "loss": 0.8109, + "num_tokens": 60915198119.0, + "step": 14574 + }, + { + "epoch": 1.7320261437908497, + "grad_norm": 0.25167088833597356, + "learning_rate": 2.8349335374225358e-06, + "loss": 0.8239, + "num_tokens": 60919387197.0, + "step": 14575 + }, + { + "epoch": 1.7321449792038028, + "grad_norm": 0.24554439494118288, + "learning_rate": 2.834205162072293e-06, + "loss": 0.8405, + "num_tokens": 60923575758.0, + "step": 14576 + }, + { + "epoch": 1.732263814616756, + "grad_norm": 0.21762463151498018, + "learning_rate": 2.833477089129042e-06, + "loss": 0.8235, + "num_tokens": 60927764260.0, + "step": 14577 + }, + { + "epoch": 1.7323826500297088, + "grad_norm": 0.2887163408019189, + "learning_rate": 2.832749318619748e-06, + "loss": 0.8262, + "num_tokens": 60931924605.0, + "step": 14578 + }, + { + "epoch": 1.732501485442662, + "grad_norm": 0.24777431225197122, + "learning_rate": 2.83202185057136e-06, + "loss": 0.7996, + "num_tokens": 60936114243.0, + "step": 14579 + }, + { + "epoch": 1.732620320855615, + "grad_norm": 0.22617316251928932, + "learning_rate": 2.831294685010819e-06, + "loss": 0.832, + "num_tokens": 60940302943.0, + "step": 14580 + }, + { + "epoch": 1.732739156268568, + "grad_norm": 0.24596539679066146, + "learning_rate": 2.8305678219650557e-06, + "loss": 0.7915, + "num_tokens": 60944492746.0, + "step": 14581 + }, + { + "epoch": 1.732857991681521, + "grad_norm": 0.22134838464159903, + "learning_rate": 2.829841261460987e-06, + "loss": 0.7991, + "num_tokens": 60948680613.0, + "step": 14582 + }, + { + "epoch": 1.732976827094474, + "grad_norm": 0.2336873847860305, + "learning_rate": 2.8291150035255205e-06, + "loss": 0.8183, + "num_tokens": 60952840009.0, + "step": 14583 + }, + { + "epoch": 1.7330956625074272, + "grad_norm": 0.2321650523769679, + "learning_rate": 2.8283890481855514e-06, + "loss": 0.8055, + "num_tokens": 60957029480.0, + "step": 14584 + }, + { + "epoch": 1.7332144979203803, + "grad_norm": 0.22298410155377538, + "learning_rate": 2.8276633954679645e-06, + "loss": 0.8079, + "num_tokens": 60961219071.0, + "step": 14585 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.2356738146097493, + "learning_rate": 2.8269380453996338e-06, + "loss": 0.7977, + "num_tokens": 60965408352.0, + "step": 14586 + }, + { + "epoch": 1.7334521687462865, + "grad_norm": 0.2244045264539738, + "learning_rate": 2.8262129980074205e-06, + "loss": 0.8185, + "num_tokens": 60969594913.0, + "step": 14587 + }, + { + "epoch": 1.7335710041592396, + "grad_norm": 0.2238775444914127, + "learning_rate": 2.8254882533181777e-06, + "loss": 0.7898, + "num_tokens": 60973784628.0, + "step": 14588 + }, + { + "epoch": 1.7336898395721925, + "grad_norm": 0.224401882000009, + "learning_rate": 2.8247638113587428e-06, + "loss": 0.8034, + "num_tokens": 60977974577.0, + "step": 14589 + }, + { + "epoch": 1.7338086749851456, + "grad_norm": 0.23022990542821295, + "learning_rate": 2.824039672155944e-06, + "loss": 0.7695, + "num_tokens": 60982163345.0, + "step": 14590 + }, + { + "epoch": 1.7339275103980987, + "grad_norm": 0.2373866738328967, + "learning_rate": 2.8233158357366003e-06, + "loss": 0.7918, + "num_tokens": 60986350207.0, + "step": 14591 + }, + { + "epoch": 1.7340463458110515, + "grad_norm": 0.22693020385018012, + "learning_rate": 2.822592302127517e-06, + "loss": 0.8257, + "num_tokens": 60990508866.0, + "step": 14592 + }, + { + "epoch": 1.7341651812240046, + "grad_norm": 0.23037066129249653, + "learning_rate": 2.821869071355491e-06, + "loss": 0.7907, + "num_tokens": 60994688341.0, + "step": 14593 + }, + { + "epoch": 1.7342840166369577, + "grad_norm": 0.2308219014067047, + "learning_rate": 2.821146143447302e-06, + "loss": 0.8204, + "num_tokens": 60998875843.0, + "step": 14594 + }, + { + "epoch": 1.7344028520499108, + "grad_norm": 0.2264257658367434, + "learning_rate": 2.820423518429727e-06, + "loss": 0.7809, + "num_tokens": 61003064813.0, + "step": 14595 + }, + { + "epoch": 1.734521687462864, + "grad_norm": 0.25001294196360785, + "learning_rate": 2.8197011963295225e-06, + "loss": 0.8159, + "num_tokens": 61007255571.0, + "step": 14596 + }, + { + "epoch": 1.734640522875817, + "grad_norm": 0.21798388257598247, + "learning_rate": 2.818979177173442e-06, + "loss": 0.7962, + "num_tokens": 61011438053.0, + "step": 14597 + }, + { + "epoch": 1.7347593582887701, + "grad_norm": 0.2257024849289076, + "learning_rate": 2.8182574609882224e-06, + "loss": 0.8035, + "num_tokens": 61015628281.0, + "step": 14598 + }, + { + "epoch": 1.7348781937017232, + "grad_norm": 0.22664985027687579, + "learning_rate": 2.8175360478005936e-06, + "loss": 0.7981, + "num_tokens": 61019770953.0, + "step": 14599 + }, + { + "epoch": 1.7349970291146761, + "grad_norm": 0.2223299923200133, + "learning_rate": 2.816814937637271e-06, + "loss": 0.7977, + "num_tokens": 61023929926.0, + "step": 14600 + }, + { + "epoch": 1.7351158645276292, + "grad_norm": 0.2262118957994183, + "learning_rate": 2.8160941305249577e-06, + "loss": 0.7896, + "num_tokens": 61028118448.0, + "step": 14601 + }, + { + "epoch": 1.7352346999405823, + "grad_norm": 0.23174943855078126, + "learning_rate": 2.8153736264903487e-06, + "loss": 0.8018, + "num_tokens": 61032308221.0, + "step": 14602 + }, + { + "epoch": 1.7353535353535352, + "grad_norm": 0.22323867199180164, + "learning_rate": 2.8146534255601283e-06, + "loss": 0.7599, + "num_tokens": 61036497317.0, + "step": 14603 + }, + { + "epoch": 1.7354723707664883, + "grad_norm": 0.22807746676185814, + "learning_rate": 2.8139335277609664e-06, + "loss": 0.8072, + "num_tokens": 61040686702.0, + "step": 14604 + }, + { + "epoch": 1.7355912061794414, + "grad_norm": 0.22899651741326368, + "learning_rate": 2.8132139331195253e-06, + "loss": 0.8018, + "num_tokens": 61044866211.0, + "step": 14605 + }, + { + "epoch": 1.7357100415923945, + "grad_norm": 0.23358017477504317, + "learning_rate": 2.812494641662451e-06, + "loss": 0.8237, + "num_tokens": 61049055816.0, + "step": 14606 + }, + { + "epoch": 1.7358288770053476, + "grad_norm": 0.25654092540283885, + "learning_rate": 2.8117756534163828e-06, + "loss": 0.8087, + "num_tokens": 61053245353.0, + "step": 14607 + }, + { + "epoch": 1.7359477124183007, + "grad_norm": 0.24199300281359093, + "learning_rate": 2.8110569684079474e-06, + "loss": 0.8157, + "num_tokens": 61057434163.0, + "step": 14608 + }, + { + "epoch": 1.7360665478312538, + "grad_norm": 0.2245498748455533, + "learning_rate": 2.8103385866637583e-06, + "loss": 0.7974, + "num_tokens": 61061615941.0, + "step": 14609 + }, + { + "epoch": 1.736185383244207, + "grad_norm": 0.23837825127403153, + "learning_rate": 2.8096205082104246e-06, + "loss": 0.7832, + "num_tokens": 61065806234.0, + "step": 14610 + }, + { + "epoch": 1.7363042186571598, + "grad_norm": 0.24608755013407418, + "learning_rate": 2.8089027330745334e-06, + "loss": 0.8154, + "num_tokens": 61069995551.0, + "step": 14611 + }, + { + "epoch": 1.7364230540701129, + "grad_norm": 0.23592174826421622, + "learning_rate": 2.80818526128267e-06, + "loss": 0.812, + "num_tokens": 61074186597.0, + "step": 14612 + }, + { + "epoch": 1.736541889483066, + "grad_norm": 0.2506454840317937, + "learning_rate": 2.807468092861402e-06, + "loss": 0.816, + "num_tokens": 61078374994.0, + "step": 14613 + }, + { + "epoch": 1.7366607248960189, + "grad_norm": 0.23296456115850758, + "learning_rate": 2.8067512278372905e-06, + "loss": 0.7778, + "num_tokens": 61082564230.0, + "step": 14614 + }, + { + "epoch": 1.736779560308972, + "grad_norm": 0.24636424313941008, + "learning_rate": 2.8060346662368843e-06, + "loss": 0.806, + "num_tokens": 61086753445.0, + "step": 14615 + }, + { + "epoch": 1.736898395721925, + "grad_norm": 0.23886349939359006, + "learning_rate": 2.805318408086718e-06, + "loss": 0.8033, + "num_tokens": 61090942844.0, + "step": 14616 + }, + { + "epoch": 1.7370172311348782, + "grad_norm": 0.22498981110484223, + "learning_rate": 2.8046024534133183e-06, + "loss": 0.7951, + "num_tokens": 61095130041.0, + "step": 14617 + }, + { + "epoch": 1.7371360665478313, + "grad_norm": 0.24049733044963573, + "learning_rate": 2.8038868022431977e-06, + "loss": 0.7963, + "num_tokens": 61099318883.0, + "step": 14618 + }, + { + "epoch": 1.7372549019607844, + "grad_norm": 0.22494028924788487, + "learning_rate": 2.8031714546028627e-06, + "loss": 0.8355, + "num_tokens": 61103507892.0, + "step": 14619 + }, + { + "epoch": 1.7373737373737375, + "grad_norm": 0.24901714575383804, + "learning_rate": 2.8024564105188012e-06, + "loss": 0.8074, + "num_tokens": 61107697712.0, + "step": 14620 + }, + { + "epoch": 1.7374925727866906, + "grad_norm": 0.21911433459174773, + "learning_rate": 2.801741670017497e-06, + "loss": 0.797, + "num_tokens": 61111886392.0, + "step": 14621 + }, + { + "epoch": 1.7376114081996437, + "grad_norm": 0.22631861885918034, + "learning_rate": 2.801027233125415e-06, + "loss": 0.8267, + "num_tokens": 61116074166.0, + "step": 14622 + }, + { + "epoch": 1.7377302436125965, + "grad_norm": 0.23084445105706566, + "learning_rate": 2.800313099869018e-06, + "loss": 0.8116, + "num_tokens": 61120263785.0, + "step": 14623 + }, + { + "epoch": 1.7378490790255496, + "grad_norm": 0.23894881515388072, + "learning_rate": 2.79959927027475e-06, + "loss": 0.824, + "num_tokens": 61124453026.0, + "step": 14624 + }, + { + "epoch": 1.7379679144385025, + "grad_norm": 0.22591736774050947, + "learning_rate": 2.7988857443690476e-06, + "loss": 0.7667, + "num_tokens": 61128631985.0, + "step": 14625 + }, + { + "epoch": 1.7380867498514556, + "grad_norm": 0.23436088810911954, + "learning_rate": 2.7981725221783367e-06, + "loss": 0.7993, + "num_tokens": 61132807398.0, + "step": 14626 + }, + { + "epoch": 1.7382055852644087, + "grad_norm": 0.2338267458224703, + "learning_rate": 2.797459603729026e-06, + "loss": 0.8488, + "num_tokens": 61136983615.0, + "step": 14627 + }, + { + "epoch": 1.7383244206773618, + "grad_norm": 0.2299237774081913, + "learning_rate": 2.796746989047518e-06, + "loss": 0.7978, + "num_tokens": 61141134688.0, + "step": 14628 + }, + { + "epoch": 1.738443256090315, + "grad_norm": 0.22542594082944262, + "learning_rate": 2.7960346781602068e-06, + "loss": 0.8147, + "num_tokens": 61145297847.0, + "step": 14629 + }, + { + "epoch": 1.738562091503268, + "grad_norm": 0.23369862910974673, + "learning_rate": 2.7953226710934696e-06, + "loss": 0.8184, + "num_tokens": 61149488032.0, + "step": 14630 + }, + { + "epoch": 1.7386809269162211, + "grad_norm": 0.24467836159441983, + "learning_rate": 2.7946109678736768e-06, + "loss": 0.7699, + "num_tokens": 61153675661.0, + "step": 14631 + }, + { + "epoch": 1.7387997623291742, + "grad_norm": 0.21877332153237408, + "learning_rate": 2.7938995685271804e-06, + "loss": 0.803, + "num_tokens": 61157847696.0, + "step": 14632 + }, + { + "epoch": 1.7389185977421273, + "grad_norm": 0.23693750106699468, + "learning_rate": 2.7931884730803294e-06, + "loss": 0.8066, + "num_tokens": 61162036668.0, + "step": 14633 + }, + { + "epoch": 1.7390374331550802, + "grad_norm": 0.22843347158289823, + "learning_rate": 2.7924776815594544e-06, + "loss": 0.8107, + "num_tokens": 61166226135.0, + "step": 14634 + }, + { + "epoch": 1.7391562685680333, + "grad_norm": 0.21674027398071893, + "learning_rate": 2.7917671939908837e-06, + "loss": 0.8242, + "num_tokens": 61170413865.0, + "step": 14635 + }, + { + "epoch": 1.7392751039809862, + "grad_norm": 0.2371205445874886, + "learning_rate": 2.7910570104009278e-06, + "loss": 0.8398, + "num_tokens": 61174604208.0, + "step": 14636 + }, + { + "epoch": 1.7393939393939393, + "grad_norm": 0.2183437421862732, + "learning_rate": 2.790347130815884e-06, + "loss": 0.8234, + "num_tokens": 61178776965.0, + "step": 14637 + }, + { + "epoch": 1.7395127748068924, + "grad_norm": 0.21597488686553615, + "learning_rate": 2.7896375552620435e-06, + "loss": 0.8199, + "num_tokens": 61182965225.0, + "step": 14638 + }, + { + "epoch": 1.7396316102198455, + "grad_norm": 0.2313480598119458, + "learning_rate": 2.7889282837656846e-06, + "loss": 0.8094, + "num_tokens": 61187102965.0, + "step": 14639 + }, + { + "epoch": 1.7397504456327986, + "grad_norm": 0.2229859682723172, + "learning_rate": 2.7882193163530734e-06, + "loss": 0.7791, + "num_tokens": 61191271473.0, + "step": 14640 + }, + { + "epoch": 1.7398692810457517, + "grad_norm": 0.23165636857919594, + "learning_rate": 2.7875106530504654e-06, + "loss": 0.7802, + "num_tokens": 61195408302.0, + "step": 14641 + }, + { + "epoch": 1.7399881164587048, + "grad_norm": 0.23174255182685183, + "learning_rate": 2.7868022938841045e-06, + "loss": 0.8106, + "num_tokens": 61199596618.0, + "step": 14642 + }, + { + "epoch": 1.7401069518716579, + "grad_norm": 0.2372096720468823, + "learning_rate": 2.786094238880225e-06, + "loss": 0.8098, + "num_tokens": 61203760215.0, + "step": 14643 + }, + { + "epoch": 1.740225787284611, + "grad_norm": 0.23986992009593824, + "learning_rate": 2.7853864880650476e-06, + "loss": 0.7979, + "num_tokens": 61207949591.0, + "step": 14644 + }, + { + "epoch": 1.7403446226975638, + "grad_norm": 0.23359091792828499, + "learning_rate": 2.784679041464783e-06, + "loss": 0.8574, + "num_tokens": 61212139965.0, + "step": 14645 + }, + { + "epoch": 1.740463458110517, + "grad_norm": 0.2251291875210906, + "learning_rate": 2.7839718991056303e-06, + "loss": 0.7606, + "num_tokens": 61216329090.0, + "step": 14646 + }, + { + "epoch": 1.74058229352347, + "grad_norm": 0.2278253231201194, + "learning_rate": 2.783265061013777e-06, + "loss": 0.8108, + "num_tokens": 61220501214.0, + "step": 14647 + }, + { + "epoch": 1.740701128936423, + "grad_norm": 0.24084093048236738, + "learning_rate": 2.7825585272153986e-06, + "loss": 0.7967, + "num_tokens": 61224687816.0, + "step": 14648 + }, + { + "epoch": 1.740819964349376, + "grad_norm": 0.23262094520114188, + "learning_rate": 2.7818522977366633e-06, + "loss": 0.8125, + "num_tokens": 61228878261.0, + "step": 14649 + }, + { + "epoch": 1.7409387997623291, + "grad_norm": 0.23807750345758255, + "learning_rate": 2.7811463726037236e-06, + "loss": 0.8064, + "num_tokens": 61233061163.0, + "step": 14650 + }, + { + "epoch": 1.7410576351752822, + "grad_norm": 0.2230061399198184, + "learning_rate": 2.780440751842722e-06, + "loss": 0.8163, + "num_tokens": 61237241172.0, + "step": 14651 + }, + { + "epoch": 1.7411764705882353, + "grad_norm": 0.23264479424927959, + "learning_rate": 2.779735435479792e-06, + "loss": 0.7883, + "num_tokens": 61241429518.0, + "step": 14652 + }, + { + "epoch": 1.7412953060011884, + "grad_norm": 0.22627044178505737, + "learning_rate": 2.7790304235410494e-06, + "loss": 0.8199, + "num_tokens": 61245618789.0, + "step": 14653 + }, + { + "epoch": 1.7414141414141415, + "grad_norm": 0.2189345080482001, + "learning_rate": 2.7783257160526065e-06, + "loss": 0.8091, + "num_tokens": 61249807224.0, + "step": 14654 + }, + { + "epoch": 1.7415329768270946, + "grad_norm": 0.22350610581739416, + "learning_rate": 2.777621313040561e-06, + "loss": 0.8321, + "num_tokens": 61253922159.0, + "step": 14655 + }, + { + "epoch": 1.7416518122400475, + "grad_norm": 0.21745647907979415, + "learning_rate": 2.7769172145309986e-06, + "loss": 0.7733, + "num_tokens": 61258111395.0, + "step": 14656 + }, + { + "epoch": 1.7417706476530006, + "grad_norm": 0.22972145084131962, + "learning_rate": 2.776213420549997e-06, + "loss": 0.82, + "num_tokens": 61262273249.0, + "step": 14657 + }, + { + "epoch": 1.7418894830659537, + "grad_norm": 0.2320708526727392, + "learning_rate": 2.7755099311236156e-06, + "loss": 0.8193, + "num_tokens": 61266461228.0, + "step": 14658 + }, + { + "epoch": 1.7420083184789066, + "grad_norm": 0.22801363501198235, + "learning_rate": 2.774806746277909e-06, + "loss": 0.777, + "num_tokens": 61270649479.0, + "step": 14659 + }, + { + "epoch": 1.7421271538918597, + "grad_norm": 0.24122621629752763, + "learning_rate": 2.7741038660389174e-06, + "loss": 0.8303, + "num_tokens": 61274839483.0, + "step": 14660 + }, + { + "epoch": 1.7422459893048128, + "grad_norm": 0.22445288182391282, + "learning_rate": 2.773401290432674e-06, + "loss": 0.8202, + "num_tokens": 61279006210.0, + "step": 14661 + }, + { + "epoch": 1.7423648247177659, + "grad_norm": 0.2308675683489104, + "learning_rate": 2.7726990194851962e-06, + "loss": 0.7914, + "num_tokens": 61283196179.0, + "step": 14662 + }, + { + "epoch": 1.742483660130719, + "grad_norm": 0.2134851138286884, + "learning_rate": 2.77199705322249e-06, + "loss": 0.778, + "num_tokens": 61287385376.0, + "step": 14663 + }, + { + "epoch": 1.742602495543672, + "grad_norm": 0.22243099443145514, + "learning_rate": 2.7712953916705515e-06, + "loss": 0.7875, + "num_tokens": 61291568403.0, + "step": 14664 + }, + { + "epoch": 1.7427213309566252, + "grad_norm": 0.21287363125154965, + "learning_rate": 2.7705940348553673e-06, + "loss": 0.8256, + "num_tokens": 61295757634.0, + "step": 14665 + }, + { + "epoch": 1.7428401663695783, + "grad_norm": 0.23382973642492857, + "learning_rate": 2.7698929828029086e-06, + "loss": 0.8031, + "num_tokens": 61299946175.0, + "step": 14666 + }, + { + "epoch": 1.7429590017825312, + "grad_norm": 0.2251612025302696, + "learning_rate": 2.769192235539142e-06, + "loss": 0.8153, + "num_tokens": 61304135044.0, + "step": 14667 + }, + { + "epoch": 1.7430778371954843, + "grad_norm": 0.2342064968257382, + "learning_rate": 2.7684917930900155e-06, + "loss": 0.792, + "num_tokens": 61308323698.0, + "step": 14668 + }, + { + "epoch": 1.7431966726084374, + "grad_norm": 0.2392090768981343, + "learning_rate": 2.7677916554814695e-06, + "loss": 0.7894, + "num_tokens": 61312478518.0, + "step": 14669 + }, + { + "epoch": 1.7433155080213902, + "grad_norm": 0.22602715038973561, + "learning_rate": 2.7670918227394313e-06, + "loss": 0.8102, + "num_tokens": 61316667454.0, + "step": 14670 + }, + { + "epoch": 1.7434343434343433, + "grad_norm": 0.23462869316403404, + "learning_rate": 2.7663922948898192e-06, + "loss": 0.8366, + "num_tokens": 61320832892.0, + "step": 14671 + }, + { + "epoch": 1.7435531788472964, + "grad_norm": 0.21762827979700286, + "learning_rate": 2.7656930719585385e-06, + "loss": 0.7967, + "num_tokens": 61325008995.0, + "step": 14672 + }, + { + "epoch": 1.7436720142602495, + "grad_norm": 0.2414158536895231, + "learning_rate": 2.7649941539714846e-06, + "loss": 0.7627, + "num_tokens": 61329199109.0, + "step": 14673 + }, + { + "epoch": 1.7437908496732026, + "grad_norm": 0.21551645053295237, + "learning_rate": 2.7642955409545408e-06, + "loss": 0.8307, + "num_tokens": 61333387796.0, + "step": 14674 + }, + { + "epoch": 1.7439096850861557, + "grad_norm": 0.23044106311365342, + "learning_rate": 2.763597232933578e-06, + "loss": 0.8398, + "num_tokens": 61337577309.0, + "step": 14675 + }, + { + "epoch": 1.7440285204991088, + "grad_norm": 0.2474555678235731, + "learning_rate": 2.762899229934458e-06, + "loss": 0.8325, + "num_tokens": 61341749927.0, + "step": 14676 + }, + { + "epoch": 1.744147355912062, + "grad_norm": 0.21790252175453267, + "learning_rate": 2.76220153198303e-06, + "loss": 0.7904, + "num_tokens": 61345918953.0, + "step": 14677 + }, + { + "epoch": 1.7442661913250148, + "grad_norm": 0.24626357566497775, + "learning_rate": 2.761504139105133e-06, + "loss": 0.79, + "num_tokens": 61350089917.0, + "step": 14678 + }, + { + "epoch": 1.744385026737968, + "grad_norm": 0.23578389518305037, + "learning_rate": 2.76080705132659e-06, + "loss": 0.8226, + "num_tokens": 61354279802.0, + "step": 14679 + }, + { + "epoch": 1.744503862150921, + "grad_norm": 0.24838295535765365, + "learning_rate": 2.7601102686732205e-06, + "loss": 0.8061, + "num_tokens": 61358469706.0, + "step": 14680 + }, + { + "epoch": 1.744622697563874, + "grad_norm": 0.2233819994899022, + "learning_rate": 2.7594137911708274e-06, + "loss": 0.7665, + "num_tokens": 61362658132.0, + "step": 14681 + }, + { + "epoch": 1.744741532976827, + "grad_norm": 0.25339809302190175, + "learning_rate": 2.7587176188452035e-06, + "loss": 0.7924, + "num_tokens": 61366846919.0, + "step": 14682 + }, + { + "epoch": 1.74486036838978, + "grad_norm": 0.21618838196612916, + "learning_rate": 2.7580217517221336e-06, + "loss": 0.8136, + "num_tokens": 61371035162.0, + "step": 14683 + }, + { + "epoch": 1.7449792038027332, + "grad_norm": 0.251222895177096, + "learning_rate": 2.7573261898273825e-06, + "loss": 0.8582, + "num_tokens": 61375223617.0, + "step": 14684 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 0.23930128148433108, + "learning_rate": 2.75663093318671e-06, + "loss": 0.7963, + "num_tokens": 61379412838.0, + "step": 14685 + }, + { + "epoch": 1.7452168746286394, + "grad_norm": 0.23363907698697192, + "learning_rate": 2.7559359818258674e-06, + "loss": 0.8111, + "num_tokens": 61383592481.0, + "step": 14686 + }, + { + "epoch": 1.7453357100415925, + "grad_norm": 0.22835044896391848, + "learning_rate": 2.7552413357705882e-06, + "loss": 0.8266, + "num_tokens": 61387754466.0, + "step": 14687 + }, + { + "epoch": 1.7454545454545456, + "grad_norm": 0.22443978091466396, + "learning_rate": 2.7545469950466016e-06, + "loss": 0.7967, + "num_tokens": 61391909220.0, + "step": 14688 + }, + { + "epoch": 1.7455733808674985, + "grad_norm": 0.23346928606291695, + "learning_rate": 2.7538529596796153e-06, + "loss": 0.789, + "num_tokens": 61396093328.0, + "step": 14689 + }, + { + "epoch": 1.7456922162804516, + "grad_norm": 0.22454703721702274, + "learning_rate": 2.7531592296953365e-06, + "loss": 0.8113, + "num_tokens": 61400280123.0, + "step": 14690 + }, + { + "epoch": 1.7458110516934047, + "grad_norm": 0.2182458036822566, + "learning_rate": 2.752465805119453e-06, + "loss": 0.7787, + "num_tokens": 61404471137.0, + "step": 14691 + }, + { + "epoch": 1.7459298871063575, + "grad_norm": 0.24410796893816897, + "learning_rate": 2.7517726859776454e-06, + "loss": 0.8027, + "num_tokens": 61408659730.0, + "step": 14692 + }, + { + "epoch": 1.7460487225193106, + "grad_norm": 0.222800866948826, + "learning_rate": 2.7510798722955866e-06, + "loss": 0.7895, + "num_tokens": 61412850578.0, + "step": 14693 + }, + { + "epoch": 1.7461675579322637, + "grad_norm": 0.21989861965389196, + "learning_rate": 2.7503873640989287e-06, + "loss": 0.7988, + "num_tokens": 61416999519.0, + "step": 14694 + }, + { + "epoch": 1.7462863933452168, + "grad_norm": 0.24295241223839986, + "learning_rate": 2.7496951614133193e-06, + "loss": 0.7864, + "num_tokens": 61421189534.0, + "step": 14695 + }, + { + "epoch": 1.74640522875817, + "grad_norm": 0.21883325182781496, + "learning_rate": 2.7490032642643927e-06, + "loss": 0.8372, + "num_tokens": 61425377955.0, + "step": 14696 + }, + { + "epoch": 1.746524064171123, + "grad_norm": 0.23237089223560214, + "learning_rate": 2.748311672677773e-06, + "loss": 0.8273, + "num_tokens": 61429568011.0, + "step": 14697 + }, + { + "epoch": 1.7466428995840761, + "grad_norm": 0.2141472079327591, + "learning_rate": 2.747620386679072e-06, + "loss": 0.7618, + "num_tokens": 61433757591.0, + "step": 14698 + }, + { + "epoch": 1.7467617349970292, + "grad_norm": 0.22880460179094658, + "learning_rate": 2.7469294062938896e-06, + "loss": 0.7967, + "num_tokens": 61437947295.0, + "step": 14699 + }, + { + "epoch": 1.7468805704099821, + "grad_norm": 0.2204497202485623, + "learning_rate": 2.7462387315478144e-06, + "loss": 0.8302, + "num_tokens": 61442117443.0, + "step": 14700 + }, + { + "epoch": 1.7469994058229352, + "grad_norm": 0.22523253855160907, + "learning_rate": 2.745548362466427e-06, + "loss": 0.8426, + "num_tokens": 61446306397.0, + "step": 14701 + }, + { + "epoch": 1.7471182412358883, + "grad_norm": 0.22472474226146277, + "learning_rate": 2.744858299075292e-06, + "loss": 0.813, + "num_tokens": 61450487383.0, + "step": 14702 + }, + { + "epoch": 1.7472370766488412, + "grad_norm": 0.2137354029737421, + "learning_rate": 2.7441685413999656e-06, + "loss": 0.7938, + "num_tokens": 61454663555.0, + "step": 14703 + }, + { + "epoch": 1.7473559120617943, + "grad_norm": 0.22974120904228662, + "learning_rate": 2.743479089465993e-06, + "loss": 0.7772, + "num_tokens": 61458844025.0, + "step": 14704 + }, + { + "epoch": 1.7474747474747474, + "grad_norm": 0.2262345171321336, + "learning_rate": 2.7427899432989034e-06, + "loss": 0.8118, + "num_tokens": 61462983641.0, + "step": 14705 + }, + { + "epoch": 1.7475935828877005, + "grad_norm": 0.21898351993361265, + "learning_rate": 2.7421011029242217e-06, + "loss": 0.7738, + "num_tokens": 61467173002.0, + "step": 14706 + }, + { + "epoch": 1.7477124183006536, + "grad_norm": 0.24136598380419247, + "learning_rate": 2.741412568367457e-06, + "loss": 0.7786, + "num_tokens": 61471335289.0, + "step": 14707 + }, + { + "epoch": 1.7478312537136067, + "grad_norm": 0.2333198871905541, + "learning_rate": 2.740724339654108e-06, + "loss": 0.8243, + "num_tokens": 61475525170.0, + "step": 14708 + }, + { + "epoch": 1.7479500891265598, + "grad_norm": 0.2368991494952156, + "learning_rate": 2.7400364168096623e-06, + "loss": 0.8225, + "num_tokens": 61479713116.0, + "step": 14709 + }, + { + "epoch": 1.748068924539513, + "grad_norm": 0.24068087352429698, + "learning_rate": 2.739348799859596e-06, + "loss": 0.8303, + "num_tokens": 61483902734.0, + "step": 14710 + }, + { + "epoch": 1.7481877599524658, + "grad_norm": 0.2512945692201853, + "learning_rate": 2.7386614888293712e-06, + "loss": 0.8197, + "num_tokens": 61488091540.0, + "step": 14711 + }, + { + "epoch": 1.7483065953654189, + "grad_norm": 0.23415067065680953, + "learning_rate": 2.7379744837444443e-06, + "loss": 0.8081, + "num_tokens": 61492281463.0, + "step": 14712 + }, + { + "epoch": 1.748425430778372, + "grad_norm": 0.23587884576220816, + "learning_rate": 2.7372877846302575e-06, + "loss": 0.7875, + "num_tokens": 61496471861.0, + "step": 14713 + }, + { + "epoch": 1.7485442661913249, + "grad_norm": 0.2122500631248227, + "learning_rate": 2.7366013915122413e-06, + "loss": 0.816, + "num_tokens": 61500662597.0, + "step": 14714 + }, + { + "epoch": 1.748663101604278, + "grad_norm": 0.23224266031212124, + "learning_rate": 2.735915304415816e-06, + "loss": 0.8321, + "num_tokens": 61504849879.0, + "step": 14715 + }, + { + "epoch": 1.748781937017231, + "grad_norm": 0.22585349584806771, + "learning_rate": 2.735229523366387e-06, + "loss": 0.7535, + "num_tokens": 61509039204.0, + "step": 14716 + }, + { + "epoch": 1.7489007724301842, + "grad_norm": 0.22375849028176661, + "learning_rate": 2.734544048389351e-06, + "loss": 0.8335, + "num_tokens": 61513213715.0, + "step": 14717 + }, + { + "epoch": 1.7490196078431373, + "grad_norm": 0.2344458926084926, + "learning_rate": 2.7338588795100974e-06, + "loss": 0.7425, + "num_tokens": 61517403863.0, + "step": 14718 + }, + { + "epoch": 1.7491384432560904, + "grad_norm": 0.2191625503430132, + "learning_rate": 2.7331740167539967e-06, + "loss": 0.825, + "num_tokens": 61521592203.0, + "step": 14719 + }, + { + "epoch": 1.7492572786690435, + "grad_norm": 0.21880706045923282, + "learning_rate": 2.732489460146415e-06, + "loss": 0.8147, + "num_tokens": 61525781841.0, + "step": 14720 + }, + { + "epoch": 1.7493761140819966, + "grad_norm": 0.2469296695747731, + "learning_rate": 2.7318052097127007e-06, + "loss": 0.825, + "num_tokens": 61529944108.0, + "step": 14721 + }, + { + "epoch": 1.7494949494949497, + "grad_norm": 0.4128430069991896, + "learning_rate": 2.7311212654781945e-06, + "loss": 0.7978, + "num_tokens": 61534130296.0, + "step": 14722 + }, + { + "epoch": 1.7496137849079025, + "grad_norm": 0.2495392245774945, + "learning_rate": 2.7304376274682232e-06, + "loss": 0.7554, + "num_tokens": 61538320262.0, + "step": 14723 + }, + { + "epoch": 1.7497326203208556, + "grad_norm": 0.23440123973045673, + "learning_rate": 2.729754295708109e-06, + "loss": 0.8418, + "num_tokens": 61542509897.0, + "step": 14724 + }, + { + "epoch": 1.7498514557338085, + "grad_norm": 0.2432053745145204, + "learning_rate": 2.729071270223158e-06, + "loss": 0.848, + "num_tokens": 61546698857.0, + "step": 14725 + }, + { + "epoch": 1.7499702911467616, + "grad_norm": 0.22803293374324152, + "learning_rate": 2.72838855103866e-06, + "loss": 0.7681, + "num_tokens": 61550887365.0, + "step": 14726 + }, + { + "epoch": 1.7500891265597147, + "grad_norm": 0.2396174731025017, + "learning_rate": 2.7277061381799007e-06, + "loss": 0.8139, + "num_tokens": 61555046573.0, + "step": 14727 + }, + { + "epoch": 1.7502079619726678, + "grad_norm": 0.24997505585548338, + "learning_rate": 2.727024031672153e-06, + "loss": 0.7812, + "num_tokens": 61559236579.0, + "step": 14728 + }, + { + "epoch": 1.750326797385621, + "grad_norm": 0.23289240290817456, + "learning_rate": 2.726342231540677e-06, + "loss": 0.8079, + "num_tokens": 61563427617.0, + "step": 14729 + }, + { + "epoch": 1.750445632798574, + "grad_norm": 0.2347527264275072, + "learning_rate": 2.7256607378107213e-06, + "loss": 0.8008, + "num_tokens": 61567616587.0, + "step": 14730 + }, + { + "epoch": 1.7505644682115271, + "grad_norm": 0.21577487846997734, + "learning_rate": 2.7249795505075255e-06, + "loss": 0.793, + "num_tokens": 61571805511.0, + "step": 14731 + }, + { + "epoch": 1.7506833036244802, + "grad_norm": 0.23959904162287166, + "learning_rate": 2.724298669656315e-06, + "loss": 0.8057, + "num_tokens": 61575984626.0, + "step": 14732 + }, + { + "epoch": 1.7508021390374333, + "grad_norm": 0.23489998622365654, + "learning_rate": 2.7236180952823057e-06, + "loss": 0.801, + "num_tokens": 61580142474.0, + "step": 14733 + }, + { + "epoch": 1.7509209744503862, + "grad_norm": 0.22659249710842472, + "learning_rate": 2.722937827410701e-06, + "loss": 0.8133, + "num_tokens": 61584324663.0, + "step": 14734 + }, + { + "epoch": 1.7510398098633393, + "grad_norm": 0.2739709222980121, + "learning_rate": 2.722257866066695e-06, + "loss": 0.8104, + "num_tokens": 61588513016.0, + "step": 14735 + }, + { + "epoch": 1.7511586452762922, + "grad_norm": 0.22764985440651703, + "learning_rate": 2.7215782112754667e-06, + "loss": 0.8027, + "num_tokens": 61592677186.0, + "step": 14736 + }, + { + "epoch": 1.7512774806892453, + "grad_norm": 0.24095466945607408, + "learning_rate": 2.720898863062188e-06, + "loss": 0.848, + "num_tokens": 61596866310.0, + "step": 14737 + }, + { + "epoch": 1.7513963161021984, + "grad_norm": 0.2331485719736265, + "learning_rate": 2.7202198214520158e-06, + "loss": 0.8044, + "num_tokens": 61601055015.0, + "step": 14738 + }, + { + "epoch": 1.7515151515151515, + "grad_norm": 0.22295366897918673, + "learning_rate": 2.7195410864701e-06, + "loss": 0.8245, + "num_tokens": 61605226992.0, + "step": 14739 + }, + { + "epoch": 1.7516339869281046, + "grad_norm": 0.22560206768161534, + "learning_rate": 2.7188626581415733e-06, + "loss": 0.7833, + "num_tokens": 61609414361.0, + "step": 14740 + }, + { + "epoch": 1.7517528223410577, + "grad_norm": 0.22607272922186727, + "learning_rate": 2.7181845364915636e-06, + "loss": 0.7991, + "num_tokens": 61613602327.0, + "step": 14741 + }, + { + "epoch": 1.7518716577540108, + "grad_norm": 0.23566876385048238, + "learning_rate": 2.717506721545178e-06, + "loss": 0.8169, + "num_tokens": 61617759115.0, + "step": 14742 + }, + { + "epoch": 1.7519904931669639, + "grad_norm": 0.2180390780332876, + "learning_rate": 2.716829213327525e-06, + "loss": 0.8288, + "num_tokens": 61621943376.0, + "step": 14743 + }, + { + "epoch": 1.752109328579917, + "grad_norm": 0.23735464047584584, + "learning_rate": 2.7161520118636924e-06, + "loss": 0.7984, + "num_tokens": 61626134283.0, + "step": 14744 + }, + { + "epoch": 1.7522281639928698, + "grad_norm": 0.2273637087187739, + "learning_rate": 2.715475117178758e-06, + "loss": 0.7788, + "num_tokens": 61630323983.0, + "step": 14745 + }, + { + "epoch": 1.752346999405823, + "grad_norm": 0.23374618080580992, + "learning_rate": 2.714798529297793e-06, + "loss": 0.8485, + "num_tokens": 61634514490.0, + "step": 14746 + }, + { + "epoch": 1.752465834818776, + "grad_norm": 0.24250837912282003, + "learning_rate": 2.7141222482458503e-06, + "loss": 0.7972, + "num_tokens": 61638704064.0, + "step": 14747 + }, + { + "epoch": 1.752584670231729, + "grad_norm": 0.22312339528714514, + "learning_rate": 2.713446274047974e-06, + "loss": 0.8066, + "num_tokens": 61642894746.0, + "step": 14748 + }, + { + "epoch": 1.752703505644682, + "grad_norm": 0.24395808621515896, + "learning_rate": 2.7127706067292005e-06, + "loss": 0.8256, + "num_tokens": 61647083071.0, + "step": 14749 + }, + { + "epoch": 1.7528223410576351, + "grad_norm": 0.2257107916595033, + "learning_rate": 2.712095246314551e-06, + "loss": 0.84, + "num_tokens": 61651266233.0, + "step": 14750 + }, + { + "epoch": 1.7529411764705882, + "grad_norm": 0.23386520814993692, + "learning_rate": 2.7114201928290394e-06, + "loss": 0.7868, + "num_tokens": 61655407752.0, + "step": 14751 + }, + { + "epoch": 1.7530600118835413, + "grad_norm": 0.22158168071170625, + "learning_rate": 2.71074544629766e-06, + "loss": 0.7973, + "num_tokens": 61659595917.0, + "step": 14752 + }, + { + "epoch": 1.7531788472964944, + "grad_norm": 0.22275171182425765, + "learning_rate": 2.710071006745404e-06, + "loss": 0.78, + "num_tokens": 61663735535.0, + "step": 14753 + }, + { + "epoch": 1.7532976827094475, + "grad_norm": 0.22358881499459204, + "learning_rate": 2.709396874197247e-06, + "loss": 0.807, + "num_tokens": 61667912952.0, + "step": 14754 + }, + { + "epoch": 1.7534165181224006, + "grad_norm": 0.22901195634699398, + "learning_rate": 2.7087230486781542e-06, + "loss": 0.7815, + "num_tokens": 61672070784.0, + "step": 14755 + }, + { + "epoch": 1.7535353535353535, + "grad_norm": 0.23985498685840254, + "learning_rate": 2.708049530213082e-06, + "loss": 0.7897, + "num_tokens": 61676260411.0, + "step": 14756 + }, + { + "epoch": 1.7536541889483066, + "grad_norm": 0.21651333443738666, + "learning_rate": 2.707376318826971e-06, + "loss": 0.8049, + "num_tokens": 61680429545.0, + "step": 14757 + }, + { + "epoch": 1.7537730243612597, + "grad_norm": 0.2454235446006018, + "learning_rate": 2.706703414544753e-06, + "loss": 0.7992, + "num_tokens": 61684619303.0, + "step": 14758 + }, + { + "epoch": 1.7538918597742126, + "grad_norm": 0.23993168896032274, + "learning_rate": 2.706030817391347e-06, + "loss": 0.8542, + "num_tokens": 61688799974.0, + "step": 14759 + }, + { + "epoch": 1.7540106951871657, + "grad_norm": 0.2399143366383291, + "learning_rate": 2.7053585273916638e-06, + "loss": 0.7975, + "num_tokens": 61692979916.0, + "step": 14760 + }, + { + "epoch": 1.7541295306001188, + "grad_norm": 0.24938297380844934, + "learning_rate": 2.7046865445705983e-06, + "loss": 0.8008, + "num_tokens": 61697170223.0, + "step": 14761 + }, + { + "epoch": 1.7542483660130719, + "grad_norm": 0.250365691775523, + "learning_rate": 2.704014868953037e-06, + "loss": 0.7805, + "num_tokens": 61701359176.0, + "step": 14762 + }, + { + "epoch": 1.754367201426025, + "grad_norm": 0.25142151201068097, + "learning_rate": 2.703343500563855e-06, + "loss": 0.7898, + "num_tokens": 61705543238.0, + "step": 14763 + }, + { + "epoch": 1.754486036838978, + "grad_norm": 0.2256178983383606, + "learning_rate": 2.7026724394279143e-06, + "loss": 0.7869, + "num_tokens": 61709708055.0, + "step": 14764 + }, + { + "epoch": 1.7546048722519312, + "grad_norm": 0.24207985958603995, + "learning_rate": 2.7020016855700678e-06, + "loss": 0.799, + "num_tokens": 61713877314.0, + "step": 14765 + }, + { + "epoch": 1.7547237076648843, + "grad_norm": 0.22243296502939003, + "learning_rate": 2.701331239015154e-06, + "loss": 0.7987, + "num_tokens": 61718066610.0, + "step": 14766 + }, + { + "epoch": 1.7548425430778372, + "grad_norm": 0.23338797643245204, + "learning_rate": 2.700661099788005e-06, + "loss": 0.8407, + "num_tokens": 61722255818.0, + "step": 14767 + }, + { + "epoch": 1.7549613784907903, + "grad_norm": 0.2292994366756522, + "learning_rate": 2.6999912679134333e-06, + "loss": 0.7926, + "num_tokens": 61726444887.0, + "step": 14768 + }, + { + "epoch": 1.7550802139037434, + "grad_norm": 0.23393659868837638, + "learning_rate": 2.69932174341625e-06, + "loss": 0.8181, + "num_tokens": 61730633779.0, + "step": 14769 + }, + { + "epoch": 1.7551990493166962, + "grad_norm": 0.22670905054557006, + "learning_rate": 2.6986525263212458e-06, + "loss": 0.7773, + "num_tokens": 61734768275.0, + "step": 14770 + }, + { + "epoch": 1.7553178847296493, + "grad_norm": 0.21521060695118727, + "learning_rate": 2.6979836166532086e-06, + "loss": 0.8169, + "num_tokens": 61738957569.0, + "step": 14771 + }, + { + "epoch": 1.7554367201426024, + "grad_norm": 0.2319798003052266, + "learning_rate": 2.697315014436908e-06, + "loss": 0.8009, + "num_tokens": 61743147822.0, + "step": 14772 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 0.21629240893261478, + "learning_rate": 2.6966467196971025e-06, + "loss": 0.8048, + "num_tokens": 61747335666.0, + "step": 14773 + }, + { + "epoch": 1.7556743909685086, + "grad_norm": 0.2411489398436016, + "learning_rate": 2.695978732458543e-06, + "loss": 0.8101, + "num_tokens": 61751524260.0, + "step": 14774 + }, + { + "epoch": 1.7557932263814617, + "grad_norm": 0.23196694080539323, + "learning_rate": 2.6953110527459685e-06, + "loss": 0.8022, + "num_tokens": 61755713147.0, + "step": 14775 + }, + { + "epoch": 1.7559120617944148, + "grad_norm": 0.2035223329158268, + "learning_rate": 2.6946436805841042e-06, + "loss": 0.8042, + "num_tokens": 61759902719.0, + "step": 14776 + }, + { + "epoch": 1.756030897207368, + "grad_norm": 0.22718238843929178, + "learning_rate": 2.6939766159976684e-06, + "loss": 0.8227, + "num_tokens": 61764091913.0, + "step": 14777 + }, + { + "epoch": 1.7561497326203208, + "grad_norm": 0.22397882593796403, + "learning_rate": 2.6933098590113583e-06, + "loss": 0.7872, + "num_tokens": 61768250410.0, + "step": 14778 + }, + { + "epoch": 1.756268568033274, + "grad_norm": 0.2375462284480197, + "learning_rate": 2.692643409649871e-06, + "loss": 0.786, + "num_tokens": 61772439400.0, + "step": 14779 + }, + { + "epoch": 1.756387403446227, + "grad_norm": 0.2229157336021562, + "learning_rate": 2.6919772679378857e-06, + "loss": 0.8395, + "num_tokens": 61776628585.0, + "step": 14780 + }, + { + "epoch": 1.75650623885918, + "grad_norm": 0.2318711729508031, + "learning_rate": 2.6913114339000706e-06, + "loss": 0.8215, + "num_tokens": 61780816110.0, + "step": 14781 + }, + { + "epoch": 1.756625074272133, + "grad_norm": 0.2249946883372656, + "learning_rate": 2.690645907561089e-06, + "loss": 0.7988, + "num_tokens": 61785005422.0, + "step": 14782 + }, + { + "epoch": 1.756743909685086, + "grad_norm": 0.2541977052874707, + "learning_rate": 2.6899806889455826e-06, + "loss": 0.8259, + "num_tokens": 61789163965.0, + "step": 14783 + }, + { + "epoch": 1.7568627450980392, + "grad_norm": 0.23456895442082726, + "learning_rate": 2.6893157780781876e-06, + "loss": 0.7931, + "num_tokens": 61793354074.0, + "step": 14784 + }, + { + "epoch": 1.7569815805109923, + "grad_norm": 0.21849405153860413, + "learning_rate": 2.688651174983529e-06, + "loss": 0.7749, + "num_tokens": 61797535023.0, + "step": 14785 + }, + { + "epoch": 1.7571004159239454, + "grad_norm": 0.23364226936893406, + "learning_rate": 2.687986879686218e-06, + "loss": 0.7973, + "num_tokens": 61801707996.0, + "step": 14786 + }, + { + "epoch": 1.7572192513368985, + "grad_norm": 0.21845672188533158, + "learning_rate": 2.6873228922108563e-06, + "loss": 0.8564, + "num_tokens": 61805872117.0, + "step": 14787 + }, + { + "epoch": 1.7573380867498516, + "grad_norm": 0.22686471561232135, + "learning_rate": 2.6866592125820343e-06, + "loss": 0.8299, + "num_tokens": 61810047039.0, + "step": 14788 + }, + { + "epoch": 1.7574569221628045, + "grad_norm": 0.21509585161598727, + "learning_rate": 2.685995840824329e-06, + "loss": 0.8276, + "num_tokens": 61814236921.0, + "step": 14789 + }, + { + "epoch": 1.7575757575757576, + "grad_norm": 0.22208989424600903, + "learning_rate": 2.685332776962309e-06, + "loss": 0.8465, + "num_tokens": 61818403470.0, + "step": 14790 + }, + { + "epoch": 1.7576945929887107, + "grad_norm": 0.23015698840824175, + "learning_rate": 2.684670021020529e-06, + "loss": 0.8435, + "num_tokens": 61822589862.0, + "step": 14791 + }, + { + "epoch": 1.7578134284016635, + "grad_norm": 0.22357240875453946, + "learning_rate": 2.684007573023532e-06, + "loss": 0.8223, + "num_tokens": 61826778442.0, + "step": 14792 + }, + { + "epoch": 1.7579322638146166, + "grad_norm": 0.229282259318835, + "learning_rate": 2.683345432995852e-06, + "loss": 0.8451, + "num_tokens": 61830967054.0, + "step": 14793 + }, + { + "epoch": 1.7580510992275697, + "grad_norm": 0.22320788869111258, + "learning_rate": 2.68268360096201e-06, + "loss": 0.7967, + "num_tokens": 61835139795.0, + "step": 14794 + }, + { + "epoch": 1.7581699346405228, + "grad_norm": 0.239502813089701, + "learning_rate": 2.6820220769465156e-06, + "loss": 0.8093, + "num_tokens": 61839328328.0, + "step": 14795 + }, + { + "epoch": 1.758288770053476, + "grad_norm": 0.24433381836304452, + "learning_rate": 2.681360860973868e-06, + "loss": 0.8242, + "num_tokens": 61843466823.0, + "step": 14796 + }, + { + "epoch": 1.758407605466429, + "grad_norm": 0.2350485546800112, + "learning_rate": 2.680699953068553e-06, + "loss": 0.792, + "num_tokens": 61847655455.0, + "step": 14797 + }, + { + "epoch": 1.7585264408793821, + "grad_norm": 0.2299357505974849, + "learning_rate": 2.6800393532550485e-06, + "loss": 0.8282, + "num_tokens": 61851844573.0, + "step": 14798 + }, + { + "epoch": 1.7586452762923352, + "grad_norm": 0.24506064398520158, + "learning_rate": 2.679379061557816e-06, + "loss": 0.8127, + "num_tokens": 61856033917.0, + "step": 14799 + }, + { + "epoch": 1.7587641117052881, + "grad_norm": 0.2269469591890282, + "learning_rate": 2.6787190780013076e-06, + "loss": 0.8291, + "num_tokens": 61860174431.0, + "step": 14800 + }, + { + "epoch": 1.7588829471182412, + "grad_norm": 0.23886211158180298, + "learning_rate": 2.678059402609969e-06, + "loss": 0.8165, + "num_tokens": 61864364017.0, + "step": 14801 + }, + { + "epoch": 1.7590017825311943, + "grad_norm": 0.23474407089729177, + "learning_rate": 2.677400035408227e-06, + "loss": 0.8237, + "num_tokens": 61868548231.0, + "step": 14802 + }, + { + "epoch": 1.7591206179441472, + "grad_norm": 0.2264498903366494, + "learning_rate": 2.676740976420503e-06, + "loss": 0.7756, + "num_tokens": 61872738630.0, + "step": 14803 + }, + { + "epoch": 1.7592394533571003, + "grad_norm": 0.22930716199791407, + "learning_rate": 2.6760822256712006e-06, + "loss": 0.7904, + "num_tokens": 61876928932.0, + "step": 14804 + }, + { + "epoch": 1.7593582887700534, + "grad_norm": 0.2253445944336143, + "learning_rate": 2.6754237831847174e-06, + "loss": 0.826, + "num_tokens": 61881117675.0, + "step": 14805 + }, + { + "epoch": 1.7594771241830065, + "grad_norm": 0.2177444719992201, + "learning_rate": 2.6747656489854344e-06, + "loss": 0.812, + "num_tokens": 61885256421.0, + "step": 14806 + }, + { + "epoch": 1.7595959595959596, + "grad_norm": 0.2308136978232853, + "learning_rate": 2.674107823097732e-06, + "loss": 0.8272, + "num_tokens": 61889444735.0, + "step": 14807 + }, + { + "epoch": 1.7597147950089127, + "grad_norm": 0.23575102573361767, + "learning_rate": 2.673450305545967e-06, + "loss": 0.8017, + "num_tokens": 61893608010.0, + "step": 14808 + }, + { + "epoch": 1.7598336304218658, + "grad_norm": 0.21828410155697062, + "learning_rate": 2.6727930963544892e-06, + "loss": 0.7528, + "num_tokens": 61897797397.0, + "step": 14809 + }, + { + "epoch": 1.759952465834819, + "grad_norm": 0.23275262731061358, + "learning_rate": 2.672136195547638e-06, + "loss": 0.8289, + "num_tokens": 61901942966.0, + "step": 14810 + }, + { + "epoch": 1.760071301247772, + "grad_norm": 0.2291836497284965, + "learning_rate": 2.67147960314974e-06, + "loss": 0.8152, + "num_tokens": 61906130762.0, + "step": 14811 + }, + { + "epoch": 1.7601901366607249, + "grad_norm": 0.21761858722371266, + "learning_rate": 2.6708233191851113e-06, + "loss": 0.7839, + "num_tokens": 61910319303.0, + "step": 14812 + }, + { + "epoch": 1.760308972073678, + "grad_norm": 0.23161922902536325, + "learning_rate": 2.670167343678059e-06, + "loss": 0.82, + "num_tokens": 61914507461.0, + "step": 14813 + }, + { + "epoch": 1.7604278074866309, + "grad_norm": 0.21933555687759565, + "learning_rate": 2.6695116766528727e-06, + "loss": 0.8097, + "num_tokens": 61918698123.0, + "step": 14814 + }, + { + "epoch": 1.760546642899584, + "grad_norm": 0.24847502792961834, + "learning_rate": 2.6688563181338343e-06, + "loss": 0.7772, + "num_tokens": 61922887490.0, + "step": 14815 + }, + { + "epoch": 1.760665478312537, + "grad_norm": 0.22666597704670177, + "learning_rate": 2.668201268145215e-06, + "loss": 0.8211, + "num_tokens": 61927066862.0, + "step": 14816 + }, + { + "epoch": 1.7607843137254902, + "grad_norm": 0.22810598232630716, + "learning_rate": 2.6675465267112733e-06, + "loss": 0.8008, + "num_tokens": 61931256466.0, + "step": 14817 + }, + { + "epoch": 1.7609031491384433, + "grad_norm": 0.2437682046820368, + "learning_rate": 2.666892093856257e-06, + "loss": 0.832, + "num_tokens": 61935408484.0, + "step": 14818 + }, + { + "epoch": 1.7610219845513964, + "grad_norm": 0.2198782199319017, + "learning_rate": 2.666237969604401e-06, + "loss": 0.805, + "num_tokens": 61939596685.0, + "step": 14819 + }, + { + "epoch": 1.7611408199643495, + "grad_norm": 0.23643215955166022, + "learning_rate": 2.6655841539799294e-06, + "loss": 0.8302, + "num_tokens": 61943785471.0, + "step": 14820 + }, + { + "epoch": 1.7612596553773026, + "grad_norm": 0.22001670380440644, + "learning_rate": 2.664930647007057e-06, + "loss": 0.8131, + "num_tokens": 61947973443.0, + "step": 14821 + }, + { + "epoch": 1.7613784907902557, + "grad_norm": 0.22064663759497802, + "learning_rate": 2.664277448709983e-06, + "loss": 0.7957, + "num_tokens": 61952162970.0, + "step": 14822 + }, + { + "epoch": 1.7614973262032085, + "grad_norm": 0.23317172370686634, + "learning_rate": 2.6636245591128994e-06, + "loss": 0.7639, + "num_tokens": 61956322104.0, + "step": 14823 + }, + { + "epoch": 1.7616161616161616, + "grad_norm": 0.21537431012212294, + "learning_rate": 2.6629719782399847e-06, + "loss": 0.7879, + "num_tokens": 61960511578.0, + "step": 14824 + }, + { + "epoch": 1.7617349970291145, + "grad_norm": 0.2378637973453266, + "learning_rate": 2.662319706115406e-06, + "loss": 0.7561, + "num_tokens": 61964699902.0, + "step": 14825 + }, + { + "epoch": 1.7618538324420676, + "grad_norm": 0.23266741589274748, + "learning_rate": 2.6616677427633194e-06, + "loss": 0.7854, + "num_tokens": 61968877055.0, + "step": 14826 + }, + { + "epoch": 1.7619726678550207, + "grad_norm": 0.22702064758821863, + "learning_rate": 2.6610160882078673e-06, + "loss": 0.8227, + "num_tokens": 61973046593.0, + "step": 14827 + }, + { + "epoch": 1.7620915032679738, + "grad_norm": 0.2463510666999977, + "learning_rate": 2.660364742473185e-06, + "loss": 0.8361, + "num_tokens": 61977235664.0, + "step": 14828 + }, + { + "epoch": 1.762210338680927, + "grad_norm": 0.22938378051838534, + "learning_rate": 2.659713705583393e-06, + "loss": 0.8255, + "num_tokens": 61981424116.0, + "step": 14829 + }, + { + "epoch": 1.76232917409388, + "grad_norm": 0.22998270164971207, + "learning_rate": 2.659062977562603e-06, + "loss": 0.8059, + "num_tokens": 61985613007.0, + "step": 14830 + }, + { + "epoch": 1.7624480095068331, + "grad_norm": 0.23696965157383998, + "learning_rate": 2.65841255843491e-06, + "loss": 0.8223, + "num_tokens": 61989788273.0, + "step": 14831 + }, + { + "epoch": 1.7625668449197862, + "grad_norm": 0.2406386324029855, + "learning_rate": 2.6577624482244052e-06, + "loss": 0.8234, + "num_tokens": 61993949138.0, + "step": 14832 + }, + { + "epoch": 1.7626856803327393, + "grad_norm": 0.23921221003203985, + "learning_rate": 2.6571126469551616e-06, + "loss": 0.8201, + "num_tokens": 61998138422.0, + "step": 14833 + }, + { + "epoch": 1.7628045157456922, + "grad_norm": 0.21885486491681894, + "learning_rate": 2.6564631546512454e-06, + "loss": 0.7963, + "num_tokens": 62002269979.0, + "step": 14834 + }, + { + "epoch": 1.7629233511586453, + "grad_norm": 0.22850511171637455, + "learning_rate": 2.65581397133671e-06, + "loss": 0.7809, + "num_tokens": 62006459001.0, + "step": 14835 + }, + { + "epoch": 1.7630421865715984, + "grad_norm": 0.230290523025239, + "learning_rate": 2.6551650970355948e-06, + "loss": 0.7746, + "num_tokens": 62010647735.0, + "step": 14836 + }, + { + "epoch": 1.7631610219845513, + "grad_norm": 0.22869306309826748, + "learning_rate": 2.65451653177193e-06, + "loss": 0.8151, + "num_tokens": 62014808020.0, + "step": 14837 + }, + { + "epoch": 1.7632798573975044, + "grad_norm": 0.2291468388178375, + "learning_rate": 2.6538682755697333e-06, + "loss": 0.7873, + "num_tokens": 62018997938.0, + "step": 14838 + }, + { + "epoch": 1.7633986928104575, + "grad_norm": 0.23081349959703096, + "learning_rate": 2.653220328453014e-06, + "loss": 0.8223, + "num_tokens": 62023153785.0, + "step": 14839 + }, + { + "epoch": 1.7635175282234106, + "grad_norm": 0.2132827896033625, + "learning_rate": 2.6525726904457698e-06, + "loss": 0.822, + "num_tokens": 62027342735.0, + "step": 14840 + }, + { + "epoch": 1.7636363636363637, + "grad_norm": 0.22979525706721235, + "learning_rate": 2.6519253615719797e-06, + "loss": 0.7887, + "num_tokens": 62031532195.0, + "step": 14841 + }, + { + "epoch": 1.7637551990493168, + "grad_norm": 0.22219310679414297, + "learning_rate": 2.6512783418556198e-06, + "loss": 0.8012, + "num_tokens": 62035720682.0, + "step": 14842 + }, + { + "epoch": 1.7638740344622699, + "grad_norm": 0.24049248961022976, + "learning_rate": 2.650631631320651e-06, + "loss": 0.8074, + "num_tokens": 62039894732.0, + "step": 14843 + }, + { + "epoch": 1.763992869875223, + "grad_norm": 0.23448811991049548, + "learning_rate": 2.6499852299910202e-06, + "loss": 0.7812, + "num_tokens": 62044081935.0, + "step": 14844 + }, + { + "epoch": 1.7641117052881758, + "grad_norm": 0.2451706388209013, + "learning_rate": 2.649339137890672e-06, + "loss": 0.8313, + "num_tokens": 62048257502.0, + "step": 14845 + }, + { + "epoch": 1.764230540701129, + "grad_norm": 0.2332072786427128, + "learning_rate": 2.6486933550435285e-06, + "loss": 0.8435, + "num_tokens": 62052447957.0, + "step": 14846 + }, + { + "epoch": 1.764349376114082, + "grad_norm": 0.23739239994271982, + "learning_rate": 2.648047881473506e-06, + "loss": 0.7931, + "num_tokens": 62056637224.0, + "step": 14847 + }, + { + "epoch": 1.764468211527035, + "grad_norm": 0.23070779614055933, + "learning_rate": 2.6474027172045088e-06, + "loss": 0.7734, + "num_tokens": 62060826964.0, + "step": 14848 + }, + { + "epoch": 1.764587046939988, + "grad_norm": 0.23734498651120153, + "learning_rate": 2.6467578622604305e-06, + "loss": 0.7583, + "num_tokens": 62065015883.0, + "step": 14849 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.24807737924061868, + "learning_rate": 2.646113316665151e-06, + "loss": 0.8072, + "num_tokens": 62069205563.0, + "step": 14850 + }, + { + "epoch": 1.7648247177658942, + "grad_norm": 0.22425126816950838, + "learning_rate": 2.6454690804425404e-06, + "loss": 0.802, + "num_tokens": 62073394677.0, + "step": 14851 + }, + { + "epoch": 1.7649435531788473, + "grad_norm": 0.2465781290666025, + "learning_rate": 2.6448251536164576e-06, + "loss": 0.8064, + "num_tokens": 62077583023.0, + "step": 14852 + }, + { + "epoch": 1.7650623885918004, + "grad_norm": 0.2439324844419655, + "learning_rate": 2.6441815362107485e-06, + "loss": 0.7652, + "num_tokens": 62081772129.0, + "step": 14853 + }, + { + "epoch": 1.7651812240047535, + "grad_norm": 0.24307128988128407, + "learning_rate": 2.6435382282492496e-06, + "loss": 0.7711, + "num_tokens": 62085935235.0, + "step": 14854 + }, + { + "epoch": 1.7653000594177066, + "grad_norm": 0.250213766820544, + "learning_rate": 2.642895229755783e-06, + "loss": 0.7888, + "num_tokens": 62090123030.0, + "step": 14855 + }, + { + "epoch": 1.7654188948306595, + "grad_norm": 0.24399506507003801, + "learning_rate": 2.6422525407541646e-06, + "loss": 0.7917, + "num_tokens": 62094310914.0, + "step": 14856 + }, + { + "epoch": 1.7655377302436126, + "grad_norm": 0.23335338034671607, + "learning_rate": 2.641610161268189e-06, + "loss": 0.8058, + "num_tokens": 62098500307.0, + "step": 14857 + }, + { + "epoch": 1.7656565656565657, + "grad_norm": 0.2421321673909903, + "learning_rate": 2.6409680913216508e-06, + "loss": 0.8181, + "num_tokens": 62102651643.0, + "step": 14858 + }, + { + "epoch": 1.7657754010695186, + "grad_norm": 0.25001791167968307, + "learning_rate": 2.640326330938327e-06, + "loss": 0.8095, + "num_tokens": 62106841502.0, + "step": 14859 + }, + { + "epoch": 1.7658942364824717, + "grad_norm": 0.24674976812037125, + "learning_rate": 2.639684880141984e-06, + "loss": 0.8097, + "num_tokens": 62111027932.0, + "step": 14860 + }, + { + "epoch": 1.7660130718954248, + "grad_norm": 0.22664671057781305, + "learning_rate": 2.639043738956379e-06, + "loss": 0.7843, + "num_tokens": 62115218079.0, + "step": 14861 + }, + { + "epoch": 1.7661319073083779, + "grad_norm": 0.22688595649017115, + "learning_rate": 2.6384029074052507e-06, + "loss": 0.8604, + "num_tokens": 62119408190.0, + "step": 14862 + }, + { + "epoch": 1.766250742721331, + "grad_norm": 0.24457278281635342, + "learning_rate": 2.6377623855123336e-06, + "loss": 0.8012, + "num_tokens": 62123565177.0, + "step": 14863 + }, + { + "epoch": 1.766369578134284, + "grad_norm": 0.2234260845894422, + "learning_rate": 2.6371221733013497e-06, + "loss": 0.7856, + "num_tokens": 62127749813.0, + "step": 14864 + }, + { + "epoch": 1.7664884135472372, + "grad_norm": 0.22214139288180362, + "learning_rate": 2.6364822707960068e-06, + "loss": 0.8249, + "num_tokens": 62131939395.0, + "step": 14865 + }, + { + "epoch": 1.7666072489601903, + "grad_norm": 0.2269139452757993, + "learning_rate": 2.635842678020005e-06, + "loss": 0.8005, + "num_tokens": 62136130415.0, + "step": 14866 + }, + { + "epoch": 1.7667260843731432, + "grad_norm": 0.21288028160671538, + "learning_rate": 2.6352033949970273e-06, + "loss": 0.7718, + "num_tokens": 62140320498.0, + "step": 14867 + }, + { + "epoch": 1.7668449197860963, + "grad_norm": 0.247280483904174, + "learning_rate": 2.634564421750749e-06, + "loss": 0.8246, + "num_tokens": 62144509952.0, + "step": 14868 + }, + { + "epoch": 1.7669637551990494, + "grad_norm": 0.23890093149041455, + "learning_rate": 2.6339257583048365e-06, + "loss": 0.8124, + "num_tokens": 62148663884.0, + "step": 14869 + }, + { + "epoch": 1.7670825906120022, + "grad_norm": 0.2305289629672864, + "learning_rate": 2.633287404682936e-06, + "loss": 0.7974, + "num_tokens": 62152853181.0, + "step": 14870 + }, + { + "epoch": 1.7672014260249553, + "grad_norm": 0.24969566494291845, + "learning_rate": 2.6326493609086956e-06, + "loss": 0.7817, + "num_tokens": 62157041746.0, + "step": 14871 + }, + { + "epoch": 1.7673202614379084, + "grad_norm": 0.21994428444468428, + "learning_rate": 2.632011627005738e-06, + "loss": 0.8087, + "num_tokens": 62161231066.0, + "step": 14872 + }, + { + "epoch": 1.7674390968508615, + "grad_norm": 0.23249726486573888, + "learning_rate": 2.6313742029976842e-06, + "loss": 0.8187, + "num_tokens": 62165420497.0, + "step": 14873 + }, + { + "epoch": 1.7675579322638146, + "grad_norm": 0.24397042492246823, + "learning_rate": 2.630737088908136e-06, + "loss": 0.7809, + "num_tokens": 62169579529.0, + "step": 14874 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 0.22830933778666998, + "learning_rate": 2.6301002847606925e-06, + "loss": 0.8086, + "num_tokens": 62173768037.0, + "step": 14875 + }, + { + "epoch": 1.7677956030897208, + "grad_norm": 0.24715597290996505, + "learning_rate": 2.6294637905789344e-06, + "loss": 0.7931, + "num_tokens": 62177955513.0, + "step": 14876 + }, + { + "epoch": 1.767914438502674, + "grad_norm": 0.23341749271052595, + "learning_rate": 2.6288276063864326e-06, + "loss": 0.7764, + "num_tokens": 62182144750.0, + "step": 14877 + }, + { + "epoch": 1.7680332739156268, + "grad_norm": 0.21215996613379623, + "learning_rate": 2.628191732206749e-06, + "loss": 0.8061, + "num_tokens": 62186334723.0, + "step": 14878 + }, + { + "epoch": 1.76815210932858, + "grad_norm": 1.0903690889648447, + "learning_rate": 2.627556168063432e-06, + "loss": 0.8288, + "num_tokens": 62190523989.0, + "step": 14879 + }, + { + "epoch": 1.768270944741533, + "grad_norm": 0.2755342969573723, + "learning_rate": 2.626920913980017e-06, + "loss": 0.8386, + "num_tokens": 62194712764.0, + "step": 14880 + }, + { + "epoch": 1.768389780154486, + "grad_norm": 0.23432028466753457, + "learning_rate": 2.6262859699800307e-06, + "loss": 0.7878, + "num_tokens": 62198902223.0, + "step": 14881 + }, + { + "epoch": 1.768508615567439, + "grad_norm": 0.23705008576904282, + "learning_rate": 2.625651336086987e-06, + "loss": 0.8264, + "num_tokens": 62203047197.0, + "step": 14882 + }, + { + "epoch": 1.768627450980392, + "grad_norm": 0.27341714987133664, + "learning_rate": 2.6250170123243896e-06, + "loss": 0.8173, + "num_tokens": 62207236058.0, + "step": 14883 + }, + { + "epoch": 1.7687462863933452, + "grad_norm": 0.22455032231777733, + "learning_rate": 2.6243829987157278e-06, + "loss": 0.7941, + "num_tokens": 62211393768.0, + "step": 14884 + }, + { + "epoch": 1.7688651218062983, + "grad_norm": 0.23086007601978661, + "learning_rate": 2.6237492952844817e-06, + "loss": 0.7726, + "num_tokens": 62215583394.0, + "step": 14885 + }, + { + "epoch": 1.7689839572192514, + "grad_norm": 0.22782977048517328, + "learning_rate": 2.6231159020541214e-06, + "loss": 0.7939, + "num_tokens": 62219772554.0, + "step": 14886 + }, + { + "epoch": 1.7691027926322045, + "grad_norm": 0.21586715009354307, + "learning_rate": 2.622482819048103e-06, + "loss": 0.8234, + "num_tokens": 62223938767.0, + "step": 14887 + }, + { + "epoch": 1.7692216280451576, + "grad_norm": 0.23629817913090512, + "learning_rate": 2.621850046289869e-06, + "loss": 0.7863, + "num_tokens": 62228118104.0, + "step": 14888 + }, + { + "epoch": 1.7693404634581105, + "grad_norm": 0.24430571069346313, + "learning_rate": 2.621217583802854e-06, + "loss": 0.8086, + "num_tokens": 62232280625.0, + "step": 14889 + }, + { + "epoch": 1.7694592988710636, + "grad_norm": 0.22025139027197543, + "learning_rate": 2.620585431610482e-06, + "loss": 0.7895, + "num_tokens": 62236470165.0, + "step": 14890 + }, + { + "epoch": 1.7695781342840167, + "grad_norm": 0.23284062152203447, + "learning_rate": 2.6199535897361644e-06, + "loss": 0.8257, + "num_tokens": 62240660104.0, + "step": 14891 + }, + { + "epoch": 1.7696969696969695, + "grad_norm": 0.22310934516111902, + "learning_rate": 2.6193220582032987e-06, + "loss": 0.7809, + "num_tokens": 62244850375.0, + "step": 14892 + }, + { + "epoch": 1.7698158051099226, + "grad_norm": 0.22840906380716633, + "learning_rate": 2.6186908370352727e-06, + "loss": 0.792, + "num_tokens": 62249039768.0, + "step": 14893 + }, + { + "epoch": 1.7699346405228757, + "grad_norm": 0.24211260338450696, + "learning_rate": 2.6180599262554623e-06, + "loss": 0.8288, + "num_tokens": 62253228118.0, + "step": 14894 + }, + { + "epoch": 1.7700534759358288, + "grad_norm": 0.22946564495016092, + "learning_rate": 2.617429325887232e-06, + "loss": 0.7959, + "num_tokens": 62257418366.0, + "step": 14895 + }, + { + "epoch": 1.770172311348782, + "grad_norm": 0.23298963409358875, + "learning_rate": 2.616799035953936e-06, + "loss": 0.8342, + "num_tokens": 62261607846.0, + "step": 14896 + }, + { + "epoch": 1.770291146761735, + "grad_norm": 0.2287447290445156, + "learning_rate": 2.616169056478919e-06, + "loss": 0.7883, + "num_tokens": 62265794868.0, + "step": 14897 + }, + { + "epoch": 1.7704099821746881, + "grad_norm": 0.2295882050965521, + "learning_rate": 2.615539387485506e-06, + "loss": 0.8049, + "num_tokens": 62269956620.0, + "step": 14898 + }, + { + "epoch": 1.7705288175876412, + "grad_norm": 0.2355216104177638, + "learning_rate": 2.6149100289970176e-06, + "loss": 0.8073, + "num_tokens": 62274145132.0, + "step": 14899 + }, + { + "epoch": 1.7706476530005941, + "grad_norm": 0.2156025309253309, + "learning_rate": 2.6142809810367625e-06, + "loss": 0.82, + "num_tokens": 62278332912.0, + "step": 14900 + }, + { + "epoch": 1.7707664884135472, + "grad_norm": 0.23411908823918817, + "learning_rate": 2.613652243628033e-06, + "loss": 0.8039, + "num_tokens": 62282522062.0, + "step": 14901 + }, + { + "epoch": 1.7708853238265003, + "grad_norm": 0.22442629973307904, + "learning_rate": 2.613023816794119e-06, + "loss": 0.8212, + "num_tokens": 62286674726.0, + "step": 14902 + }, + { + "epoch": 1.7710041592394532, + "grad_norm": 0.2195373625751731, + "learning_rate": 2.6123957005582903e-06, + "loss": 0.7866, + "num_tokens": 62290864591.0, + "step": 14903 + }, + { + "epoch": 1.7711229946524063, + "grad_norm": 0.23496276239544145, + "learning_rate": 2.611767894943806e-06, + "loss": 0.8147, + "num_tokens": 62295030179.0, + "step": 14904 + }, + { + "epoch": 1.7712418300653594, + "grad_norm": 0.22540430647999252, + "learning_rate": 2.611140399973918e-06, + "loss": 0.7941, + "num_tokens": 62299167830.0, + "step": 14905 + }, + { + "epoch": 1.7713606654783125, + "grad_norm": 0.2311639370568136, + "learning_rate": 2.610513215671865e-06, + "loss": 0.7831, + "num_tokens": 62303340393.0, + "step": 14906 + }, + { + "epoch": 1.7714795008912656, + "grad_norm": 0.23217595538270402, + "learning_rate": 2.6098863420608733e-06, + "loss": 0.7569, + "num_tokens": 62307529856.0, + "step": 14907 + }, + { + "epoch": 1.7715983363042187, + "grad_norm": 0.2190830070416752, + "learning_rate": 2.609259779164157e-06, + "loss": 0.7695, + "num_tokens": 62311688645.0, + "step": 14908 + }, + { + "epoch": 1.7717171717171718, + "grad_norm": 0.2119236806224626, + "learning_rate": 2.6086335270049218e-06, + "loss": 0.8, + "num_tokens": 62315865927.0, + "step": 14909 + }, + { + "epoch": 1.771836007130125, + "grad_norm": 0.2208072249651102, + "learning_rate": 2.6080075856063584e-06, + "loss": 0.8145, + "num_tokens": 62320053477.0, + "step": 14910 + }, + { + "epoch": 1.771954842543078, + "grad_norm": 0.2256394809144923, + "learning_rate": 2.607381954991649e-06, + "loss": 0.7737, + "num_tokens": 62324206629.0, + "step": 14911 + }, + { + "epoch": 1.7720736779560309, + "grad_norm": 0.22154925561760597, + "learning_rate": 2.6067566351839617e-06, + "loss": 0.7993, + "num_tokens": 62328395888.0, + "step": 14912 + }, + { + "epoch": 1.772192513368984, + "grad_norm": 0.21341161231630573, + "learning_rate": 2.606131626206455e-06, + "loss": 0.8039, + "num_tokens": 62332586249.0, + "step": 14913 + }, + { + "epoch": 1.7723113487819369, + "grad_norm": 0.23869788952587953, + "learning_rate": 2.605506928082272e-06, + "loss": 0.7987, + "num_tokens": 62336753115.0, + "step": 14914 + }, + { + "epoch": 1.77243018419489, + "grad_norm": 0.22135836743922155, + "learning_rate": 2.604882540834552e-06, + "loss": 0.8016, + "num_tokens": 62340932902.0, + "step": 14915 + }, + { + "epoch": 1.772549019607843, + "grad_norm": 0.2335071392533725, + "learning_rate": 2.604258464486415e-06, + "loss": 0.8049, + "num_tokens": 62345098843.0, + "step": 14916 + }, + { + "epoch": 1.7726678550207962, + "grad_norm": 0.22970617262209603, + "learning_rate": 2.6036346990609746e-06, + "loss": 0.8011, + "num_tokens": 62349289668.0, + "step": 14917 + }, + { + "epoch": 1.7727866904337493, + "grad_norm": 0.2205575437675891, + "learning_rate": 2.603011244581331e-06, + "loss": 0.7971, + "num_tokens": 62353465720.0, + "step": 14918 + }, + { + "epoch": 1.7729055258467024, + "grad_norm": 0.23441258360812883, + "learning_rate": 2.6023881010705708e-06, + "loss": 0.8033, + "num_tokens": 62357655187.0, + "step": 14919 + }, + { + "epoch": 1.7730243612596555, + "grad_norm": 0.2347420788158476, + "learning_rate": 2.601765268551771e-06, + "loss": 0.782, + "num_tokens": 62361844394.0, + "step": 14920 + }, + { + "epoch": 1.7731431966726086, + "grad_norm": 0.21999348738320515, + "learning_rate": 2.601142747047999e-06, + "loss": 0.7732, + "num_tokens": 62366033800.0, + "step": 14921 + }, + { + "epoch": 1.7732620320855617, + "grad_norm": 0.2534040928358838, + "learning_rate": 2.6005205365823096e-06, + "loss": 0.7747, + "num_tokens": 62370212611.0, + "step": 14922 + }, + { + "epoch": 1.7733808674985145, + "grad_norm": 0.23651551891867273, + "learning_rate": 2.5998986371777434e-06, + "loss": 0.8345, + "num_tokens": 62374403530.0, + "step": 14923 + }, + { + "epoch": 1.7734997029114676, + "grad_norm": 0.23074936700445173, + "learning_rate": 2.599277048857332e-06, + "loss": 0.7909, + "num_tokens": 62378592421.0, + "step": 14924 + }, + { + "epoch": 1.7736185383244205, + "grad_norm": 0.2428228636401742, + "learning_rate": 2.5986557716440953e-06, + "loss": 0.7881, + "num_tokens": 62382782513.0, + "step": 14925 + }, + { + "epoch": 1.7737373737373736, + "grad_norm": 0.236283631177813, + "learning_rate": 2.5980348055610406e-06, + "loss": 0.8517, + "num_tokens": 62386971860.0, + "step": 14926 + }, + { + "epoch": 1.7738562091503267, + "grad_norm": 0.22048361761652316, + "learning_rate": 2.597414150631163e-06, + "loss": 0.7945, + "num_tokens": 62391154490.0, + "step": 14927 + }, + { + "epoch": 1.7739750445632798, + "grad_norm": 0.22407436474637302, + "learning_rate": 2.5967938068774525e-06, + "loss": 0.7945, + "num_tokens": 62395334126.0, + "step": 14928 + }, + { + "epoch": 1.774093879976233, + "grad_norm": 0.2214004669480224, + "learning_rate": 2.596173774322878e-06, + "loss": 0.7877, + "num_tokens": 62399523892.0, + "step": 14929 + }, + { + "epoch": 1.774212715389186, + "grad_norm": 0.23558444919939583, + "learning_rate": 2.595554052990402e-06, + "loss": 0.8294, + "num_tokens": 62403702114.0, + "step": 14930 + }, + { + "epoch": 1.7743315508021391, + "grad_norm": 0.2277352489116264, + "learning_rate": 2.594934642902976e-06, + "loss": 0.8139, + "num_tokens": 62407869138.0, + "step": 14931 + }, + { + "epoch": 1.7744503862150922, + "grad_norm": 0.22701866639226725, + "learning_rate": 2.5943155440835394e-06, + "loss": 0.7983, + "num_tokens": 62412058800.0, + "step": 14932 + }, + { + "epoch": 1.7745692216280453, + "grad_norm": 0.22929151312591253, + "learning_rate": 2.5936967565550166e-06, + "loss": 0.7909, + "num_tokens": 62416246228.0, + "step": 14933 + }, + { + "epoch": 1.7746880570409982, + "grad_norm": 0.2332636631522953, + "learning_rate": 2.5930782803403258e-06, + "loss": 0.8424, + "num_tokens": 62420435152.0, + "step": 14934 + }, + { + "epoch": 1.7748068924539513, + "grad_norm": 0.24672039965425707, + "learning_rate": 2.592460115462371e-06, + "loss": 0.7752, + "num_tokens": 62424624194.0, + "step": 14935 + }, + { + "epoch": 1.7749257278669044, + "grad_norm": 0.21919409879158708, + "learning_rate": 2.5918422619440447e-06, + "loss": 0.768, + "num_tokens": 62428812516.0, + "step": 14936 + }, + { + "epoch": 1.7750445632798573, + "grad_norm": 0.22280513100191865, + "learning_rate": 2.591224719808229e-06, + "loss": 0.8096, + "num_tokens": 62433001248.0, + "step": 14937 + }, + { + "epoch": 1.7751633986928104, + "grad_norm": 0.2257716586640315, + "learning_rate": 2.5906074890777917e-06, + "loss": 0.83, + "num_tokens": 62437191172.0, + "step": 14938 + }, + { + "epoch": 1.7752822341057635, + "grad_norm": 0.2368810517202493, + "learning_rate": 2.589990569775591e-06, + "loss": 0.757, + "num_tokens": 62441370183.0, + "step": 14939 + }, + { + "epoch": 1.7754010695187166, + "grad_norm": 0.2249319906339375, + "learning_rate": 2.5893739619244763e-06, + "loss": 0.8325, + "num_tokens": 62445559660.0, + "step": 14940 + }, + { + "epoch": 1.7755199049316697, + "grad_norm": 0.22278805527312817, + "learning_rate": 2.5887576655472796e-06, + "loss": 0.7953, + "num_tokens": 62449748676.0, + "step": 14941 + }, + { + "epoch": 1.7756387403446228, + "grad_norm": 0.23218514329412804, + "learning_rate": 2.5881416806668265e-06, + "loss": 0.799, + "num_tokens": 62453887012.0, + "step": 14942 + }, + { + "epoch": 1.7757575757575759, + "grad_norm": 0.21810938107794867, + "learning_rate": 2.5875260073059273e-06, + "loss": 0.7749, + "num_tokens": 62458076253.0, + "step": 14943 + }, + { + "epoch": 1.775876411170529, + "grad_norm": 0.23255771355017563, + "learning_rate": 2.586910645487383e-06, + "loss": 0.7934, + "num_tokens": 62462266088.0, + "step": 14944 + }, + { + "epoch": 1.7759952465834818, + "grad_norm": 0.2244506097414426, + "learning_rate": 2.5862955952339847e-06, + "loss": 0.8317, + "num_tokens": 62466455409.0, + "step": 14945 + }, + { + "epoch": 1.776114081996435, + "grad_norm": 0.2214242743747214, + "learning_rate": 2.5856808565685053e-06, + "loss": 0.7808, + "num_tokens": 62470633301.0, + "step": 14946 + }, + { + "epoch": 1.776232917409388, + "grad_norm": 0.2343281894656092, + "learning_rate": 2.5850664295137146e-06, + "loss": 0.7889, + "num_tokens": 62474822624.0, + "step": 14947 + }, + { + "epoch": 1.776351752822341, + "grad_norm": 0.22557471743734364, + "learning_rate": 2.584452314092366e-06, + "loss": 0.8468, + "num_tokens": 62479011145.0, + "step": 14948 + }, + { + "epoch": 1.776470588235294, + "grad_norm": 0.22574829561921173, + "learning_rate": 2.583838510327201e-06, + "loss": 0.7923, + "num_tokens": 62483176853.0, + "step": 14949 + }, + { + "epoch": 1.7765894236482471, + "grad_norm": 0.2333362912158736, + "learning_rate": 2.583225018240954e-06, + "loss": 0.8167, + "num_tokens": 62487366327.0, + "step": 14950 + }, + { + "epoch": 1.7767082590612002, + "grad_norm": 0.21944612255327292, + "learning_rate": 2.5826118378563403e-06, + "loss": 0.8172, + "num_tokens": 62491555680.0, + "step": 14951 + }, + { + "epoch": 1.7768270944741533, + "grad_norm": 0.2245745064745037, + "learning_rate": 2.581998969196069e-06, + "loss": 0.824, + "num_tokens": 62495718796.0, + "step": 14952 + }, + { + "epoch": 1.7769459298871064, + "grad_norm": 0.22019738504213865, + "learning_rate": 2.5813864122828385e-06, + "loss": 0.8274, + "num_tokens": 62499878989.0, + "step": 14953 + }, + { + "epoch": 1.7770647653000595, + "grad_norm": 0.21192470767538943, + "learning_rate": 2.580774167139334e-06, + "loss": 0.8106, + "num_tokens": 62504068487.0, + "step": 14954 + }, + { + "epoch": 1.7771836007130126, + "grad_norm": 0.2246102565825224, + "learning_rate": 2.5801622337882294e-06, + "loss": 0.8092, + "num_tokens": 62508257610.0, + "step": 14955 + }, + { + "epoch": 1.7773024361259655, + "grad_norm": 0.22259731024099394, + "learning_rate": 2.579550612252184e-06, + "loss": 0.807, + "num_tokens": 62512445726.0, + "step": 14956 + }, + { + "epoch": 1.7774212715389186, + "grad_norm": 0.22053734187952406, + "learning_rate": 2.5789393025538495e-06, + "loss": 0.829, + "num_tokens": 62516604007.0, + "step": 14957 + }, + { + "epoch": 1.7775401069518717, + "grad_norm": 0.22114409115517247, + "learning_rate": 2.5783283047158654e-06, + "loss": 0.8124, + "num_tokens": 62520775036.0, + "step": 14958 + }, + { + "epoch": 1.7776589423648246, + "grad_norm": 0.23436997448067204, + "learning_rate": 2.577717618760857e-06, + "loss": 0.8129, + "num_tokens": 62524938026.0, + "step": 14959 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.21460712771278279, + "learning_rate": 2.5771072447114428e-06, + "loss": 0.7959, + "num_tokens": 62529108487.0, + "step": 14960 + }, + { + "epoch": 1.7778966131907308, + "grad_norm": 0.22126308101814166, + "learning_rate": 2.5764971825902257e-06, + "loss": 0.7722, + "num_tokens": 62533275797.0, + "step": 14961 + }, + { + "epoch": 1.7780154486036839, + "grad_norm": 0.22281975702160198, + "learning_rate": 2.5758874324197988e-06, + "loss": 0.8074, + "num_tokens": 62537465668.0, + "step": 14962 + }, + { + "epoch": 1.778134284016637, + "grad_norm": 0.23452976255347913, + "learning_rate": 2.575277994222742e-06, + "loss": 0.7903, + "num_tokens": 62541655270.0, + "step": 14963 + }, + { + "epoch": 1.77825311942959, + "grad_norm": 0.21846040047992352, + "learning_rate": 2.574668868021626e-06, + "loss": 0.8092, + "num_tokens": 62545833315.0, + "step": 14964 + }, + { + "epoch": 1.7783719548425432, + "grad_norm": 0.22648686713028418, + "learning_rate": 2.5740600538390093e-06, + "loss": 0.8015, + "num_tokens": 62549996721.0, + "step": 14965 + }, + { + "epoch": 1.7784907902554963, + "grad_norm": 0.23781414729812514, + "learning_rate": 2.573451551697437e-06, + "loss": 0.7861, + "num_tokens": 62554177773.0, + "step": 14966 + }, + { + "epoch": 1.7786096256684492, + "grad_norm": 0.22044753880863246, + "learning_rate": 2.5728433616194447e-06, + "loss": 0.8098, + "num_tokens": 62558340001.0, + "step": 14967 + }, + { + "epoch": 1.7787284610814023, + "grad_norm": 0.22547574080573551, + "learning_rate": 2.572235483627556e-06, + "loss": 0.791, + "num_tokens": 62562501727.0, + "step": 14968 + }, + { + "epoch": 1.7788472964943554, + "grad_norm": 0.21893058818309588, + "learning_rate": 2.5716279177442815e-06, + "loss": 0.817, + "num_tokens": 62566671369.0, + "step": 14969 + }, + { + "epoch": 1.7789661319073082, + "grad_norm": 0.22238366840424031, + "learning_rate": 2.5710206639921226e-06, + "loss": 0.77, + "num_tokens": 62570861414.0, + "step": 14970 + }, + { + "epoch": 1.7790849673202613, + "grad_norm": 0.22161167702731682, + "learning_rate": 2.5704137223935675e-06, + "loss": 0.8075, + "num_tokens": 62575050039.0, + "step": 14971 + }, + { + "epoch": 1.7792038027332144, + "grad_norm": 0.233756144177478, + "learning_rate": 2.569807092971094e-06, + "loss": 0.8287, + "num_tokens": 62579239954.0, + "step": 14972 + }, + { + "epoch": 1.7793226381461675, + "grad_norm": 0.22460202609202534, + "learning_rate": 2.569200775747166e-06, + "loss": 0.7973, + "num_tokens": 62583397337.0, + "step": 14973 + }, + { + "epoch": 1.7794414735591206, + "grad_norm": 0.3535632643057047, + "learning_rate": 2.568594770744239e-06, + "loss": 0.7785, + "num_tokens": 62587554572.0, + "step": 14974 + }, + { + "epoch": 1.7795603089720737, + "grad_norm": 0.24892292219205295, + "learning_rate": 2.5679890779847545e-06, + "loss": 0.8197, + "num_tokens": 62591743807.0, + "step": 14975 + }, + { + "epoch": 1.7796791443850268, + "grad_norm": 0.22664910838317864, + "learning_rate": 2.5673836974911455e-06, + "loss": 0.8508, + "num_tokens": 62595931944.0, + "step": 14976 + }, + { + "epoch": 1.77979797979798, + "grad_norm": 0.24307192672550296, + "learning_rate": 2.5667786292858283e-06, + "loss": 0.8064, + "num_tokens": 62600096356.0, + "step": 14977 + }, + { + "epoch": 1.7799168152109328, + "grad_norm": 0.2336764233747129, + "learning_rate": 2.56617387339121e-06, + "loss": 0.8288, + "num_tokens": 62604286118.0, + "step": 14978 + }, + { + "epoch": 1.780035650623886, + "grad_norm": 0.21748110839541693, + "learning_rate": 2.565569429829691e-06, + "loss": 0.8217, + "num_tokens": 62608446236.0, + "step": 14979 + }, + { + "epoch": 1.780154486036839, + "grad_norm": 0.23194034456585957, + "learning_rate": 2.5649652986236523e-06, + "loss": 0.8162, + "num_tokens": 62612608494.0, + "step": 14980 + }, + { + "epoch": 1.780273321449792, + "grad_norm": 0.224120644388479, + "learning_rate": 2.5643614797954695e-06, + "loss": 0.7914, + "num_tokens": 62616797959.0, + "step": 14981 + }, + { + "epoch": 1.780392156862745, + "grad_norm": 0.22304269600210613, + "learning_rate": 2.5637579733675016e-06, + "loss": 0.7913, + "num_tokens": 62620986734.0, + "step": 14982 + }, + { + "epoch": 1.780510992275698, + "grad_norm": 0.22691496245318618, + "learning_rate": 2.5631547793620996e-06, + "loss": 0.8442, + "num_tokens": 62625161926.0, + "step": 14983 + }, + { + "epoch": 1.7806298276886512, + "grad_norm": 0.22533180771695813, + "learning_rate": 2.5625518978016008e-06, + "loss": 0.8112, + "num_tokens": 62629345778.0, + "step": 14984 + }, + { + "epoch": 1.7807486631016043, + "grad_norm": 0.2274206760646642, + "learning_rate": 2.5619493287083336e-06, + "loss": 0.7672, + "num_tokens": 62633534327.0, + "step": 14985 + }, + { + "epoch": 1.7808674985145574, + "grad_norm": 0.23850898851168678, + "learning_rate": 2.5613470721046145e-06, + "loss": 0.8539, + "num_tokens": 62637661031.0, + "step": 14986 + }, + { + "epoch": 1.7809863339275105, + "grad_norm": 0.2260228791411287, + "learning_rate": 2.5607451280127433e-06, + "loss": 0.8219, + "num_tokens": 62641806028.0, + "step": 14987 + }, + { + "epoch": 1.7811051693404636, + "grad_norm": 0.2265521569149049, + "learning_rate": 2.5601434964550143e-06, + "loss": 0.8386, + "num_tokens": 62645975237.0, + "step": 14988 + }, + { + "epoch": 1.7812240047534165, + "grad_norm": 0.23611239130224693, + "learning_rate": 2.559542177453707e-06, + "loss": 0.8225, + "num_tokens": 62650164901.0, + "step": 14989 + }, + { + "epoch": 1.7813428401663696, + "grad_norm": 0.22306607313945445, + "learning_rate": 2.558941171031089e-06, + "loss": 0.8022, + "num_tokens": 62654353854.0, + "step": 14990 + }, + { + "epoch": 1.7814616755793227, + "grad_norm": 0.2230187647109092, + "learning_rate": 2.558340477209423e-06, + "loss": 0.8025, + "num_tokens": 62658542938.0, + "step": 14991 + }, + { + "epoch": 1.7815805109922755, + "grad_norm": 0.22530453301293452, + "learning_rate": 2.5577400960109494e-06, + "loss": 0.7884, + "num_tokens": 62662730357.0, + "step": 14992 + }, + { + "epoch": 1.7816993464052286, + "grad_norm": 0.22202171201067825, + "learning_rate": 2.5571400274579036e-06, + "loss": 0.8196, + "num_tokens": 62666920065.0, + "step": 14993 + }, + { + "epoch": 1.7818181818181817, + "grad_norm": 0.21879778790167254, + "learning_rate": 2.5565402715725084e-06, + "loss": 0.807, + "num_tokens": 62671100213.0, + "step": 14994 + }, + { + "epoch": 1.7819370172311348, + "grad_norm": 0.24436107421880557, + "learning_rate": 2.5559408283769752e-06, + "loss": 0.8095, + "num_tokens": 62675263545.0, + "step": 14995 + }, + { + "epoch": 1.782055852644088, + "grad_norm": 0.22968985808723819, + "learning_rate": 2.555341697893504e-06, + "loss": 0.8308, + "num_tokens": 62679451021.0, + "step": 14996 + }, + { + "epoch": 1.782174688057041, + "grad_norm": 0.2233360066938678, + "learning_rate": 2.55474288014428e-06, + "loss": 0.8242, + "num_tokens": 62683614447.0, + "step": 14997 + }, + { + "epoch": 1.7822935234699941, + "grad_norm": 0.2378462337242317, + "learning_rate": 2.5541443751514838e-06, + "loss": 0.7772, + "num_tokens": 62687803402.0, + "step": 14998 + }, + { + "epoch": 1.7824123588829472, + "grad_norm": 0.23881903933098392, + "learning_rate": 2.553546182937276e-06, + "loss": 0.8229, + "num_tokens": 62691993601.0, + "step": 14999 + }, + { + "epoch": 1.7825311942959001, + "grad_norm": 0.22351904214479676, + "learning_rate": 2.55294830352381e-06, + "loss": 0.7817, + "num_tokens": 62696183529.0, + "step": 15000 + }, + { + "epoch": 1.7826500297088532, + "grad_norm": 0.25508789693910355, + "learning_rate": 2.55235073693323e-06, + "loss": 0.8271, + "num_tokens": 62700373518.0, + "step": 15001 + }, + { + "epoch": 1.7827688651218063, + "grad_norm": 0.21896848748936248, + "learning_rate": 2.5517534831876663e-06, + "loss": 0.7562, + "num_tokens": 62704554168.0, + "step": 15002 + }, + { + "epoch": 1.7828877005347592, + "grad_norm": 0.24124493306320238, + "learning_rate": 2.551156542309231e-06, + "loss": 0.8272, + "num_tokens": 62708742813.0, + "step": 15003 + }, + { + "epoch": 1.7830065359477123, + "grad_norm": 0.2614949546379178, + "learning_rate": 2.5505599143200377e-06, + "loss": 0.8133, + "num_tokens": 62712884330.0, + "step": 15004 + }, + { + "epoch": 1.7831253713606654, + "grad_norm": 0.21671565324370773, + "learning_rate": 2.5499635992421785e-06, + "loss": 0.8475, + "num_tokens": 62717064418.0, + "step": 15005 + }, + { + "epoch": 1.7832442067736185, + "grad_norm": 0.244426982653921, + "learning_rate": 2.549367597097737e-06, + "loss": 0.8017, + "num_tokens": 62721253305.0, + "step": 15006 + }, + { + "epoch": 1.7833630421865716, + "grad_norm": 0.23548685227530158, + "learning_rate": 2.548771907908788e-06, + "loss": 0.8331, + "num_tokens": 62725443219.0, + "step": 15007 + }, + { + "epoch": 1.7834818775995247, + "grad_norm": 0.23633027973605933, + "learning_rate": 2.5481765316973873e-06, + "loss": 0.7967, + "num_tokens": 62729610346.0, + "step": 15008 + }, + { + "epoch": 1.7836007130124778, + "grad_norm": 0.23059029666741201, + "learning_rate": 2.5475814684855854e-06, + "loss": 0.8102, + "num_tokens": 62733779273.0, + "step": 15009 + }, + { + "epoch": 1.783719548425431, + "grad_norm": 0.22946127692056595, + "learning_rate": 2.5469867182954203e-06, + "loss": 0.8135, + "num_tokens": 62737966381.0, + "step": 15010 + }, + { + "epoch": 1.783838383838384, + "grad_norm": 0.23387179302578243, + "learning_rate": 2.5463922811489183e-06, + "loss": 0.8082, + "num_tokens": 62742153232.0, + "step": 15011 + }, + { + "epoch": 1.7839572192513369, + "grad_norm": 0.23598571288021364, + "learning_rate": 2.545798157068094e-06, + "loss": 0.7974, + "num_tokens": 62746343122.0, + "step": 15012 + }, + { + "epoch": 1.78407605466429, + "grad_norm": 0.23693035766076745, + "learning_rate": 2.545204346074947e-06, + "loss": 0.8464, + "num_tokens": 62750533211.0, + "step": 15013 + }, + { + "epoch": 1.7841948900772429, + "grad_norm": 0.23347991427311643, + "learning_rate": 2.544610848191468e-06, + "loss": 0.8145, + "num_tokens": 62754722476.0, + "step": 15014 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 0.22922063610632995, + "learning_rate": 2.5440176634396392e-06, + "loss": 0.7907, + "num_tokens": 62758881629.0, + "step": 15015 + }, + { + "epoch": 1.784432560903149, + "grad_norm": 0.22758622778510132, + "learning_rate": 2.543424791841424e-06, + "loss": 0.7814, + "num_tokens": 62763070143.0, + "step": 15016 + }, + { + "epoch": 1.7845513963161022, + "grad_norm": 0.2248029699108177, + "learning_rate": 2.5428322334187845e-06, + "loss": 0.8753, + "num_tokens": 62767249783.0, + "step": 15017 + }, + { + "epoch": 1.7846702317290553, + "grad_norm": 0.21964641821658512, + "learning_rate": 2.5422399881936603e-06, + "loss": 0.846, + "num_tokens": 62771435691.0, + "step": 15018 + }, + { + "epoch": 1.7847890671420084, + "grad_norm": 0.21193261340866465, + "learning_rate": 2.5416480561879865e-06, + "loss": 0.7548, + "num_tokens": 62775625442.0, + "step": 15019 + }, + { + "epoch": 1.7849079025549615, + "grad_norm": 0.2265387171376402, + "learning_rate": 2.5410564374236816e-06, + "loss": 0.812, + "num_tokens": 62779738527.0, + "step": 15020 + }, + { + "epoch": 1.7850267379679146, + "grad_norm": 0.21907838538999733, + "learning_rate": 2.5404651319226585e-06, + "loss": 0.7945, + "num_tokens": 62783927728.0, + "step": 15021 + }, + { + "epoch": 1.7851455733808677, + "grad_norm": 0.2344595069191071, + "learning_rate": 2.5398741397068146e-06, + "loss": 0.8235, + "num_tokens": 62788115695.0, + "step": 15022 + }, + { + "epoch": 1.7852644087938205, + "grad_norm": 0.2307751359188111, + "learning_rate": 2.5392834607980343e-06, + "loss": 0.8083, + "num_tokens": 62792273228.0, + "step": 15023 + }, + { + "epoch": 1.7853832442067736, + "grad_norm": 0.22384262236991218, + "learning_rate": 2.538693095218195e-06, + "loss": 0.8363, + "num_tokens": 62796461862.0, + "step": 15024 + }, + { + "epoch": 1.7855020796197265, + "grad_norm": 0.22958390812703575, + "learning_rate": 2.538103042989158e-06, + "loss": 0.8037, + "num_tokens": 62800635564.0, + "step": 15025 + }, + { + "epoch": 1.7856209150326796, + "grad_norm": 0.2393714956960949, + "learning_rate": 2.537513304132776e-06, + "loss": 0.826, + "num_tokens": 62804802005.0, + "step": 15026 + }, + { + "epoch": 1.7857397504456327, + "grad_norm": 0.22905166826076773, + "learning_rate": 2.5369238786708884e-06, + "loss": 0.7821, + "num_tokens": 62808977029.0, + "step": 15027 + }, + { + "epoch": 1.7858585858585858, + "grad_norm": 0.21971707180150932, + "learning_rate": 2.5363347666253253e-06, + "loss": 0.8143, + "num_tokens": 62813166050.0, + "step": 15028 + }, + { + "epoch": 1.785977421271539, + "grad_norm": 0.22747244222500154, + "learning_rate": 2.5357459680178987e-06, + "loss": 0.7986, + "num_tokens": 62817341432.0, + "step": 15029 + }, + { + "epoch": 1.786096256684492, + "grad_norm": 0.23775262646422832, + "learning_rate": 2.53515748287042e-06, + "loss": 0.8029, + "num_tokens": 62821530703.0, + "step": 15030 + }, + { + "epoch": 1.7862150920974451, + "grad_norm": 0.2173637342550287, + "learning_rate": 2.534569311204679e-06, + "loss": 0.8342, + "num_tokens": 62825693393.0, + "step": 15031 + }, + { + "epoch": 1.7863339275103982, + "grad_norm": 0.23209358679948885, + "learning_rate": 2.5339814530424594e-06, + "loss": 0.7997, + "num_tokens": 62829871296.0, + "step": 15032 + }, + { + "epoch": 1.7864527629233513, + "grad_norm": 0.25671976878718866, + "learning_rate": 2.5333939084055315e-06, + "loss": 0.7856, + "num_tokens": 62834061195.0, + "step": 15033 + }, + { + "epoch": 1.7865715983363042, + "grad_norm": 0.22714072728083448, + "learning_rate": 2.532806677315653e-06, + "loss": 0.7948, + "num_tokens": 62838228299.0, + "step": 15034 + }, + { + "epoch": 1.7866904337492573, + "grad_norm": 0.23144639468063868, + "learning_rate": 2.5322197597945693e-06, + "loss": 0.8069, + "num_tokens": 62842417199.0, + "step": 15035 + }, + { + "epoch": 1.7868092691622104, + "grad_norm": 0.2311656238320107, + "learning_rate": 2.53163315586402e-06, + "loss": 0.7995, + "num_tokens": 62846579716.0, + "step": 15036 + }, + { + "epoch": 1.7869281045751633, + "grad_norm": 0.23417143121730213, + "learning_rate": 2.5310468655457273e-06, + "loss": 0.765, + "num_tokens": 62850768752.0, + "step": 15037 + }, + { + "epoch": 1.7870469399881164, + "grad_norm": 0.2272778876393, + "learning_rate": 2.5304608888614035e-06, + "loss": 0.8319, + "num_tokens": 62854956227.0, + "step": 15038 + }, + { + "epoch": 1.7871657754010695, + "grad_norm": 0.23159965397681864, + "learning_rate": 2.529875225832749e-06, + "loss": 0.7841, + "num_tokens": 62859124970.0, + "step": 15039 + }, + { + "epoch": 1.7872846108140226, + "grad_norm": 0.24135967538419825, + "learning_rate": 2.5292898764814533e-06, + "loss": 0.7839, + "num_tokens": 62863314348.0, + "step": 15040 + }, + { + "epoch": 1.7874034462269757, + "grad_norm": 0.2286090568124569, + "learning_rate": 2.5287048408291913e-06, + "loss": 0.8205, + "num_tokens": 62867491615.0, + "step": 15041 + }, + { + "epoch": 1.7875222816399288, + "grad_norm": 0.21199674808368754, + "learning_rate": 2.5281201188976333e-06, + "loss": 0.7992, + "num_tokens": 62871681545.0, + "step": 15042 + }, + { + "epoch": 1.7876411170528819, + "grad_norm": 0.2284716180189577, + "learning_rate": 2.5275357107084318e-06, + "loss": 0.7966, + "num_tokens": 62875867876.0, + "step": 15043 + }, + { + "epoch": 1.787759952465835, + "grad_norm": 0.25164745204044053, + "learning_rate": 2.5269516162832295e-06, + "loss": 0.7818, + "num_tokens": 62880057793.0, + "step": 15044 + }, + { + "epoch": 1.7878787878787878, + "grad_norm": 0.2237798193040601, + "learning_rate": 2.5263678356436556e-06, + "loss": 0.816, + "num_tokens": 62884215146.0, + "step": 15045 + }, + { + "epoch": 1.787997623291741, + "grad_norm": 0.23822157910408662, + "learning_rate": 2.5257843688113306e-06, + "loss": 0.7745, + "num_tokens": 62888348198.0, + "step": 15046 + }, + { + "epoch": 1.788116458704694, + "grad_norm": 0.23437274216390178, + "learning_rate": 2.525201215807863e-06, + "loss": 0.7894, + "num_tokens": 62892537919.0, + "step": 15047 + }, + { + "epoch": 1.788235294117647, + "grad_norm": 0.22879444430896612, + "learning_rate": 2.5246183766548476e-06, + "loss": 0.8358, + "num_tokens": 62896727668.0, + "step": 15048 + }, + { + "epoch": 1.7883541295306, + "grad_norm": 0.2306242307125443, + "learning_rate": 2.5240358513738706e-06, + "loss": 0.8193, + "num_tokens": 62900917637.0, + "step": 15049 + }, + { + "epoch": 1.7884729649435531, + "grad_norm": 0.2148427975494238, + "learning_rate": 2.5234536399865034e-06, + "loss": 0.8106, + "num_tokens": 62905105126.0, + "step": 15050 + }, + { + "epoch": 1.7885918003565062, + "grad_norm": 0.22504183292418156, + "learning_rate": 2.522871742514307e-06, + "loss": 0.784, + "num_tokens": 62909286971.0, + "step": 15051 + }, + { + "epoch": 1.7887106357694593, + "grad_norm": 0.24498841649403122, + "learning_rate": 2.522290158978834e-06, + "loss": 0.8109, + "num_tokens": 62913475798.0, + "step": 15052 + }, + { + "epoch": 1.7888294711824124, + "grad_norm": 0.21556236290425157, + "learning_rate": 2.5217088894016184e-06, + "loss": 0.8147, + "num_tokens": 62917664532.0, + "step": 15053 + }, + { + "epoch": 1.7889483065953655, + "grad_norm": 0.22637371200339634, + "learning_rate": 2.521127933804189e-06, + "loss": 0.7935, + "num_tokens": 62921854476.0, + "step": 15054 + }, + { + "epoch": 1.7890671420083186, + "grad_norm": 0.21867207011770023, + "learning_rate": 2.52054729220806e-06, + "loss": 0.8435, + "num_tokens": 62926043100.0, + "step": 15055 + }, + { + "epoch": 1.7891859774212715, + "grad_norm": 0.24179921565418663, + "learning_rate": 2.519966964634734e-06, + "loss": 0.819, + "num_tokens": 62930232527.0, + "step": 15056 + }, + { + "epoch": 1.7893048128342246, + "grad_norm": 0.21950894715071675, + "learning_rate": 2.5193869511057035e-06, + "loss": 0.794, + "num_tokens": 62934421139.0, + "step": 15057 + }, + { + "epoch": 1.7894236482471777, + "grad_norm": 0.20761293424794428, + "learning_rate": 2.518807251642447e-06, + "loss": 0.8274, + "num_tokens": 62938594619.0, + "step": 15058 + }, + { + "epoch": 1.7895424836601306, + "grad_norm": 0.21506524738430718, + "learning_rate": 2.5182278662664363e-06, + "loss": 0.8591, + "num_tokens": 62942782989.0, + "step": 15059 + }, + { + "epoch": 1.7896613190730837, + "grad_norm": 0.22190715169435618, + "learning_rate": 2.5176487949991224e-06, + "loss": 0.8173, + "num_tokens": 62946973149.0, + "step": 15060 + }, + { + "epoch": 1.7897801544860368, + "grad_norm": 0.2264519143475529, + "learning_rate": 2.5170700378619526e-06, + "loss": 0.7803, + "num_tokens": 62951161590.0, + "step": 15061 + }, + { + "epoch": 1.7898989898989899, + "grad_norm": 0.2145578657031533, + "learning_rate": 2.5164915948763625e-06, + "loss": 0.7875, + "num_tokens": 62955351880.0, + "step": 15062 + }, + { + "epoch": 1.790017825311943, + "grad_norm": 0.2256695433423466, + "learning_rate": 2.5159134660637717e-06, + "loss": 0.7675, + "num_tokens": 62959541841.0, + "step": 15063 + }, + { + "epoch": 1.790136660724896, + "grad_norm": 0.22380485661799893, + "learning_rate": 2.5153356514455924e-06, + "loss": 0.8142, + "num_tokens": 62963723645.0, + "step": 15064 + }, + { + "epoch": 1.7902554961378492, + "grad_norm": 0.21572004467528508, + "learning_rate": 2.5147581510432194e-06, + "loss": 0.8258, + "num_tokens": 62967901989.0, + "step": 15065 + }, + { + "epoch": 1.7903743315508023, + "grad_norm": 0.23579938607050988, + "learning_rate": 2.5141809648780407e-06, + "loss": 0.8248, + "num_tokens": 62972090516.0, + "step": 15066 + }, + { + "epoch": 1.7904931669637552, + "grad_norm": 0.22877253478033746, + "learning_rate": 2.5136040929714324e-06, + "loss": 0.8117, + "num_tokens": 62976262115.0, + "step": 15067 + }, + { + "epoch": 1.7906120023767083, + "grad_norm": 0.22352200405239722, + "learning_rate": 2.513027535344758e-06, + "loss": 0.8051, + "num_tokens": 62980451389.0, + "step": 15068 + }, + { + "epoch": 1.7907308377896614, + "grad_norm": 0.22199188098787953, + "learning_rate": 2.5124512920193704e-06, + "loss": 0.8384, + "num_tokens": 62984639497.0, + "step": 15069 + }, + { + "epoch": 1.7908496732026142, + "grad_norm": 0.21701224519189466, + "learning_rate": 2.511875363016607e-06, + "loss": 0.769, + "num_tokens": 62988829352.0, + "step": 15070 + }, + { + "epoch": 1.7909685086155673, + "grad_norm": 0.25061987341661496, + "learning_rate": 2.5112997483577985e-06, + "loss": 0.8376, + "num_tokens": 62993017142.0, + "step": 15071 + }, + { + "epoch": 1.7910873440285204, + "grad_norm": 0.23070114412776896, + "learning_rate": 2.5107244480642615e-06, + "loss": 0.8047, + "num_tokens": 62997208177.0, + "step": 15072 + }, + { + "epoch": 1.7912061794414735, + "grad_norm": 0.23200831140711228, + "learning_rate": 2.5101494621573e-06, + "loss": 0.8256, + "num_tokens": 63001381438.0, + "step": 15073 + }, + { + "epoch": 1.7913250148544266, + "grad_norm": 0.25108271500237683, + "learning_rate": 2.50957479065821e-06, + "loss": 0.8338, + "num_tokens": 63005538444.0, + "step": 15074 + }, + { + "epoch": 1.7914438502673797, + "grad_norm": 0.23506447874314923, + "learning_rate": 2.509000433588272e-06, + "loss": 0.833, + "num_tokens": 63009696400.0, + "step": 15075 + }, + { + "epoch": 1.7915626856803328, + "grad_norm": 0.2267091200721397, + "learning_rate": 2.508426390968756e-06, + "loss": 0.8079, + "num_tokens": 63013884427.0, + "step": 15076 + }, + { + "epoch": 1.791681521093286, + "grad_norm": 0.2182700915964763, + "learning_rate": 2.5078526628209225e-06, + "loss": 0.8211, + "num_tokens": 63018073833.0, + "step": 15077 + }, + { + "epoch": 1.7918003565062388, + "grad_norm": 0.21644312512500014, + "learning_rate": 2.5072792491660165e-06, + "loss": 0.7945, + "num_tokens": 63022263765.0, + "step": 15078 + }, + { + "epoch": 1.791919191919192, + "grad_norm": 0.21855199635972042, + "learning_rate": 2.506706150025275e-06, + "loss": 0.8097, + "num_tokens": 63026453168.0, + "step": 15079 + }, + { + "epoch": 1.792038027332145, + "grad_norm": 0.22851208553663738, + "learning_rate": 2.5061333654199215e-06, + "loss": 0.8138, + "num_tokens": 63030642128.0, + "step": 15080 + }, + { + "epoch": 1.792156862745098, + "grad_norm": 0.22593439715368838, + "learning_rate": 2.5055608953711676e-06, + "loss": 0.8026, + "num_tokens": 63034830612.0, + "step": 15081 + }, + { + "epoch": 1.792275698158051, + "grad_norm": 0.21625917725618246, + "learning_rate": 2.5049887399002136e-06, + "loss": 0.7731, + "num_tokens": 63039018636.0, + "step": 15082 + }, + { + "epoch": 1.792394533571004, + "grad_norm": 0.25930770963026134, + "learning_rate": 2.5044168990282504e-06, + "loss": 0.8037, + "num_tokens": 63043207523.0, + "step": 15083 + }, + { + "epoch": 1.7925133689839572, + "grad_norm": 0.22007384613372596, + "learning_rate": 2.503845372776452e-06, + "loss": 0.7949, + "num_tokens": 63047396088.0, + "step": 15084 + }, + { + "epoch": 1.7926322043969103, + "grad_norm": 0.2209917567893485, + "learning_rate": 2.5032741611659867e-06, + "loss": 0.8176, + "num_tokens": 63051562237.0, + "step": 15085 + }, + { + "epoch": 1.7927510398098634, + "grad_norm": 0.2441621349786048, + "learning_rate": 2.502703264218007e-06, + "loss": 0.8372, + "num_tokens": 63055750624.0, + "step": 15086 + }, + { + "epoch": 1.7928698752228165, + "grad_norm": 0.23181041003773936, + "learning_rate": 2.5021326819536558e-06, + "loss": 0.8379, + "num_tokens": 63059939719.0, + "step": 15087 + }, + { + "epoch": 1.7929887106357696, + "grad_norm": 0.2309718125510479, + "learning_rate": 2.5015624143940627e-06, + "loss": 0.8042, + "num_tokens": 63064120403.0, + "step": 15088 + }, + { + "epoch": 1.7931075460487225, + "grad_norm": 0.22421928226620966, + "learning_rate": 2.5009924615603474e-06, + "loss": 0.7767, + "num_tokens": 63068294052.0, + "step": 15089 + }, + { + "epoch": 1.7932263814616756, + "grad_norm": 0.2340749062633927, + "learning_rate": 2.5004228234736173e-06, + "loss": 0.7949, + "num_tokens": 63072483024.0, + "step": 15090 + }, + { + "epoch": 1.7933452168746287, + "grad_norm": 0.2154052348005223, + "learning_rate": 2.4998535001549684e-06, + "loss": 0.8388, + "num_tokens": 63076672051.0, + "step": 15091 + }, + { + "epoch": 1.7934640522875815, + "grad_norm": 0.23107482916926816, + "learning_rate": 2.4992844916254823e-06, + "loss": 0.8049, + "num_tokens": 63080860510.0, + "step": 15092 + }, + { + "epoch": 1.7935828877005346, + "grad_norm": 0.2201878907710965, + "learning_rate": 2.498715797906234e-06, + "loss": 0.8274, + "num_tokens": 63085049707.0, + "step": 15093 + }, + { + "epoch": 1.7937017231134877, + "grad_norm": 0.21300798759465298, + "learning_rate": 2.498147419018284e-06, + "loss": 0.7978, + "num_tokens": 63089183086.0, + "step": 15094 + }, + { + "epoch": 1.7938205585264408, + "grad_norm": 0.21508380670959412, + "learning_rate": 2.49757935498268e-06, + "loss": 0.7581, + "num_tokens": 63093372628.0, + "step": 15095 + }, + { + "epoch": 1.793939393939394, + "grad_norm": 0.22300417075565795, + "learning_rate": 2.4970116058204603e-06, + "loss": 0.8538, + "num_tokens": 63097560989.0, + "step": 15096 + }, + { + "epoch": 1.794058229352347, + "grad_norm": 0.2287974160786741, + "learning_rate": 2.4964441715526506e-06, + "loss": 0.8567, + "num_tokens": 63101727958.0, + "step": 15097 + }, + { + "epoch": 1.7941770647653001, + "grad_norm": 0.21400312670355176, + "learning_rate": 2.495877052200263e-06, + "loss": 0.8016, + "num_tokens": 63105905993.0, + "step": 15098 + }, + { + "epoch": 1.7942959001782532, + "grad_norm": 0.21720605751220468, + "learning_rate": 2.4953102477843e-06, + "loss": 0.7971, + "num_tokens": 63110077196.0, + "step": 15099 + }, + { + "epoch": 1.7944147355912063, + "grad_norm": 0.22157827458931795, + "learning_rate": 2.4947437583257555e-06, + "loss": 0.7914, + "num_tokens": 63114267219.0, + "step": 15100 + }, + { + "epoch": 1.7945335710041592, + "grad_norm": 0.21577156534586137, + "learning_rate": 2.4941775838456077e-06, + "loss": 0.8253, + "num_tokens": 63118455659.0, + "step": 15101 + }, + { + "epoch": 1.7946524064171123, + "grad_norm": 0.20606381397912468, + "learning_rate": 2.4936117243648216e-06, + "loss": 0.8521, + "num_tokens": 63122641971.0, + "step": 15102 + }, + { + "epoch": 1.7947712418300652, + "grad_norm": 0.22138689304362272, + "learning_rate": 2.4930461799043542e-06, + "loss": 0.7829, + "num_tokens": 63126811651.0, + "step": 15103 + }, + { + "epoch": 1.7948900772430183, + "grad_norm": 0.21415920902208027, + "learning_rate": 2.4924809504851487e-06, + "loss": 0.8215, + "num_tokens": 63131001956.0, + "step": 15104 + }, + { + "epoch": 1.7950089126559714, + "grad_norm": 0.21662545422049576, + "learning_rate": 2.491916036128138e-06, + "loss": 0.7951, + "num_tokens": 63135189477.0, + "step": 15105 + }, + { + "epoch": 1.7951277480689245, + "grad_norm": 0.22733053840316939, + "learning_rate": 2.4913514368542454e-06, + "loss": 0.8113, + "num_tokens": 63139379213.0, + "step": 15106 + }, + { + "epoch": 1.7952465834818776, + "grad_norm": 0.22512855170647933, + "learning_rate": 2.490787152684376e-06, + "loss": 0.8037, + "num_tokens": 63143567455.0, + "step": 15107 + }, + { + "epoch": 1.7953654188948307, + "grad_norm": 0.2259794730229398, + "learning_rate": 2.4902231836394275e-06, + "loss": 0.8222, + "num_tokens": 63147749967.0, + "step": 15108 + }, + { + "epoch": 1.7954842543077838, + "grad_norm": 0.22028365977298678, + "learning_rate": 2.4896595297402887e-06, + "loss": 0.8099, + "num_tokens": 63151939495.0, + "step": 15109 + }, + { + "epoch": 1.795603089720737, + "grad_norm": 0.22434542805262886, + "learning_rate": 2.4890961910078303e-06, + "loss": 0.7972, + "num_tokens": 63156100831.0, + "step": 15110 + }, + { + "epoch": 1.79572192513369, + "grad_norm": 0.23041483717941602, + "learning_rate": 2.4885331674629163e-06, + "loss": 0.8039, + "num_tokens": 63160289730.0, + "step": 15111 + }, + { + "epoch": 1.7958407605466429, + "grad_norm": 0.2246603891877961, + "learning_rate": 2.4879704591263975e-06, + "loss": 0.7705, + "num_tokens": 63164479140.0, + "step": 15112 + }, + { + "epoch": 1.795959595959596, + "grad_norm": 0.21972828939377653, + "learning_rate": 2.4874080660191124e-06, + "loss": 0.8105, + "num_tokens": 63168652567.0, + "step": 15113 + }, + { + "epoch": 1.7960784313725489, + "grad_norm": 0.24902078951150997, + "learning_rate": 2.4868459881618875e-06, + "loss": 0.8413, + "num_tokens": 63172841530.0, + "step": 15114 + }, + { + "epoch": 1.796197266785502, + "grad_norm": 0.2202609784937344, + "learning_rate": 2.48628422557554e-06, + "loss": 0.7957, + "num_tokens": 63177011832.0, + "step": 15115 + }, + { + "epoch": 1.796316102198455, + "grad_norm": 0.2196684709703372, + "learning_rate": 2.485722778280873e-06, + "loss": 0.8362, + "num_tokens": 63181198815.0, + "step": 15116 + }, + { + "epoch": 1.7964349376114082, + "grad_norm": 0.230415698955788, + "learning_rate": 2.4851616462986807e-06, + "loss": 0.8096, + "num_tokens": 63185364437.0, + "step": 15117 + }, + { + "epoch": 1.7965537730243613, + "grad_norm": 0.23366495870479528, + "learning_rate": 2.4846008296497393e-06, + "loss": 0.785, + "num_tokens": 63189553129.0, + "step": 15118 + }, + { + "epoch": 1.7966726084373144, + "grad_norm": 0.23289509054590735, + "learning_rate": 2.4840403283548213e-06, + "loss": 0.8225, + "num_tokens": 63193711809.0, + "step": 15119 + }, + { + "epoch": 1.7967914438502675, + "grad_norm": 0.22735333341316405, + "learning_rate": 2.483480142434683e-06, + "loss": 0.8001, + "num_tokens": 63197900480.0, + "step": 15120 + }, + { + "epoch": 1.7969102792632206, + "grad_norm": 0.23504175803709146, + "learning_rate": 2.4829202719100708e-06, + "loss": 0.8521, + "num_tokens": 63202090159.0, + "step": 15121 + }, + { + "epoch": 1.7970291146761737, + "grad_norm": 0.21979341794625026, + "learning_rate": 2.4823607168017188e-06, + "loss": 0.7887, + "num_tokens": 63206266174.0, + "step": 15122 + }, + { + "epoch": 1.7971479500891265, + "grad_norm": 0.23609139861973413, + "learning_rate": 2.4818014771303472e-06, + "loss": 0.7782, + "num_tokens": 63210403295.0, + "step": 15123 + }, + { + "epoch": 1.7972667855020796, + "grad_norm": 0.23696834539215308, + "learning_rate": 2.4812425529166657e-06, + "loss": 0.7654, + "num_tokens": 63214570190.0, + "step": 15124 + }, + { + "epoch": 1.7973856209150327, + "grad_norm": 0.22258466643863517, + "learning_rate": 2.4806839441813765e-06, + "loss": 0.7937, + "num_tokens": 63218745164.0, + "step": 15125 + }, + { + "epoch": 1.7975044563279856, + "grad_norm": 0.2251857128522288, + "learning_rate": 2.480125650945166e-06, + "loss": 0.8038, + "num_tokens": 63222902921.0, + "step": 15126 + }, + { + "epoch": 1.7976232917409387, + "grad_norm": 0.25482908358065803, + "learning_rate": 2.47956767322871e-06, + "loss": 0.8145, + "num_tokens": 63227070669.0, + "step": 15127 + }, + { + "epoch": 1.7977421271538918, + "grad_norm": 0.22566228670147284, + "learning_rate": 2.4790100110526697e-06, + "loss": 0.7958, + "num_tokens": 63231259431.0, + "step": 15128 + }, + { + "epoch": 1.797860962566845, + "grad_norm": 0.2510265205209988, + "learning_rate": 2.4784526644376987e-06, + "loss": 0.8271, + "num_tokens": 63235417705.0, + "step": 15129 + }, + { + "epoch": 1.797979797979798, + "grad_norm": 0.24474530191655403, + "learning_rate": 2.4778956334044364e-06, + "loss": 0.7853, + "num_tokens": 63239578544.0, + "step": 15130 + }, + { + "epoch": 1.7980986333927511, + "grad_norm": 0.23012312520500489, + "learning_rate": 2.4773389179735145e-06, + "loss": 0.7865, + "num_tokens": 63243769372.0, + "step": 15131 + }, + { + "epoch": 1.7982174688057042, + "grad_norm": 0.22985673170342003, + "learning_rate": 2.4767825181655493e-06, + "loss": 0.8161, + "num_tokens": 63247958755.0, + "step": 15132 + }, + { + "epoch": 1.7983363042186573, + "grad_norm": 0.23869853126114504, + "learning_rate": 2.4762264340011436e-06, + "loss": 0.7818, + "num_tokens": 63252134367.0, + "step": 15133 + }, + { + "epoch": 1.7984551396316102, + "grad_norm": 0.22561969619403593, + "learning_rate": 2.4756706655008934e-06, + "loss": 0.7967, + "num_tokens": 63256294411.0, + "step": 15134 + }, + { + "epoch": 1.7985739750445633, + "grad_norm": 0.25156256670395805, + "learning_rate": 2.4751152126853788e-06, + "loss": 0.7786, + "num_tokens": 63260469955.0, + "step": 15135 + }, + { + "epoch": 1.7986928104575164, + "grad_norm": 0.23633744906722076, + "learning_rate": 2.474560075575172e-06, + "loss": 0.8377, + "num_tokens": 63264637669.0, + "step": 15136 + }, + { + "epoch": 1.7988116458704693, + "grad_norm": 0.2326397235491138, + "learning_rate": 2.4740052541908303e-06, + "loss": 0.8325, + "num_tokens": 63268825332.0, + "step": 15137 + }, + { + "epoch": 1.7989304812834224, + "grad_norm": 0.23741267709505717, + "learning_rate": 2.473450748552901e-06, + "loss": 0.8232, + "num_tokens": 63272986920.0, + "step": 15138 + }, + { + "epoch": 1.7990493166963755, + "grad_norm": 0.22891707803392294, + "learning_rate": 2.4728965586819205e-06, + "loss": 0.8318, + "num_tokens": 63277171909.0, + "step": 15139 + }, + { + "epoch": 1.7991681521093286, + "grad_norm": 0.22682788463453232, + "learning_rate": 2.4723426845984096e-06, + "loss": 0.8268, + "num_tokens": 63281362208.0, + "step": 15140 + }, + { + "epoch": 1.7992869875222817, + "grad_norm": 0.22999782583149203, + "learning_rate": 2.471789126322883e-06, + "loss": 0.7953, + "num_tokens": 63285544265.0, + "step": 15141 + }, + { + "epoch": 1.7994058229352348, + "grad_norm": 0.22983043742006132, + "learning_rate": 2.4712358838758393e-06, + "loss": 0.7878, + "num_tokens": 63289732211.0, + "step": 15142 + }, + { + "epoch": 1.7995246583481879, + "grad_norm": 0.22678433976428286, + "learning_rate": 2.470682957277767e-06, + "loss": 0.7852, + "num_tokens": 63293921033.0, + "step": 15143 + }, + { + "epoch": 1.799643493761141, + "grad_norm": 0.2189202196067638, + "learning_rate": 2.470130346549144e-06, + "loss": 0.7767, + "num_tokens": 63298110079.0, + "step": 15144 + }, + { + "epoch": 1.7997623291740938, + "grad_norm": 0.21767913774776504, + "learning_rate": 2.4695780517104334e-06, + "loss": 0.8141, + "num_tokens": 63302259406.0, + "step": 15145 + }, + { + "epoch": 1.799881164587047, + "grad_norm": 0.22112493879412737, + "learning_rate": 2.4690260727820898e-06, + "loss": 0.7762, + "num_tokens": 63306426828.0, + "step": 15146 + }, + { + "epoch": 1.8, + "grad_norm": 0.23520939195543328, + "learning_rate": 2.4684744097845554e-06, + "loss": 0.8022, + "num_tokens": 63310616062.0, + "step": 15147 + }, + { + "epoch": 1.800118835412953, + "grad_norm": 0.2145672347677987, + "learning_rate": 2.4679230627382592e-06, + "loss": 0.7982, + "num_tokens": 63314781304.0, + "step": 15148 + }, + { + "epoch": 1.800237670825906, + "grad_norm": 0.24305555286789793, + "learning_rate": 2.467372031663618e-06, + "loss": 0.8156, + "num_tokens": 63318951214.0, + "step": 15149 + }, + { + "epoch": 1.8003565062388591, + "grad_norm": 0.2319413501541547, + "learning_rate": 2.4668213165810414e-06, + "loss": 0.7867, + "num_tokens": 63323139223.0, + "step": 15150 + }, + { + "epoch": 1.8004753416518122, + "grad_norm": 0.22559389614696904, + "learning_rate": 2.4662709175109225e-06, + "loss": 0.8135, + "num_tokens": 63327314674.0, + "step": 15151 + }, + { + "epoch": 1.8005941770647653, + "grad_norm": 0.22473019636582864, + "learning_rate": 2.465720834473645e-06, + "loss": 0.8122, + "num_tokens": 63331502293.0, + "step": 15152 + }, + { + "epoch": 1.8007130124777184, + "grad_norm": 0.23364995867440683, + "learning_rate": 2.465171067489581e-06, + "loss": 0.8005, + "num_tokens": 63335692744.0, + "step": 15153 + }, + { + "epoch": 1.8008318478906715, + "grad_norm": 0.2276317161687847, + "learning_rate": 2.464621616579088e-06, + "loss": 0.7585, + "num_tokens": 63339880327.0, + "step": 15154 + }, + { + "epoch": 1.8009506833036246, + "grad_norm": 0.24236915892279565, + "learning_rate": 2.4640724817625154e-06, + "loss": 0.8234, + "num_tokens": 63344051482.0, + "step": 15155 + }, + { + "epoch": 1.8010695187165775, + "grad_norm": 0.2284286582754141, + "learning_rate": 2.4635236630601984e-06, + "loss": 0.7929, + "num_tokens": 63348239285.0, + "step": 15156 + }, + { + "epoch": 1.8011883541295306, + "grad_norm": 0.2126803297116915, + "learning_rate": 2.462975160492464e-06, + "loss": 0.8491, + "num_tokens": 63352427977.0, + "step": 15157 + }, + { + "epoch": 1.8013071895424837, + "grad_norm": 0.24190247325911288, + "learning_rate": 2.462426974079625e-06, + "loss": 0.8028, + "num_tokens": 63356617836.0, + "step": 15158 + }, + { + "epoch": 1.8014260249554366, + "grad_norm": 0.2414924205961252, + "learning_rate": 2.4618791038419798e-06, + "loss": 0.8093, + "num_tokens": 63360792966.0, + "step": 15159 + }, + { + "epoch": 1.8015448603683897, + "grad_norm": 0.2320718002386173, + "learning_rate": 2.4613315497998206e-06, + "loss": 0.8051, + "num_tokens": 63364969647.0, + "step": 15160 + }, + { + "epoch": 1.8016636957813428, + "grad_norm": 0.2343207343880088, + "learning_rate": 2.4607843119734227e-06, + "loss": 0.7873, + "num_tokens": 63369158867.0, + "step": 15161 + }, + { + "epoch": 1.8017825311942959, + "grad_norm": 0.21783508511470095, + "learning_rate": 2.4602373903830525e-06, + "loss": 0.8076, + "num_tokens": 63373348354.0, + "step": 15162 + }, + { + "epoch": 1.801901366607249, + "grad_norm": 0.2353181702371977, + "learning_rate": 2.459690785048968e-06, + "loss": 0.8213, + "num_tokens": 63377513549.0, + "step": 15163 + }, + { + "epoch": 1.802020202020202, + "grad_norm": 0.23635683565065752, + "learning_rate": 2.459144495991408e-06, + "loss": 0.7797, + "num_tokens": 63381702597.0, + "step": 15164 + }, + { + "epoch": 1.8021390374331552, + "grad_norm": 0.22347988710817362, + "learning_rate": 2.458598523230604e-06, + "loss": 0.7852, + "num_tokens": 63385888613.0, + "step": 15165 + }, + { + "epoch": 1.8022578728461083, + "grad_norm": 0.24245020708024315, + "learning_rate": 2.4580528667867777e-06, + "loss": 0.8284, + "num_tokens": 63390077146.0, + "step": 15166 + }, + { + "epoch": 1.8023767082590612, + "grad_norm": 0.23918537483788146, + "learning_rate": 2.457507526680133e-06, + "loss": 0.7933, + "num_tokens": 63394265534.0, + "step": 15167 + }, + { + "epoch": 1.8024955436720143, + "grad_norm": 0.22177378339216985, + "learning_rate": 2.456962502930868e-06, + "loss": 0.78, + "num_tokens": 63398405831.0, + "step": 15168 + }, + { + "epoch": 1.8026143790849674, + "grad_norm": 0.219486075767675, + "learning_rate": 2.456417795559166e-06, + "loss": 0.8238, + "num_tokens": 63402594872.0, + "step": 15169 + }, + { + "epoch": 1.8027332144979202, + "grad_norm": 0.2377027698020875, + "learning_rate": 2.4558734045852e-06, + "loss": 0.7911, + "num_tokens": 63406784273.0, + "step": 15170 + }, + { + "epoch": 1.8028520499108733, + "grad_norm": 0.21672787963580356, + "learning_rate": 2.45532933002913e-06, + "loss": 0.7892, + "num_tokens": 63410972143.0, + "step": 15171 + }, + { + "epoch": 1.8029708853238264, + "grad_norm": 0.23318767105307184, + "learning_rate": 2.4547855719111047e-06, + "loss": 0.8249, + "num_tokens": 63415160322.0, + "step": 15172 + }, + { + "epoch": 1.8030897207367795, + "grad_norm": 0.22231956397873412, + "learning_rate": 2.454242130251262e-06, + "loss": 0.8078, + "num_tokens": 63419329265.0, + "step": 15173 + }, + { + "epoch": 1.8032085561497326, + "grad_norm": 0.22139703149260376, + "learning_rate": 2.4536990050697286e-06, + "loss": 0.849, + "num_tokens": 63423517286.0, + "step": 15174 + }, + { + "epoch": 1.8033273915626857, + "grad_norm": 0.24242214553980035, + "learning_rate": 2.453156196386614e-06, + "loss": 0.7953, + "num_tokens": 63427685082.0, + "step": 15175 + }, + { + "epoch": 1.8034462269756388, + "grad_norm": 0.23961336691610427, + "learning_rate": 2.4526137042220236e-06, + "loss": 0.7954, + "num_tokens": 63431874748.0, + "step": 15176 + }, + { + "epoch": 1.803565062388592, + "grad_norm": 0.21069883505084008, + "learning_rate": 2.452071528596049e-06, + "loss": 0.8102, + "num_tokens": 63436064399.0, + "step": 15177 + }, + { + "epoch": 1.8036838978015448, + "grad_norm": 0.2309124308007535, + "learning_rate": 2.4515296695287655e-06, + "loss": 0.7756, + "num_tokens": 63440254025.0, + "step": 15178 + }, + { + "epoch": 1.803802733214498, + "grad_norm": 0.2330439270755163, + "learning_rate": 2.450988127040243e-06, + "loss": 0.8178, + "num_tokens": 63444406841.0, + "step": 15179 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 0.22518411393279353, + "learning_rate": 2.4504469011505335e-06, + "loss": 0.7887, + "num_tokens": 63448597257.0, + "step": 15180 + }, + { + "epoch": 1.8040404040404039, + "grad_norm": 0.22964052182803946, + "learning_rate": 2.4499059918796813e-06, + "loss": 0.8075, + "num_tokens": 63452784402.0, + "step": 15181 + }, + { + "epoch": 1.804159239453357, + "grad_norm": 0.2195660873567587, + "learning_rate": 2.4493653992477196e-06, + "loss": 0.8136, + "num_tokens": 63456973123.0, + "step": 15182 + }, + { + "epoch": 1.80427807486631, + "grad_norm": 0.22709597913612148, + "learning_rate": 2.4488251232746673e-06, + "loss": 0.8111, + "num_tokens": 63461138307.0, + "step": 15183 + }, + { + "epoch": 1.8043969102792632, + "grad_norm": 0.22526970076097824, + "learning_rate": 2.4482851639805346e-06, + "loss": 0.8247, + "num_tokens": 63465312549.0, + "step": 15184 + }, + { + "epoch": 1.8045157456922163, + "grad_norm": 0.2246660045832456, + "learning_rate": 2.447745521385315e-06, + "loss": 0.8333, + "num_tokens": 63469501100.0, + "step": 15185 + }, + { + "epoch": 1.8046345811051694, + "grad_norm": 0.22317849775270607, + "learning_rate": 2.447206195508994e-06, + "loss": 0.7865, + "num_tokens": 63473662400.0, + "step": 15186 + }, + { + "epoch": 1.8047534165181225, + "grad_norm": 0.22376755867584266, + "learning_rate": 2.4466671863715468e-06, + "loss": 0.8157, + "num_tokens": 63477852625.0, + "step": 15187 + }, + { + "epoch": 1.8048722519310756, + "grad_norm": 0.22477385435388897, + "learning_rate": 2.4461284939929302e-06, + "loss": 0.8207, + "num_tokens": 63482042002.0, + "step": 15188 + }, + { + "epoch": 1.8049910873440285, + "grad_norm": 0.2367076331616215, + "learning_rate": 2.4455901183931002e-06, + "loss": 0.8042, + "num_tokens": 63486230990.0, + "step": 15189 + }, + { + "epoch": 1.8051099227569816, + "grad_norm": 0.22481192002132347, + "learning_rate": 2.44505205959199e-06, + "loss": 0.814, + "num_tokens": 63490420441.0, + "step": 15190 + }, + { + "epoch": 1.8052287581699347, + "grad_norm": 0.23161158651398767, + "learning_rate": 2.4445143176095264e-06, + "loss": 0.8029, + "num_tokens": 63494608823.0, + "step": 15191 + }, + { + "epoch": 1.8053475935828875, + "grad_norm": 0.21696486432365236, + "learning_rate": 2.4439768924656253e-06, + "loss": 0.7868, + "num_tokens": 63498798831.0, + "step": 15192 + }, + { + "epoch": 1.8054664289958406, + "grad_norm": 0.2140612430838943, + "learning_rate": 2.443439784180189e-06, + "loss": 0.7691, + "num_tokens": 63502989082.0, + "step": 15193 + }, + { + "epoch": 1.8055852644087937, + "grad_norm": 0.21340327810673626, + "learning_rate": 2.4429029927731067e-06, + "loss": 0.8326, + "num_tokens": 63507170304.0, + "step": 15194 + }, + { + "epoch": 1.8057040998217468, + "grad_norm": 0.22169268042911627, + "learning_rate": 2.442366518264259e-06, + "loss": 0.791, + "num_tokens": 63511358837.0, + "step": 15195 + }, + { + "epoch": 1.8058229352347, + "grad_norm": 0.2225694886992622, + "learning_rate": 2.4418303606735132e-06, + "loss": 0.7848, + "num_tokens": 63515547910.0, + "step": 15196 + }, + { + "epoch": 1.805941770647653, + "grad_norm": 0.2189274698011944, + "learning_rate": 2.4412945200207246e-06, + "loss": 0.797, + "num_tokens": 63519726333.0, + "step": 15197 + }, + { + "epoch": 1.8060606060606061, + "grad_norm": 0.232434890043884, + "learning_rate": 2.4407589963257383e-06, + "loss": 0.8376, + "num_tokens": 63523887073.0, + "step": 15198 + }, + { + "epoch": 1.8061794414735592, + "grad_norm": 0.21050453576593614, + "learning_rate": 2.440223789608386e-06, + "loss": 0.8033, + "num_tokens": 63528075551.0, + "step": 15199 + }, + { + "epoch": 1.8062982768865123, + "grad_norm": 0.21300833373409372, + "learning_rate": 2.439688899888487e-06, + "loss": 0.8129, + "num_tokens": 63532258196.0, + "step": 15200 + }, + { + "epoch": 1.8064171122994652, + "grad_norm": 0.22393260305477217, + "learning_rate": 2.4391543271858525e-06, + "loss": 0.8367, + "num_tokens": 63536441840.0, + "step": 15201 + }, + { + "epoch": 1.8065359477124183, + "grad_norm": 0.21394202402097, + "learning_rate": 2.4386200715202768e-06, + "loss": 0.7914, + "num_tokens": 63540631665.0, + "step": 15202 + }, + { + "epoch": 1.8066547831253712, + "grad_norm": 0.21797481799652305, + "learning_rate": 2.4380861329115473e-06, + "loss": 0.8121, + "num_tokens": 63544820851.0, + "step": 15203 + }, + { + "epoch": 1.8067736185383243, + "grad_norm": 0.23230617321711164, + "learning_rate": 2.4375525113794364e-06, + "loss": 0.8034, + "num_tokens": 63549009405.0, + "step": 15204 + }, + { + "epoch": 1.8068924539512774, + "grad_norm": 0.21422172775247986, + "learning_rate": 2.437019206943707e-06, + "loss": 0.8149, + "num_tokens": 63553199448.0, + "step": 15205 + }, + { + "epoch": 1.8070112893642305, + "grad_norm": 0.22420474376712662, + "learning_rate": 2.4364862196241087e-06, + "loss": 0.7741, + "num_tokens": 63557384888.0, + "step": 15206 + }, + { + "epoch": 1.8071301247771836, + "grad_norm": 0.20854103793216483, + "learning_rate": 2.4359535494403784e-06, + "loss": 0.8279, + "num_tokens": 63561573990.0, + "step": 15207 + }, + { + "epoch": 1.8072489601901367, + "grad_norm": 0.2011328469801608, + "learning_rate": 2.4354211964122444e-06, + "loss": 0.7765, + "num_tokens": 63565736399.0, + "step": 15208 + }, + { + "epoch": 1.8073677956030898, + "grad_norm": 0.21795258824962252, + "learning_rate": 2.4348891605594215e-06, + "loss": 0.794, + "num_tokens": 63569925950.0, + "step": 15209 + }, + { + "epoch": 1.807486631016043, + "grad_norm": 0.24019534658974157, + "learning_rate": 2.434357441901612e-06, + "loss": 0.8257, + "num_tokens": 63574115076.0, + "step": 15210 + }, + { + "epoch": 1.807605466428996, + "grad_norm": 0.2263802202977255, + "learning_rate": 2.4338260404585077e-06, + "loss": 0.8207, + "num_tokens": 63578303119.0, + "step": 15211 + }, + { + "epoch": 1.8077243018419489, + "grad_norm": 0.21235731898030524, + "learning_rate": 2.4332949562497887e-06, + "loss": 0.7775, + "num_tokens": 63582453379.0, + "step": 15212 + }, + { + "epoch": 1.807843137254902, + "grad_norm": 0.21104566680365958, + "learning_rate": 2.4327641892951205e-06, + "loss": 0.7509, + "num_tokens": 63586612469.0, + "step": 15213 + }, + { + "epoch": 1.8079619726678549, + "grad_norm": 0.2159726354552451, + "learning_rate": 2.4322337396141614e-06, + "loss": 0.8145, + "num_tokens": 63590800718.0, + "step": 15214 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.22897257002706922, + "learning_rate": 2.431703607226555e-06, + "loss": 0.7662, + "num_tokens": 63594989158.0, + "step": 15215 + }, + { + "epoch": 1.808199643493761, + "grad_norm": 0.22157326847975417, + "learning_rate": 2.431173792151937e-06, + "loss": 0.8175, + "num_tokens": 63599147388.0, + "step": 15216 + }, + { + "epoch": 1.8083184789067142, + "grad_norm": 0.21689506788245405, + "learning_rate": 2.4306442944099225e-06, + "loss": 0.7988, + "num_tokens": 63603337175.0, + "step": 15217 + }, + { + "epoch": 1.8084373143196673, + "grad_norm": 0.23829107243801828, + "learning_rate": 2.4301151140201236e-06, + "loss": 0.7804, + "num_tokens": 63607525601.0, + "step": 15218 + }, + { + "epoch": 1.8085561497326204, + "grad_norm": 0.22090578701278096, + "learning_rate": 2.429586251002137e-06, + "loss": 0.7827, + "num_tokens": 63611715117.0, + "step": 15219 + }, + { + "epoch": 1.8086749851455735, + "grad_norm": 0.22071758685262516, + "learning_rate": 2.42905770537555e-06, + "loss": 0.8165, + "num_tokens": 63615903048.0, + "step": 15220 + }, + { + "epoch": 1.8087938205585266, + "grad_norm": 0.2138263908729442, + "learning_rate": 2.4285294771599357e-06, + "loss": 0.8092, + "num_tokens": 63620091417.0, + "step": 15221 + }, + { + "epoch": 1.8089126559714797, + "grad_norm": 0.21951109029204743, + "learning_rate": 2.428001566374855e-06, + "loss": 0.8109, + "num_tokens": 63624255847.0, + "step": 15222 + }, + { + "epoch": 1.8090314913844325, + "grad_norm": 0.21966742006164772, + "learning_rate": 2.4274739730398592e-06, + "loss": 0.7793, + "num_tokens": 63628432381.0, + "step": 15223 + }, + { + "epoch": 1.8091503267973856, + "grad_norm": 0.23200192140181874, + "learning_rate": 2.4269466971744853e-06, + "loss": 0.7878, + "num_tokens": 63632596257.0, + "step": 15224 + }, + { + "epoch": 1.8092691622103387, + "grad_norm": 0.21988097986342003, + "learning_rate": 2.426419738798263e-06, + "loss": 0.7987, + "num_tokens": 63636783553.0, + "step": 15225 + }, + { + "epoch": 1.8093879976232916, + "grad_norm": 0.21429954862530107, + "learning_rate": 2.4258930979307048e-06, + "loss": 0.7874, + "num_tokens": 63640972963.0, + "step": 15226 + }, + { + "epoch": 1.8095068330362447, + "grad_norm": 0.21932241033275937, + "learning_rate": 2.425366774591315e-06, + "loss": 0.7565, + "num_tokens": 63645162702.0, + "step": 15227 + }, + { + "epoch": 1.8096256684491978, + "grad_norm": 0.2216449661281548, + "learning_rate": 2.424840768799585e-06, + "loss": 0.8331, + "num_tokens": 63649350726.0, + "step": 15228 + }, + { + "epoch": 1.809744503862151, + "grad_norm": 0.23427849303209333, + "learning_rate": 2.424315080574995e-06, + "loss": 0.783, + "num_tokens": 63653539059.0, + "step": 15229 + }, + { + "epoch": 1.809863339275104, + "grad_norm": 0.218679075678682, + "learning_rate": 2.4237897099370122e-06, + "loss": 0.8189, + "num_tokens": 63657699203.0, + "step": 15230 + }, + { + "epoch": 1.809982174688057, + "grad_norm": 0.22086441897535447, + "learning_rate": 2.423264656905094e-06, + "loss": 0.795, + "num_tokens": 63661885072.0, + "step": 15231 + }, + { + "epoch": 1.8101010101010102, + "grad_norm": 0.22821658815998638, + "learning_rate": 2.4227399214986834e-06, + "loss": 0.8107, + "num_tokens": 63666069928.0, + "step": 15232 + }, + { + "epoch": 1.8102198455139633, + "grad_norm": 0.21307835076140869, + "learning_rate": 2.4222155037372146e-06, + "loss": 0.7829, + "num_tokens": 63670256433.0, + "step": 15233 + }, + { + "epoch": 1.8103386809269162, + "grad_norm": 0.2306535653216472, + "learning_rate": 2.4216914036401073e-06, + "loss": 0.7992, + "num_tokens": 63674443349.0, + "step": 15234 + }, + { + "epoch": 1.8104575163398693, + "grad_norm": 0.22073523588305508, + "learning_rate": 2.4211676212267714e-06, + "loss": 0.8119, + "num_tokens": 63678632213.0, + "step": 15235 + }, + { + "epoch": 1.8105763517528224, + "grad_norm": 0.22328540254477788, + "learning_rate": 2.4206441565166044e-06, + "loss": 0.7773, + "num_tokens": 63682813532.0, + "step": 15236 + }, + { + "epoch": 1.8106951871657753, + "grad_norm": 0.2346587655893584, + "learning_rate": 2.420121009528993e-06, + "loss": 0.7859, + "num_tokens": 63686981963.0, + "step": 15237 + }, + { + "epoch": 1.8108140225787284, + "grad_norm": 0.23176883305070925, + "learning_rate": 2.419598180283307e-06, + "loss": 0.768, + "num_tokens": 63691147178.0, + "step": 15238 + }, + { + "epoch": 1.8109328579916815, + "grad_norm": 0.23206112047306662, + "learning_rate": 2.419075668798913e-06, + "loss": 0.7873, + "num_tokens": 63695323162.0, + "step": 15239 + }, + { + "epoch": 1.8110516934046346, + "grad_norm": 0.23801032736941305, + "learning_rate": 2.41855347509516e-06, + "loss": 0.8164, + "num_tokens": 63699484228.0, + "step": 15240 + }, + { + "epoch": 1.8111705288175877, + "grad_norm": 0.24606121924438598, + "learning_rate": 2.418031599191386e-06, + "loss": 0.7878, + "num_tokens": 63703672740.0, + "step": 15241 + }, + { + "epoch": 1.8112893642305408, + "grad_norm": 0.2254007829201266, + "learning_rate": 2.4175100411069197e-06, + "loss": 0.8229, + "num_tokens": 63707861841.0, + "step": 15242 + }, + { + "epoch": 1.8114081996434939, + "grad_norm": 0.24638828859912104, + "learning_rate": 2.4169888008610733e-06, + "loss": 0.8334, + "num_tokens": 63712045490.0, + "step": 15243 + }, + { + "epoch": 1.811527035056447, + "grad_norm": 0.24353851148624028, + "learning_rate": 2.4164678784731516e-06, + "loss": 0.7674, + "num_tokens": 63716235419.0, + "step": 15244 + }, + { + "epoch": 1.8116458704693998, + "grad_norm": 0.22472506331060405, + "learning_rate": 2.415947273962445e-06, + "loss": 0.77, + "num_tokens": 63720425877.0, + "step": 15245 + }, + { + "epoch": 1.811764705882353, + "grad_norm": 0.2363077088355879, + "learning_rate": 2.415426987348234e-06, + "loss": 0.7815, + "num_tokens": 63724608789.0, + "step": 15246 + }, + { + "epoch": 1.811883541295306, + "grad_norm": 0.22085860697969417, + "learning_rate": 2.414907018649789e-06, + "loss": 0.8129, + "num_tokens": 63728798366.0, + "step": 15247 + }, + { + "epoch": 1.812002376708259, + "grad_norm": 0.2299271396068457, + "learning_rate": 2.4143873678863634e-06, + "loss": 0.7928, + "num_tokens": 63732986922.0, + "step": 15248 + }, + { + "epoch": 1.812121212121212, + "grad_norm": 0.21615379559862127, + "learning_rate": 2.4138680350772016e-06, + "loss": 0.798, + "num_tokens": 63737155394.0, + "step": 15249 + }, + { + "epoch": 1.8122400475341651, + "grad_norm": 0.21771368285492892, + "learning_rate": 2.413349020241537e-06, + "loss": 0.8118, + "num_tokens": 63741344640.0, + "step": 15250 + }, + { + "epoch": 1.8123588829471182, + "grad_norm": 0.2276627137484058, + "learning_rate": 2.4128303233985887e-06, + "loss": 0.7588, + "num_tokens": 63745506050.0, + "step": 15251 + }, + { + "epoch": 1.8124777183600713, + "grad_norm": 0.23191150171290614, + "learning_rate": 2.4123119445675696e-06, + "loss": 0.8204, + "num_tokens": 63749695598.0, + "step": 15252 + }, + { + "epoch": 1.8125965537730244, + "grad_norm": 0.21723015758191577, + "learning_rate": 2.4117938837676747e-06, + "loss": 0.7821, + "num_tokens": 63753883956.0, + "step": 15253 + }, + { + "epoch": 1.8127153891859775, + "grad_norm": 0.2240206498354502, + "learning_rate": 2.4112761410180887e-06, + "loss": 0.8355, + "num_tokens": 63758074020.0, + "step": 15254 + }, + { + "epoch": 1.8128342245989306, + "grad_norm": 0.23168560414491918, + "learning_rate": 2.4107587163379865e-06, + "loss": 0.843, + "num_tokens": 63762235841.0, + "step": 15255 + }, + { + "epoch": 1.8129530600118835, + "grad_norm": 0.21121314941636796, + "learning_rate": 2.410241609746531e-06, + "loss": 0.7961, + "num_tokens": 63766422563.0, + "step": 15256 + }, + { + "epoch": 1.8130718954248366, + "grad_norm": 0.2368761182586501, + "learning_rate": 2.4097248212628705e-06, + "loss": 0.8163, + "num_tokens": 63770611237.0, + "step": 15257 + }, + { + "epoch": 1.8131907308377897, + "grad_norm": 0.2284660687731033, + "learning_rate": 2.409208350906145e-06, + "loss": 0.8186, + "num_tokens": 63774781663.0, + "step": 15258 + }, + { + "epoch": 1.8133095662507426, + "grad_norm": 0.22755789495952788, + "learning_rate": 2.4086921986954795e-06, + "loss": 0.8425, + "num_tokens": 63778942487.0, + "step": 15259 + }, + { + "epoch": 1.8134284016636957, + "grad_norm": 0.22809212523568675, + "learning_rate": 2.4081763646499905e-06, + "loss": 0.8137, + "num_tokens": 63783111473.0, + "step": 15260 + }, + { + "epoch": 1.8135472370766488, + "grad_norm": 0.2285842262825803, + "learning_rate": 2.4076608487887792e-06, + "loss": 0.7999, + "num_tokens": 63787299607.0, + "step": 15261 + }, + { + "epoch": 1.8136660724896019, + "grad_norm": 0.23566149851890647, + "learning_rate": 2.407145651130939e-06, + "loss": 0.7967, + "num_tokens": 63791487405.0, + "step": 15262 + }, + { + "epoch": 1.813784907902555, + "grad_norm": 0.22375333294743685, + "learning_rate": 2.40663077169555e-06, + "loss": 0.8278, + "num_tokens": 63795653730.0, + "step": 15263 + }, + { + "epoch": 1.813903743315508, + "grad_norm": 0.22577482381183142, + "learning_rate": 2.406116210501675e-06, + "loss": 0.7897, + "num_tokens": 63799842835.0, + "step": 15264 + }, + { + "epoch": 1.8140225787284612, + "grad_norm": 0.22831004051059542, + "learning_rate": 2.4056019675683747e-06, + "loss": 0.843, + "num_tokens": 63804030939.0, + "step": 15265 + }, + { + "epoch": 1.8141414141414143, + "grad_norm": 0.23717091798863757, + "learning_rate": 2.4050880429146923e-06, + "loss": 0.7904, + "num_tokens": 63808221610.0, + "step": 15266 + }, + { + "epoch": 1.8142602495543672, + "grad_norm": 0.22133594764397027, + "learning_rate": 2.4045744365596603e-06, + "loss": 0.7945, + "num_tokens": 63812412132.0, + "step": 15267 + }, + { + "epoch": 1.8143790849673203, + "grad_norm": 0.22258794850433544, + "learning_rate": 2.4040611485222997e-06, + "loss": 0.827, + "num_tokens": 63816585658.0, + "step": 15268 + }, + { + "epoch": 1.8144979203802734, + "grad_norm": 0.23845887371376973, + "learning_rate": 2.403548178821616e-06, + "loss": 0.8176, + "num_tokens": 63820767995.0, + "step": 15269 + }, + { + "epoch": 1.8146167557932262, + "grad_norm": 0.22452024147421273, + "learning_rate": 2.403035527476608e-06, + "loss": 0.8403, + "num_tokens": 63824957134.0, + "step": 15270 + }, + { + "epoch": 1.8147355912061793, + "grad_norm": 0.22924669023331876, + "learning_rate": 2.402523194506263e-06, + "loss": 0.7756, + "num_tokens": 63829102730.0, + "step": 15271 + }, + { + "epoch": 1.8148544266191324, + "grad_norm": 0.22751210758594959, + "learning_rate": 2.402011179929553e-06, + "loss": 0.825, + "num_tokens": 63833290484.0, + "step": 15272 + }, + { + "epoch": 1.8149732620320855, + "grad_norm": 0.23130693142363304, + "learning_rate": 2.4014994837654397e-06, + "loss": 0.798, + "num_tokens": 63837479248.0, + "step": 15273 + }, + { + "epoch": 1.8150920974450386, + "grad_norm": 0.2356476336766527, + "learning_rate": 2.4009881060328723e-06, + "loss": 0.7974, + "num_tokens": 63841663982.0, + "step": 15274 + }, + { + "epoch": 1.8152109328579917, + "grad_norm": 0.2162010233150692, + "learning_rate": 2.400477046750789e-06, + "loss": 0.8263, + "num_tokens": 63845853026.0, + "step": 15275 + }, + { + "epoch": 1.8153297682709448, + "grad_norm": 0.24789403362057463, + "learning_rate": 2.399966305938116e-06, + "loss": 0.7742, + "num_tokens": 63850024384.0, + "step": 15276 + }, + { + "epoch": 1.815448603683898, + "grad_norm": 0.22324176329781756, + "learning_rate": 2.3994558836137683e-06, + "loss": 0.7698, + "num_tokens": 63854198424.0, + "step": 15277 + }, + { + "epoch": 1.8155674390968508, + "grad_norm": 0.2191303270242591, + "learning_rate": 2.398945779796649e-06, + "loss": 0.8049, + "num_tokens": 63858387985.0, + "step": 15278 + }, + { + "epoch": 1.815686274509804, + "grad_norm": 0.22302894417826202, + "learning_rate": 2.3984359945056483e-06, + "loss": 0.8269, + "num_tokens": 63862554559.0, + "step": 15279 + }, + { + "epoch": 1.815805109922757, + "grad_norm": 0.23966537640608487, + "learning_rate": 2.397926527759645e-06, + "loss": 0.8371, + "num_tokens": 63866744328.0, + "step": 15280 + }, + { + "epoch": 1.8159239453357099, + "grad_norm": 0.21955694691174063, + "learning_rate": 2.397417379577507e-06, + "loss": 0.8299, + "num_tokens": 63870930057.0, + "step": 15281 + }, + { + "epoch": 1.816042780748663, + "grad_norm": 0.23394315502085827, + "learning_rate": 2.396908549978089e-06, + "loss": 0.8194, + "num_tokens": 63875117950.0, + "step": 15282 + }, + { + "epoch": 1.816161616161616, + "grad_norm": 0.24047721808100883, + "learning_rate": 2.396400038980235e-06, + "loss": 0.7835, + "num_tokens": 63879308231.0, + "step": 15283 + }, + { + "epoch": 1.8162804515745692, + "grad_norm": 0.22957792587861087, + "learning_rate": 2.3958918466027775e-06, + "loss": 0.8092, + "num_tokens": 63883479494.0, + "step": 15284 + }, + { + "epoch": 1.8163992869875223, + "grad_norm": 0.2216652691460284, + "learning_rate": 2.395383972864536e-06, + "loss": 0.8324, + "num_tokens": 63887669196.0, + "step": 15285 + }, + { + "epoch": 1.8165181224004754, + "grad_norm": 0.21965571646645785, + "learning_rate": 2.3948764177843182e-06, + "loss": 0.7667, + "num_tokens": 63891858614.0, + "step": 15286 + }, + { + "epoch": 1.8166369578134285, + "grad_norm": 0.23303536345830442, + "learning_rate": 2.3943691813809215e-06, + "loss": 0.814, + "num_tokens": 63896037541.0, + "step": 15287 + }, + { + "epoch": 1.8167557932263816, + "grad_norm": 0.21233116748479042, + "learning_rate": 2.393862263673131e-06, + "loss": 0.8086, + "num_tokens": 63900222226.0, + "step": 15288 + }, + { + "epoch": 1.8168746286393347, + "grad_norm": 0.21957437683565945, + "learning_rate": 2.3933556646797185e-06, + "loss": 0.7867, + "num_tokens": 63904377639.0, + "step": 15289 + }, + { + "epoch": 1.8169934640522876, + "grad_norm": 0.2195763184997149, + "learning_rate": 2.3928493844194454e-06, + "loss": 0.8337, + "num_tokens": 63908566941.0, + "step": 15290 + }, + { + "epoch": 1.8171122994652407, + "grad_norm": 0.21878101377665563, + "learning_rate": 2.392343422911061e-06, + "loss": 0.7789, + "num_tokens": 63912754471.0, + "step": 15291 + }, + { + "epoch": 1.8172311348781935, + "grad_norm": 0.22091345536468038, + "learning_rate": 2.391837780173303e-06, + "loss": 0.7767, + "num_tokens": 63916944847.0, + "step": 15292 + }, + { + "epoch": 1.8173499702911466, + "grad_norm": 0.23100624167790404, + "learning_rate": 2.3913324562248973e-06, + "loss": 0.8102, + "num_tokens": 63921112233.0, + "step": 15293 + }, + { + "epoch": 1.8174688057040997, + "grad_norm": 0.23046727069520595, + "learning_rate": 2.3908274510845582e-06, + "loss": 0.7986, + "num_tokens": 63925301962.0, + "step": 15294 + }, + { + "epoch": 1.8175876411170528, + "grad_norm": 0.23166609407307812, + "learning_rate": 2.3903227647709857e-06, + "loss": 0.8193, + "num_tokens": 63929491432.0, + "step": 15295 + }, + { + "epoch": 1.817706476530006, + "grad_norm": 0.2303182793177398, + "learning_rate": 2.389818397302869e-06, + "loss": 0.8344, + "num_tokens": 63933679387.0, + "step": 15296 + }, + { + "epoch": 1.817825311942959, + "grad_norm": 0.21916993726718137, + "learning_rate": 2.3893143486988916e-06, + "loss": 0.7887, + "num_tokens": 63937867362.0, + "step": 15297 + }, + { + "epoch": 1.8179441473559121, + "grad_norm": 0.2253440447548155, + "learning_rate": 2.3888106189777167e-06, + "loss": 0.795, + "num_tokens": 63942055743.0, + "step": 15298 + }, + { + "epoch": 1.8180629827688652, + "grad_norm": 0.2157350187122833, + "learning_rate": 2.3883072081580004e-06, + "loss": 0.7981, + "num_tokens": 63946222354.0, + "step": 15299 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.22112006667680217, + "learning_rate": 2.387804116258384e-06, + "loss": 0.862, + "num_tokens": 63950411249.0, + "step": 15300 + }, + { + "epoch": 1.8183006535947712, + "grad_norm": 0.22051395474167418, + "learning_rate": 2.3873013432975e-06, + "loss": 0.7778, + "num_tokens": 63954601378.0, + "step": 15301 + }, + { + "epoch": 1.8184194890077243, + "grad_norm": 0.22278521850513688, + "learning_rate": 2.386798889293966e-06, + "loss": 0.8013, + "num_tokens": 63958789543.0, + "step": 15302 + }, + { + "epoch": 1.8185383244206772, + "grad_norm": 0.22052511578236075, + "learning_rate": 2.386296754266393e-06, + "loss": 0.787, + "num_tokens": 63962976904.0, + "step": 15303 + }, + { + "epoch": 1.8186571598336303, + "grad_norm": 0.23258705554391912, + "learning_rate": 2.385794938233375e-06, + "loss": 0.829, + "num_tokens": 63967166453.0, + "step": 15304 + }, + { + "epoch": 1.8187759952465834, + "grad_norm": 0.23609642301048667, + "learning_rate": 2.3852934412134956e-06, + "loss": 0.7995, + "num_tokens": 63971347302.0, + "step": 15305 + }, + { + "epoch": 1.8188948306595365, + "grad_norm": 0.2191126873437711, + "learning_rate": 2.384792263225326e-06, + "loss": 0.7875, + "num_tokens": 63975505416.0, + "step": 15306 + }, + { + "epoch": 1.8190136660724896, + "grad_norm": 0.22494210748466914, + "learning_rate": 2.3842914042874283e-06, + "loss": 0.8135, + "num_tokens": 63979695565.0, + "step": 15307 + }, + { + "epoch": 1.8191325014854427, + "grad_norm": 0.24034175445833317, + "learning_rate": 2.3837908644183486e-06, + "loss": 0.8098, + "num_tokens": 63983884217.0, + "step": 15308 + }, + { + "epoch": 1.8192513368983958, + "grad_norm": 0.23551030435416062, + "learning_rate": 2.383290643636628e-06, + "loss": 0.8263, + "num_tokens": 63988073200.0, + "step": 15309 + }, + { + "epoch": 1.819370172311349, + "grad_norm": 0.232757391623786, + "learning_rate": 2.382790741960788e-06, + "loss": 0.8123, + "num_tokens": 63992261412.0, + "step": 15310 + }, + { + "epoch": 1.819489007724302, + "grad_norm": 0.2195176945663906, + "learning_rate": 2.3822911594093412e-06, + "loss": 0.8127, + "num_tokens": 63996428102.0, + "step": 15311 + }, + { + "epoch": 1.8196078431372549, + "grad_norm": 0.2220752745978316, + "learning_rate": 2.3817918960007907e-06, + "loss": 0.8336, + "num_tokens": 64000607478.0, + "step": 15312 + }, + { + "epoch": 1.819726678550208, + "grad_norm": 0.22767956286917965, + "learning_rate": 2.381292951753625e-06, + "loss": 0.8157, + "num_tokens": 64004796536.0, + "step": 15313 + }, + { + "epoch": 1.819845513963161, + "grad_norm": 0.22280974826330557, + "learning_rate": 2.3807943266863213e-06, + "loss": 0.8174, + "num_tokens": 64008929870.0, + "step": 15314 + }, + { + "epoch": 1.819964349376114, + "grad_norm": 0.21926869503975835, + "learning_rate": 2.3802960208173464e-06, + "loss": 0.8076, + "num_tokens": 64013101186.0, + "step": 15315 + }, + { + "epoch": 1.820083184789067, + "grad_norm": 0.2206557450152597, + "learning_rate": 2.3797980341651536e-06, + "loss": 0.7846, + "num_tokens": 64017290821.0, + "step": 15316 + }, + { + "epoch": 1.8202020202020202, + "grad_norm": 0.21919977148826317, + "learning_rate": 2.379300366748186e-06, + "loss": 0.8273, + "num_tokens": 64021479763.0, + "step": 15317 + }, + { + "epoch": 1.8203208556149733, + "grad_norm": 0.21497017670570812, + "learning_rate": 2.378803018584872e-06, + "loss": 0.8072, + "num_tokens": 64025616297.0, + "step": 15318 + }, + { + "epoch": 1.8204396910279264, + "grad_norm": 0.21775395731027725, + "learning_rate": 2.3783059896936317e-06, + "loss": 0.8166, + "num_tokens": 64029779251.0, + "step": 15319 + }, + { + "epoch": 1.8205585264408795, + "grad_norm": 0.21456332085784005, + "learning_rate": 2.3778092800928705e-06, + "loss": 0.794, + "num_tokens": 64033969486.0, + "step": 15320 + }, + { + "epoch": 1.8206773618538326, + "grad_norm": 0.22290537190214893, + "learning_rate": 2.3773128898009846e-06, + "loss": 0.7899, + "num_tokens": 64038139609.0, + "step": 15321 + }, + { + "epoch": 1.8207961972667857, + "grad_norm": 0.21625571345122377, + "learning_rate": 2.376816818836356e-06, + "loss": 0.8237, + "num_tokens": 64042323076.0, + "step": 15322 + }, + { + "epoch": 1.8209150326797385, + "grad_norm": 0.22224363375522035, + "learning_rate": 2.376321067217357e-06, + "loss": 0.844, + "num_tokens": 64046512391.0, + "step": 15323 + }, + { + "epoch": 1.8210338680926916, + "grad_norm": 0.22562607945925978, + "learning_rate": 2.375825634962346e-06, + "loss": 0.7883, + "num_tokens": 64050691718.0, + "step": 15324 + }, + { + "epoch": 1.8211527035056447, + "grad_norm": 0.22401004809306296, + "learning_rate": 2.3753305220896706e-06, + "loss": 0.7821, + "num_tokens": 64054849776.0, + "step": 15325 + }, + { + "epoch": 1.8212715389185976, + "grad_norm": 0.21784589765122978, + "learning_rate": 2.3748357286176684e-06, + "loss": 0.8035, + "num_tokens": 64059038435.0, + "step": 15326 + }, + { + "epoch": 1.8213903743315507, + "grad_norm": 0.22808584184232458, + "learning_rate": 2.3743412545646586e-06, + "loss": 0.804, + "num_tokens": 64063203384.0, + "step": 15327 + }, + { + "epoch": 1.8215092097445038, + "grad_norm": 0.21584638376678056, + "learning_rate": 2.373847099948957e-06, + "loss": 0.7943, + "num_tokens": 64067371676.0, + "step": 15328 + }, + { + "epoch": 1.821628045157457, + "grad_norm": 0.20668788636212265, + "learning_rate": 2.3733532647888637e-06, + "loss": 0.7955, + "num_tokens": 64071531468.0, + "step": 15329 + }, + { + "epoch": 1.82174688057041, + "grad_norm": 0.22499961476189875, + "learning_rate": 2.372859749102666e-06, + "loss": 0.8032, + "num_tokens": 64075704100.0, + "step": 15330 + }, + { + "epoch": 1.821865715983363, + "grad_norm": 0.22143138961326878, + "learning_rate": 2.3723665529086425e-06, + "loss": 0.8366, + "num_tokens": 64079869165.0, + "step": 15331 + }, + { + "epoch": 1.8219845513963162, + "grad_norm": 0.21612462712173222, + "learning_rate": 2.371873676225055e-06, + "loss": 0.8143, + "num_tokens": 64084051362.0, + "step": 15332 + }, + { + "epoch": 1.8221033868092693, + "grad_norm": 0.21755663836572628, + "learning_rate": 2.371381119070157e-06, + "loss": 0.7693, + "num_tokens": 64088228846.0, + "step": 15333 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 0.23334342532401794, + "learning_rate": 2.3708888814621897e-06, + "loss": 0.8458, + "num_tokens": 64092358412.0, + "step": 15334 + }, + { + "epoch": 1.8223410576351753, + "grad_norm": 0.21975467674526125, + "learning_rate": 2.3703969634193837e-06, + "loss": 0.8233, + "num_tokens": 64096548107.0, + "step": 15335 + }, + { + "epoch": 1.8224598930481284, + "grad_norm": 0.2176169863791264, + "learning_rate": 2.3699053649599568e-06, + "loss": 0.8208, + "num_tokens": 64100708899.0, + "step": 15336 + }, + { + "epoch": 1.8225787284610813, + "grad_norm": 0.23304391291662813, + "learning_rate": 2.369414086102112e-06, + "loss": 0.786, + "num_tokens": 64104880898.0, + "step": 15337 + }, + { + "epoch": 1.8226975638740344, + "grad_norm": 0.2292425084059167, + "learning_rate": 2.368923126864044e-06, + "loss": 0.7924, + "num_tokens": 64109068322.0, + "step": 15338 + }, + { + "epoch": 1.8228163992869875, + "grad_norm": 0.22165878797051436, + "learning_rate": 2.3684324872639357e-06, + "loss": 0.7649, + "num_tokens": 64113249972.0, + "step": 15339 + }, + { + "epoch": 1.8229352346999406, + "grad_norm": 0.23065196371976723, + "learning_rate": 2.3679421673199552e-06, + "loss": 0.7978, + "num_tokens": 64117401528.0, + "step": 15340 + }, + { + "epoch": 1.8230540701128937, + "grad_norm": 0.22228990225816386, + "learning_rate": 2.3674521670502625e-06, + "loss": 0.7986, + "num_tokens": 64121564637.0, + "step": 15341 + }, + { + "epoch": 1.8231729055258468, + "grad_norm": 0.23575015302291674, + "learning_rate": 2.366962486473005e-06, + "loss": 0.8122, + "num_tokens": 64125754932.0, + "step": 15342 + }, + { + "epoch": 1.8232917409387999, + "grad_norm": 0.21703872506454766, + "learning_rate": 2.3664731256063137e-06, + "loss": 0.816, + "num_tokens": 64129943394.0, + "step": 15343 + }, + { + "epoch": 1.823410576351753, + "grad_norm": 0.21741088888837473, + "learning_rate": 2.3659840844683137e-06, + "loss": 0.791, + "num_tokens": 64134113191.0, + "step": 15344 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.23656540128407488, + "learning_rate": 2.365495363077115e-06, + "loss": 0.7749, + "num_tokens": 64138302489.0, + "step": 15345 + }, + { + "epoch": 1.823648247177659, + "grad_norm": 0.21845825509513397, + "learning_rate": 2.365006961450817e-06, + "loss": 0.7889, + "num_tokens": 64142450501.0, + "step": 15346 + }, + { + "epoch": 1.823767082590612, + "grad_norm": 0.2129625190580255, + "learning_rate": 2.3645188796075077e-06, + "loss": 0.7761, + "num_tokens": 64146639502.0, + "step": 15347 + }, + { + "epoch": 1.823885918003565, + "grad_norm": 0.23193424808452706, + "learning_rate": 2.3640311175652614e-06, + "loss": 0.8142, + "num_tokens": 64150829967.0, + "step": 15348 + }, + { + "epoch": 1.824004753416518, + "grad_norm": 0.23416323484816517, + "learning_rate": 2.3635436753421416e-06, + "loss": 0.8206, + "num_tokens": 64154956235.0, + "step": 15349 + }, + { + "epoch": 1.8241235888294711, + "grad_norm": 0.22979638539940486, + "learning_rate": 2.3630565529561992e-06, + "loss": 0.7724, + "num_tokens": 64159144126.0, + "step": 15350 + }, + { + "epoch": 1.8242424242424242, + "grad_norm": 0.21384740509759895, + "learning_rate": 2.3625697504254755e-06, + "loss": 0.8045, + "num_tokens": 64163330712.0, + "step": 15351 + }, + { + "epoch": 1.8243612596553773, + "grad_norm": 0.234634253517262, + "learning_rate": 2.3620832677679995e-06, + "loss": 0.81, + "num_tokens": 64167497401.0, + "step": 15352 + }, + { + "epoch": 1.8244800950683304, + "grad_norm": 0.22716018477908204, + "learning_rate": 2.361597105001782e-06, + "loss": 0.8074, + "num_tokens": 64171682604.0, + "step": 15353 + }, + { + "epoch": 1.8245989304812835, + "grad_norm": 0.2168311856632655, + "learning_rate": 2.3611112621448326e-06, + "loss": 0.774, + "num_tokens": 64175868917.0, + "step": 15354 + }, + { + "epoch": 1.8247177658942366, + "grad_norm": 0.23021534166085303, + "learning_rate": 2.360625739215142e-06, + "loss": 0.8034, + "num_tokens": 64180046152.0, + "step": 15355 + }, + { + "epoch": 1.8248366013071895, + "grad_norm": 0.23099639357004623, + "learning_rate": 2.3601405362306913e-06, + "loss": 0.8006, + "num_tokens": 64184235316.0, + "step": 15356 + }, + { + "epoch": 1.8249554367201426, + "grad_norm": 0.23358211872961035, + "learning_rate": 2.359655653209449e-06, + "loss": 0.7872, + "num_tokens": 64188401159.0, + "step": 15357 + }, + { + "epoch": 1.8250742721330957, + "grad_norm": 0.23176872693151448, + "learning_rate": 2.35917109016937e-06, + "loss": 0.7898, + "num_tokens": 64192591080.0, + "step": 15358 + }, + { + "epoch": 1.8251931075460486, + "grad_norm": 0.22086410643559198, + "learning_rate": 2.3586868471284002e-06, + "loss": 0.8186, + "num_tokens": 64196780892.0, + "step": 15359 + }, + { + "epoch": 1.8253119429590017, + "grad_norm": 0.23466624250284082, + "learning_rate": 2.3582029241044753e-06, + "loss": 0.7712, + "num_tokens": 64200941785.0, + "step": 15360 + }, + { + "epoch": 1.8254307783719548, + "grad_norm": 0.22650597698334227, + "learning_rate": 2.357719321115514e-06, + "loss": 0.859, + "num_tokens": 64205114294.0, + "step": 15361 + }, + { + "epoch": 1.8255496137849079, + "grad_norm": 0.2461534794578754, + "learning_rate": 2.3572360381794267e-06, + "loss": 0.7843, + "num_tokens": 64209287400.0, + "step": 15362 + }, + { + "epoch": 1.825668449197861, + "grad_norm": 0.21908285169578037, + "learning_rate": 2.3567530753141118e-06, + "loss": 0.8242, + "num_tokens": 64213474630.0, + "step": 15363 + }, + { + "epoch": 1.825787284610814, + "grad_norm": 0.21733588209853488, + "learning_rate": 2.356270432537452e-06, + "loss": 0.8467, + "num_tokens": 64217662272.0, + "step": 15364 + }, + { + "epoch": 1.8259061200237672, + "grad_norm": 0.22262236899758137, + "learning_rate": 2.355788109867325e-06, + "loss": 0.8127, + "num_tokens": 64221822746.0, + "step": 15365 + }, + { + "epoch": 1.8260249554367203, + "grad_norm": 0.21135715236955183, + "learning_rate": 2.3553061073215883e-06, + "loss": 0.7805, + "num_tokens": 64226011349.0, + "step": 15366 + }, + { + "epoch": 1.8261437908496732, + "grad_norm": 0.23029503539147897, + "learning_rate": 2.3548244249180977e-06, + "loss": 0.8483, + "num_tokens": 64230201454.0, + "step": 15367 + }, + { + "epoch": 1.8262626262626263, + "grad_norm": 0.2276403979409913, + "learning_rate": 2.354343062674688e-06, + "loss": 0.7754, + "num_tokens": 64234391891.0, + "step": 15368 + }, + { + "epoch": 1.8263814616755794, + "grad_norm": 0.21540442328593518, + "learning_rate": 2.3538620206091864e-06, + "loss": 0.8252, + "num_tokens": 64238554733.0, + "step": 15369 + }, + { + "epoch": 1.8265002970885322, + "grad_norm": 0.235087536767248, + "learning_rate": 2.3533812987394063e-06, + "loss": 0.7933, + "num_tokens": 64242712104.0, + "step": 15370 + }, + { + "epoch": 1.8266191325014853, + "grad_norm": 0.22141709949482774, + "learning_rate": 2.352900897083153e-06, + "loss": 0.8037, + "num_tokens": 64246901155.0, + "step": 15371 + }, + { + "epoch": 1.8267379679144384, + "grad_norm": 0.22292709603155308, + "learning_rate": 2.352420815658215e-06, + "loss": 0.8276, + "num_tokens": 64251089781.0, + "step": 15372 + }, + { + "epoch": 1.8268568033273915, + "grad_norm": 0.22939130415578918, + "learning_rate": 2.3519410544823733e-06, + "loss": 0.8408, + "num_tokens": 64255277370.0, + "step": 15373 + }, + { + "epoch": 1.8269756387403446, + "grad_norm": 0.21160346027845417, + "learning_rate": 2.3514616135733937e-06, + "loss": 0.7903, + "num_tokens": 64259466343.0, + "step": 15374 + }, + { + "epoch": 1.8270944741532977, + "grad_norm": 0.22011171240111255, + "learning_rate": 2.3509824929490314e-06, + "loss": 0.7781, + "num_tokens": 64263656492.0, + "step": 15375 + }, + { + "epoch": 1.8272133095662508, + "grad_norm": 0.2234105059612184, + "learning_rate": 2.3505036926270306e-06, + "loss": 0.8371, + "num_tokens": 64267843968.0, + "step": 15376 + }, + { + "epoch": 1.827332144979204, + "grad_norm": 0.21353957326237458, + "learning_rate": 2.350025212625124e-06, + "loss": 0.8242, + "num_tokens": 64272033115.0, + "step": 15377 + }, + { + "epoch": 1.8274509803921568, + "grad_norm": 0.21998617986499794, + "learning_rate": 2.349547052961028e-06, + "loss": 0.8025, + "num_tokens": 64276222448.0, + "step": 15378 + }, + { + "epoch": 1.82756981580511, + "grad_norm": 0.22676286370989127, + "learning_rate": 2.349069213652454e-06, + "loss": 0.8078, + "num_tokens": 64280410651.0, + "step": 15379 + }, + { + "epoch": 1.827688651218063, + "grad_norm": 0.2143504615028306, + "learning_rate": 2.3485916947170955e-06, + "loss": 0.8134, + "num_tokens": 64284600076.0, + "step": 15380 + }, + { + "epoch": 1.8278074866310159, + "grad_norm": 0.24985010829076632, + "learning_rate": 2.348114496172639e-06, + "loss": 0.8282, + "num_tokens": 64288788453.0, + "step": 15381 + }, + { + "epoch": 1.827926322043969, + "grad_norm": 0.23453321288754522, + "learning_rate": 2.3476376180367537e-06, + "loss": 0.8267, + "num_tokens": 64292950800.0, + "step": 15382 + }, + { + "epoch": 1.828045157456922, + "grad_norm": 0.22597083414210167, + "learning_rate": 2.3471610603271046e-06, + "loss": 0.817, + "num_tokens": 64297140632.0, + "step": 15383 + }, + { + "epoch": 1.8281639928698752, + "grad_norm": 0.2337925671301109, + "learning_rate": 2.3466848230613345e-06, + "loss": 0.7933, + "num_tokens": 64301317933.0, + "step": 15384 + }, + { + "epoch": 1.8282828282828283, + "grad_norm": 0.23586480710968197, + "learning_rate": 2.346208906257083e-06, + "loss": 0.8501, + "num_tokens": 64305487987.0, + "step": 15385 + }, + { + "epoch": 1.8284016636957814, + "grad_norm": 0.23090838431025795, + "learning_rate": 2.345733309931976e-06, + "loss": 0.8507, + "num_tokens": 64309677319.0, + "step": 15386 + }, + { + "epoch": 1.8285204991087345, + "grad_norm": 0.24106816602745, + "learning_rate": 2.345258034103625e-06, + "loss": 0.8026, + "num_tokens": 64313865873.0, + "step": 15387 + }, + { + "epoch": 1.8286393345216876, + "grad_norm": 0.22357028671600232, + "learning_rate": 2.344783078789632e-06, + "loss": 0.8165, + "num_tokens": 64318044666.0, + "step": 15388 + }, + { + "epoch": 1.8287581699346407, + "grad_norm": 0.21747872472913873, + "learning_rate": 2.344308444007586e-06, + "loss": 0.8233, + "num_tokens": 64322232888.0, + "step": 15389 + }, + { + "epoch": 1.8288770053475936, + "grad_norm": 0.24422069267716806, + "learning_rate": 2.3438341297750626e-06, + "loss": 0.8043, + "num_tokens": 64326421398.0, + "step": 15390 + }, + { + "epoch": 1.8289958407605467, + "grad_norm": 0.2280505067074017, + "learning_rate": 2.343360136109627e-06, + "loss": 0.8227, + "num_tokens": 64330610479.0, + "step": 15391 + }, + { + "epoch": 1.8291146761734995, + "grad_norm": 0.21544068768500305, + "learning_rate": 2.3428864630288366e-06, + "loss": 0.7888, + "num_tokens": 64334799262.0, + "step": 15392 + }, + { + "epoch": 1.8292335115864526, + "grad_norm": 0.234607158620116, + "learning_rate": 2.342413110550232e-06, + "loss": 0.8048, + "num_tokens": 64338989715.0, + "step": 15393 + }, + { + "epoch": 1.8293523469994057, + "grad_norm": 0.2222784563218793, + "learning_rate": 2.3419400786913397e-06, + "loss": 0.8207, + "num_tokens": 64343158787.0, + "step": 15394 + }, + { + "epoch": 1.8294711824123588, + "grad_norm": 0.22954652569940354, + "learning_rate": 2.3414673674696813e-06, + "loss": 0.8339, + "num_tokens": 64347348480.0, + "step": 15395 + }, + { + "epoch": 1.829590017825312, + "grad_norm": 0.23039550171466752, + "learning_rate": 2.3409949769027598e-06, + "loss": 0.8284, + "num_tokens": 64351517922.0, + "step": 15396 + }, + { + "epoch": 1.829708853238265, + "grad_norm": 0.21869912408260997, + "learning_rate": 2.340522907008071e-06, + "loss": 0.7781, + "num_tokens": 64355688399.0, + "step": 15397 + }, + { + "epoch": 1.8298276886512181, + "grad_norm": 0.22487617893527215, + "learning_rate": 2.3400511578031003e-06, + "loss": 0.8023, + "num_tokens": 64359837139.0, + "step": 15398 + }, + { + "epoch": 1.8299465240641712, + "grad_norm": 0.2134253457303661, + "learning_rate": 2.339579729305313e-06, + "loss": 0.784, + "num_tokens": 64364025010.0, + "step": 15399 + }, + { + "epoch": 1.8300653594771243, + "grad_norm": 0.2227304736829146, + "learning_rate": 2.3391086215321696e-06, + "loss": 0.8286, + "num_tokens": 64368202306.0, + "step": 15400 + }, + { + "epoch": 1.8301841948900772, + "grad_norm": 0.22149198847242593, + "learning_rate": 2.3386378345011184e-06, + "loss": 0.796, + "num_tokens": 64372390191.0, + "step": 15401 + }, + { + "epoch": 1.8303030303030303, + "grad_norm": 0.22467607271593007, + "learning_rate": 2.3381673682295918e-06, + "loss": 0.8172, + "num_tokens": 64376579055.0, + "step": 15402 + }, + { + "epoch": 1.8304218657159832, + "grad_norm": 0.2296682154707399, + "learning_rate": 2.3376972227350145e-06, + "loss": 0.8027, + "num_tokens": 64380767757.0, + "step": 15403 + }, + { + "epoch": 1.8305407011289363, + "grad_norm": 0.21887297919944915, + "learning_rate": 2.337227398034797e-06, + "loss": 0.7914, + "num_tokens": 64384956234.0, + "step": 15404 + }, + { + "epoch": 1.8306595365418894, + "grad_norm": 0.21008100130424337, + "learning_rate": 2.336757894146338e-06, + "loss": 0.7882, + "num_tokens": 64389146399.0, + "step": 15405 + }, + { + "epoch": 1.8307783719548425, + "grad_norm": 0.223155512934985, + "learning_rate": 2.3362887110870256e-06, + "loss": 0.8442, + "num_tokens": 64393303280.0, + "step": 15406 + }, + { + "epoch": 1.8308972073677956, + "grad_norm": 0.2161015563737812, + "learning_rate": 2.335819848874235e-06, + "loss": 0.791, + "num_tokens": 64397464863.0, + "step": 15407 + }, + { + "epoch": 1.8310160427807487, + "grad_norm": 0.21877830731354375, + "learning_rate": 2.33535130752533e-06, + "loss": 0.7869, + "num_tokens": 64401652365.0, + "step": 15408 + }, + { + "epoch": 1.8311348781937018, + "grad_norm": 0.22150933753144209, + "learning_rate": 2.3348830870576626e-06, + "loss": 0.7983, + "num_tokens": 64405840954.0, + "step": 15409 + }, + { + "epoch": 1.831253713606655, + "grad_norm": 0.21455192621488187, + "learning_rate": 2.3344151874885696e-06, + "loss": 0.8052, + "num_tokens": 64410030431.0, + "step": 15410 + }, + { + "epoch": 1.831372549019608, + "grad_norm": 0.21922304815154012, + "learning_rate": 2.3339476088353826e-06, + "loss": 0.8095, + "num_tokens": 64414185089.0, + "step": 15411 + }, + { + "epoch": 1.8314913844325609, + "grad_norm": 0.2273105473327672, + "learning_rate": 2.3334803511154165e-06, + "loss": 0.7954, + "num_tokens": 64418373692.0, + "step": 15412 + }, + { + "epoch": 1.831610219845514, + "grad_norm": 0.22596335291496053, + "learning_rate": 2.3330134143459745e-06, + "loss": 0.7912, + "num_tokens": 64422564530.0, + "step": 15413 + }, + { + "epoch": 1.831729055258467, + "grad_norm": 0.2864230799705483, + "learning_rate": 2.332546798544351e-06, + "loss": 0.7935, + "num_tokens": 64426698948.0, + "step": 15414 + }, + { + "epoch": 1.83184789067142, + "grad_norm": 0.2109152223216597, + "learning_rate": 2.3320805037278243e-06, + "loss": 0.7882, + "num_tokens": 64430881700.0, + "step": 15415 + }, + { + "epoch": 1.831966726084373, + "grad_norm": 0.21179665376547185, + "learning_rate": 2.3316145299136617e-06, + "loss": 0.8053, + "num_tokens": 64435070024.0, + "step": 15416 + }, + { + "epoch": 1.8320855614973262, + "grad_norm": 0.21348967395987054, + "learning_rate": 2.331148877119123e-06, + "loss": 0.8343, + "num_tokens": 64439258391.0, + "step": 15417 + }, + { + "epoch": 1.8322043969102793, + "grad_norm": 0.21336281497908485, + "learning_rate": 2.330683545361451e-06, + "loss": 0.813, + "num_tokens": 64443402737.0, + "step": 15418 + }, + { + "epoch": 1.8323232323232324, + "grad_norm": 0.20263294618892308, + "learning_rate": 2.3302185346578806e-06, + "loss": 0.7864, + "num_tokens": 64447563277.0, + "step": 15419 + }, + { + "epoch": 1.8324420677361855, + "grad_norm": 0.2310049827030434, + "learning_rate": 2.3297538450256302e-06, + "loss": 0.8023, + "num_tokens": 64451741917.0, + "step": 15420 + }, + { + "epoch": 1.8325609031491386, + "grad_norm": 0.21450964802437705, + "learning_rate": 2.3292894764819095e-06, + "loss": 0.7783, + "num_tokens": 64455924929.0, + "step": 15421 + }, + { + "epoch": 1.8326797385620917, + "grad_norm": 0.21538088155210364, + "learning_rate": 2.3288254290439156e-06, + "loss": 0.802, + "num_tokens": 64460107101.0, + "step": 15422 + }, + { + "epoch": 1.8327985739750445, + "grad_norm": 0.21381811415706994, + "learning_rate": 2.3283617027288323e-06, + "loss": 0.8067, + "num_tokens": 64464292872.0, + "step": 15423 + }, + { + "epoch": 1.8329174093879976, + "grad_norm": 0.21233958778602832, + "learning_rate": 2.3278982975538383e-06, + "loss": 0.7838, + "num_tokens": 64468475084.0, + "step": 15424 + }, + { + "epoch": 1.8330362448009507, + "grad_norm": 0.21253214646004875, + "learning_rate": 2.327435213536089e-06, + "loss": 0.8389, + "num_tokens": 64472634364.0, + "step": 15425 + }, + { + "epoch": 1.8331550802139036, + "grad_norm": 0.2261295802568843, + "learning_rate": 2.3269724506927367e-06, + "loss": 0.8055, + "num_tokens": 64476824217.0, + "step": 15426 + }, + { + "epoch": 1.8332739156268567, + "grad_norm": 0.21765507805531906, + "learning_rate": 2.326510009040919e-06, + "loss": 0.8164, + "num_tokens": 64480964872.0, + "step": 15427 + }, + { + "epoch": 1.8333927510398098, + "grad_norm": 0.2106671854786052, + "learning_rate": 2.326047888597762e-06, + "loss": 0.7846, + "num_tokens": 64485139242.0, + "step": 15428 + }, + { + "epoch": 1.833511586452763, + "grad_norm": 0.21819045018386854, + "learning_rate": 2.3255860893803776e-06, + "loss": 0.8071, + "num_tokens": 64489327508.0, + "step": 15429 + }, + { + "epoch": 1.833630421865716, + "grad_norm": 0.2471531381572331, + "learning_rate": 2.3251246114058702e-06, + "loss": 0.7887, + "num_tokens": 64493481661.0, + "step": 15430 + }, + { + "epoch": 1.833749257278669, + "grad_norm": 0.21071918895975797, + "learning_rate": 2.324663454691329e-06, + "loss": 0.7798, + "num_tokens": 64497670688.0, + "step": 15431 + }, + { + "epoch": 1.8338680926916222, + "grad_norm": 0.21773752932408771, + "learning_rate": 2.3242026192538316e-06, + "loss": 0.8025, + "num_tokens": 64501860174.0, + "step": 15432 + }, + { + "epoch": 1.8339869281045753, + "grad_norm": 0.2179713169706541, + "learning_rate": 2.3237421051104447e-06, + "loss": 0.8162, + "num_tokens": 64506049220.0, + "step": 15433 + }, + { + "epoch": 1.8341057635175282, + "grad_norm": 0.2171683978375068, + "learning_rate": 2.3232819122782235e-06, + "loss": 0.8257, + "num_tokens": 64510211492.0, + "step": 15434 + }, + { + "epoch": 1.8342245989304813, + "grad_norm": 0.22882409179797492, + "learning_rate": 2.3228220407742092e-06, + "loss": 0.8121, + "num_tokens": 64514401187.0, + "step": 15435 + }, + { + "epoch": 1.8343434343434344, + "grad_norm": 0.2446318396308233, + "learning_rate": 2.3223624906154326e-06, + "loss": 0.8207, + "num_tokens": 64518591226.0, + "step": 15436 + }, + { + "epoch": 1.8344622697563873, + "grad_norm": 0.23849742624944834, + "learning_rate": 2.321903261818914e-06, + "loss": 0.8394, + "num_tokens": 64522739488.0, + "step": 15437 + }, + { + "epoch": 1.8345811051693404, + "grad_norm": 0.24179720067082963, + "learning_rate": 2.321444354401658e-06, + "loss": 0.8381, + "num_tokens": 64526928033.0, + "step": 15438 + }, + { + "epoch": 1.8346999405822935, + "grad_norm": 0.2205512101726393, + "learning_rate": 2.3209857683806608e-06, + "loss": 0.7733, + "num_tokens": 64531117666.0, + "step": 15439 + }, + { + "epoch": 1.8348187759952466, + "grad_norm": 0.22213634510656718, + "learning_rate": 2.320527503772905e-06, + "loss": 0.7752, + "num_tokens": 64535279484.0, + "step": 15440 + }, + { + "epoch": 1.8349376114081997, + "grad_norm": 0.2207798289578142, + "learning_rate": 2.3200695605953622e-06, + "loss": 0.8287, + "num_tokens": 64539468054.0, + "step": 15441 + }, + { + "epoch": 1.8350564468211528, + "grad_norm": 0.22570989887397877, + "learning_rate": 2.3196119388649893e-06, + "loss": 0.7931, + "num_tokens": 64543656390.0, + "step": 15442 + }, + { + "epoch": 1.8351752822341059, + "grad_norm": 0.2280062661449174, + "learning_rate": 2.319154638598737e-06, + "loss": 0.82, + "num_tokens": 64547845830.0, + "step": 15443 + }, + { + "epoch": 1.835294117647059, + "grad_norm": 0.21818572098227598, + "learning_rate": 2.3186976598135383e-06, + "loss": 0.7995, + "num_tokens": 64552003638.0, + "step": 15444 + }, + { + "epoch": 1.8354129530600118, + "grad_norm": 0.24566046128870267, + "learning_rate": 2.3182410025263187e-06, + "loss": 0.7817, + "num_tokens": 64556193147.0, + "step": 15445 + }, + { + "epoch": 1.835531788472965, + "grad_norm": 0.22645486410128488, + "learning_rate": 2.317784666753988e-06, + "loss": 0.7949, + "num_tokens": 64560318992.0, + "step": 15446 + }, + { + "epoch": 1.835650623885918, + "grad_norm": 0.21547897598408636, + "learning_rate": 2.3173286525134464e-06, + "loss": 0.7842, + "num_tokens": 64564509012.0, + "step": 15447 + }, + { + "epoch": 1.835769459298871, + "grad_norm": 0.24696965807280197, + "learning_rate": 2.3168729598215797e-06, + "loss": 0.7697, + "num_tokens": 64568698068.0, + "step": 15448 + }, + { + "epoch": 1.835888294711824, + "grad_norm": 0.2299632513342775, + "learning_rate": 2.316417588695267e-06, + "loss": 0.8114, + "num_tokens": 64572887844.0, + "step": 15449 + }, + { + "epoch": 1.8360071301247771, + "grad_norm": 0.22289632288189518, + "learning_rate": 2.315962539151372e-06, + "loss": 0.7947, + "num_tokens": 64577075307.0, + "step": 15450 + }, + { + "epoch": 1.8361259655377302, + "grad_norm": 0.23569260129265765, + "learning_rate": 2.3155078112067452e-06, + "loss": 0.8304, + "num_tokens": 64581263382.0, + "step": 15451 + }, + { + "epoch": 1.8362448009506833, + "grad_norm": 0.2243734104515241, + "learning_rate": 2.3150534048782262e-06, + "loss": 0.8061, + "num_tokens": 64585451833.0, + "step": 15452 + }, + { + "epoch": 1.8363636363636364, + "grad_norm": 0.2523324948725157, + "learning_rate": 2.3145993201826443e-06, + "loss": 0.7707, + "num_tokens": 64589641164.0, + "step": 15453 + }, + { + "epoch": 1.8364824717765895, + "grad_norm": 0.21529534998973723, + "learning_rate": 2.3141455571368157e-06, + "loss": 0.8126, + "num_tokens": 64593787985.0, + "step": 15454 + }, + { + "epoch": 1.8366013071895426, + "grad_norm": 0.22892794576048792, + "learning_rate": 2.313692115757545e-06, + "loss": 0.7906, + "num_tokens": 64597976916.0, + "step": 15455 + }, + { + "epoch": 1.8367201426024955, + "grad_norm": 0.25024173612500106, + "learning_rate": 2.3132389960616235e-06, + "loss": 0.8076, + "num_tokens": 64602112715.0, + "step": 15456 + }, + { + "epoch": 1.8368389780154486, + "grad_norm": 0.20359842474578901, + "learning_rate": 2.3127861980658338e-06, + "loss": 0.778, + "num_tokens": 64606302320.0, + "step": 15457 + }, + { + "epoch": 1.8369578134284017, + "grad_norm": 0.23152278588347608, + "learning_rate": 2.312333721786943e-06, + "loss": 0.8038, + "num_tokens": 64610460778.0, + "step": 15458 + }, + { + "epoch": 1.8370766488413546, + "grad_norm": 0.24776081478579617, + "learning_rate": 2.3118815672417076e-06, + "loss": 0.8081, + "num_tokens": 64614618369.0, + "step": 15459 + }, + { + "epoch": 1.8371954842543077, + "grad_norm": 0.2186853058960808, + "learning_rate": 2.3114297344468735e-06, + "loss": 0.8192, + "num_tokens": 64618807114.0, + "step": 15460 + }, + { + "epoch": 1.8373143196672608, + "grad_norm": 0.2280331104446834, + "learning_rate": 2.3109782234191728e-06, + "loss": 0.7976, + "num_tokens": 64622996093.0, + "step": 15461 + }, + { + "epoch": 1.8374331550802139, + "grad_norm": 0.2208994150336576, + "learning_rate": 2.3105270341753263e-06, + "loss": 0.8504, + "num_tokens": 64627149801.0, + "step": 15462 + }, + { + "epoch": 1.837551990493167, + "grad_norm": 0.2140245953344735, + "learning_rate": 2.3100761667320443e-06, + "loss": 0.7884, + "num_tokens": 64631339532.0, + "step": 15463 + }, + { + "epoch": 1.83767082590612, + "grad_norm": 0.23033586340363452, + "learning_rate": 2.309625621106023e-06, + "loss": 0.8038, + "num_tokens": 64635528490.0, + "step": 15464 + }, + { + "epoch": 1.8377896613190732, + "grad_norm": 0.21305113982056315, + "learning_rate": 2.309175397313948e-06, + "loss": 0.7916, + "num_tokens": 64639716166.0, + "step": 15465 + }, + { + "epoch": 1.8379084967320263, + "grad_norm": 0.21412243549617296, + "learning_rate": 2.3087254953724923e-06, + "loss": 0.8207, + "num_tokens": 64643905542.0, + "step": 15466 + }, + { + "epoch": 1.8380273321449792, + "grad_norm": 0.20817080111397168, + "learning_rate": 2.3082759152983174e-06, + "loss": 0.796, + "num_tokens": 64648095245.0, + "step": 15467 + }, + { + "epoch": 1.8381461675579323, + "grad_norm": 0.2122549357224891, + "learning_rate": 2.3078266571080723e-06, + "loss": 0.7827, + "num_tokens": 64652248059.0, + "step": 15468 + }, + { + "epoch": 1.8382650029708854, + "grad_norm": 0.22315893382594004, + "learning_rate": 2.307377720818395e-06, + "loss": 0.8371, + "num_tokens": 64656437134.0, + "step": 15469 + }, + { + "epoch": 1.8383838383838382, + "grad_norm": 0.22956391935862264, + "learning_rate": 2.3069291064459122e-06, + "loss": 0.8245, + "num_tokens": 64660625960.0, + "step": 15470 + }, + { + "epoch": 1.8385026737967913, + "grad_norm": 0.24445926137400134, + "learning_rate": 2.3064808140072353e-06, + "loss": 0.8113, + "num_tokens": 64664814962.0, + "step": 15471 + }, + { + "epoch": 1.8386215092097444, + "grad_norm": 0.22453270521937896, + "learning_rate": 2.3060328435189693e-06, + "loss": 0.7843, + "num_tokens": 64668986607.0, + "step": 15472 + }, + { + "epoch": 1.8387403446226975, + "grad_norm": 0.2497598774026471, + "learning_rate": 2.305585194997701e-06, + "loss": 0.8208, + "num_tokens": 64673174426.0, + "step": 15473 + }, + { + "epoch": 1.8388591800356506, + "grad_norm": 0.23896611856701122, + "learning_rate": 2.3051378684600082e-06, + "loss": 0.81, + "num_tokens": 64677363486.0, + "step": 15474 + }, + { + "epoch": 1.8389780154486037, + "grad_norm": 0.22138841683283647, + "learning_rate": 2.3046908639224584e-06, + "loss": 0.804, + "num_tokens": 64681515381.0, + "step": 15475 + }, + { + "epoch": 1.8390968508615568, + "grad_norm": 0.23507152255741273, + "learning_rate": 2.304244181401607e-06, + "loss": 0.7777, + "num_tokens": 64685703146.0, + "step": 15476 + }, + { + "epoch": 1.83921568627451, + "grad_norm": 0.22814223141571074, + "learning_rate": 2.3037978209139943e-06, + "loss": 0.7907, + "num_tokens": 64689872508.0, + "step": 15477 + }, + { + "epoch": 1.8393345216874628, + "grad_norm": 0.2288097592857817, + "learning_rate": 2.3033517824761507e-06, + "loss": 0.8294, + "num_tokens": 64694039663.0, + "step": 15478 + }, + { + "epoch": 1.839453357100416, + "grad_norm": 0.22504880515003864, + "learning_rate": 2.302906066104594e-06, + "loss": 0.8395, + "num_tokens": 64698174699.0, + "step": 15479 + }, + { + "epoch": 1.839572192513369, + "grad_norm": 0.23127606824496763, + "learning_rate": 2.3024606718158305e-06, + "loss": 0.8043, + "num_tokens": 64702342763.0, + "step": 15480 + }, + { + "epoch": 1.8396910279263219, + "grad_norm": 0.2291370948273123, + "learning_rate": 2.3020155996263572e-06, + "loss": 0.8202, + "num_tokens": 64706531654.0, + "step": 15481 + }, + { + "epoch": 1.839809863339275, + "grad_norm": 0.21652116123278375, + "learning_rate": 2.301570849552655e-06, + "loss": 0.8172, + "num_tokens": 64710704923.0, + "step": 15482 + }, + { + "epoch": 1.839928698752228, + "grad_norm": 0.22187580494048129, + "learning_rate": 2.3011264216111935e-06, + "loss": 0.7658, + "num_tokens": 64714894719.0, + "step": 15483 + }, + { + "epoch": 1.8400475341651812, + "grad_norm": 0.2055285527974904, + "learning_rate": 2.3006823158184318e-06, + "loss": 0.7798, + "num_tokens": 64719083360.0, + "step": 15484 + }, + { + "epoch": 1.8401663695781343, + "grad_norm": 0.21366092807867335, + "learning_rate": 2.300238532190818e-06, + "loss": 0.8115, + "num_tokens": 64723236067.0, + "step": 15485 + }, + { + "epoch": 1.8402852049910874, + "grad_norm": 0.22023153037073245, + "learning_rate": 2.299795070744784e-06, + "loss": 0.8027, + "num_tokens": 64727425494.0, + "step": 15486 + }, + { + "epoch": 1.8404040404040405, + "grad_norm": 0.2103438987850638, + "learning_rate": 2.2993519314967566e-06, + "loss": 0.7788, + "num_tokens": 64731614151.0, + "step": 15487 + }, + { + "epoch": 1.8405228758169936, + "grad_norm": 0.20611316995725162, + "learning_rate": 2.2989091144631444e-06, + "loss": 0.7934, + "num_tokens": 64735803152.0, + "step": 15488 + }, + { + "epoch": 1.8406417112299467, + "grad_norm": 0.2113297567432176, + "learning_rate": 2.2984666196603463e-06, + "loss": 0.8218, + "num_tokens": 64739976367.0, + "step": 15489 + }, + { + "epoch": 1.8407605466428996, + "grad_norm": 0.2154281802310283, + "learning_rate": 2.2980244471047497e-06, + "loss": 0.8285, + "num_tokens": 64744161646.0, + "step": 15490 + }, + { + "epoch": 1.8408793820558527, + "grad_norm": 0.2168810554857102, + "learning_rate": 2.29758259681273e-06, + "loss": 0.8162, + "num_tokens": 64748349428.0, + "step": 15491 + }, + { + "epoch": 1.8409982174688055, + "grad_norm": 0.21958777577130686, + "learning_rate": 2.2971410688006506e-06, + "loss": 0.8118, + "num_tokens": 64752514826.0, + "step": 15492 + }, + { + "epoch": 1.8411170528817586, + "grad_norm": 0.21922270547029155, + "learning_rate": 2.2966998630848625e-06, + "loss": 0.8174, + "num_tokens": 64756686795.0, + "step": 15493 + }, + { + "epoch": 1.8412358882947117, + "grad_norm": 0.21158934571544227, + "learning_rate": 2.2962589796817047e-06, + "loss": 0.8182, + "num_tokens": 64760844365.0, + "step": 15494 + }, + { + "epoch": 1.8413547237076648, + "grad_norm": 0.21352191007559165, + "learning_rate": 2.295818418607505e-06, + "loss": 0.8038, + "num_tokens": 64765022117.0, + "step": 15495 + }, + { + "epoch": 1.841473559120618, + "grad_norm": 0.20699446693073914, + "learning_rate": 2.2953781798785792e-06, + "loss": 0.8096, + "num_tokens": 64769211448.0, + "step": 15496 + }, + { + "epoch": 1.841592394533571, + "grad_norm": 0.21655844520266465, + "learning_rate": 2.2949382635112304e-06, + "loss": 0.7686, + "num_tokens": 64773399931.0, + "step": 15497 + }, + { + "epoch": 1.8417112299465241, + "grad_norm": 0.23542031106212105, + "learning_rate": 2.2944986695217506e-06, + "loss": 0.7894, + "num_tokens": 64777588963.0, + "step": 15498 + }, + { + "epoch": 1.8418300653594772, + "grad_norm": 0.22236968834324491, + "learning_rate": 2.294059397926417e-06, + "loss": 0.8239, + "num_tokens": 64781758260.0, + "step": 15499 + }, + { + "epoch": 1.8419489007724303, + "grad_norm": 0.21778098464461568, + "learning_rate": 2.2936204487415015e-06, + "loss": 0.8368, + "num_tokens": 64785917857.0, + "step": 15500 + }, + { + "epoch": 1.8420677361853832, + "grad_norm": 0.23813896135372872, + "learning_rate": 2.293181821983257e-06, + "loss": 0.8165, + "num_tokens": 64790107246.0, + "step": 15501 + }, + { + "epoch": 1.8421865715983363, + "grad_norm": 0.21511576820025505, + "learning_rate": 2.2927435176679285e-06, + "loss": 0.8385, + "num_tokens": 64794295270.0, + "step": 15502 + }, + { + "epoch": 1.8423054070112892, + "grad_norm": 0.22958508185035467, + "learning_rate": 2.2923055358117476e-06, + "loss": 0.8378, + "num_tokens": 64798460640.0, + "step": 15503 + }, + { + "epoch": 1.8424242424242423, + "grad_norm": 0.23173066635574802, + "learning_rate": 2.2918678764309343e-06, + "loss": 0.825, + "num_tokens": 64802646036.0, + "step": 15504 + }, + { + "epoch": 1.8425430778371954, + "grad_norm": 0.22977161084200196, + "learning_rate": 2.291430539541695e-06, + "loss": 0.8242, + "num_tokens": 64806836226.0, + "step": 15505 + }, + { + "epoch": 1.8426619132501485, + "grad_norm": 0.21196583762973625, + "learning_rate": 2.2909935251602267e-06, + "loss": 0.7991, + "num_tokens": 64811021918.0, + "step": 15506 + }, + { + "epoch": 1.8427807486631016, + "grad_norm": 0.21825566593820017, + "learning_rate": 2.2905568333027157e-06, + "loss": 0.7741, + "num_tokens": 64815210123.0, + "step": 15507 + }, + { + "epoch": 1.8428995840760547, + "grad_norm": 0.21382962746139503, + "learning_rate": 2.2901204639853336e-06, + "loss": 0.7782, + "num_tokens": 64819400054.0, + "step": 15508 + }, + { + "epoch": 1.8430184194890078, + "grad_norm": 0.24771814616399426, + "learning_rate": 2.289684417224238e-06, + "loss": 0.8161, + "num_tokens": 64823586132.0, + "step": 15509 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 0.221488085780149, + "learning_rate": 2.2892486930355783e-06, + "loss": 0.813, + "num_tokens": 64827775302.0, + "step": 15510 + }, + { + "epoch": 1.843256090314914, + "grad_norm": 0.20689087145215362, + "learning_rate": 2.2888132914354923e-06, + "loss": 0.7727, + "num_tokens": 64831950119.0, + "step": 15511 + }, + { + "epoch": 1.8433749257278669, + "grad_norm": 0.22148425944009612, + "learning_rate": 2.288378212440101e-06, + "loss": 0.7705, + "num_tokens": 64836139319.0, + "step": 15512 + }, + { + "epoch": 1.84349376114082, + "grad_norm": 0.23316869922022082, + "learning_rate": 2.287943456065522e-06, + "loss": 0.8189, + "num_tokens": 64840327285.0, + "step": 15513 + }, + { + "epoch": 1.843612596553773, + "grad_norm": 0.21202991387361134, + "learning_rate": 2.287509022327852e-06, + "loss": 0.7922, + "num_tokens": 64844516503.0, + "step": 15514 + }, + { + "epoch": 1.843731431966726, + "grad_norm": 0.2135995212543735, + "learning_rate": 2.2870749112431804e-06, + "loss": 0.7849, + "num_tokens": 64848704802.0, + "step": 15515 + }, + { + "epoch": 1.843850267379679, + "grad_norm": 0.24515580367158044, + "learning_rate": 2.2866411228275835e-06, + "loss": 0.8138, + "num_tokens": 64852833778.0, + "step": 15516 + }, + { + "epoch": 1.8439691027926322, + "grad_norm": 0.22006720316376002, + "learning_rate": 2.286207657097127e-06, + "loss": 0.83, + "num_tokens": 64856977548.0, + "step": 15517 + }, + { + "epoch": 1.8440879382055853, + "grad_norm": 0.22218646484598278, + "learning_rate": 2.2857745140678625e-06, + "loss": 0.7839, + "num_tokens": 64861167813.0, + "step": 15518 + }, + { + "epoch": 1.8442067736185384, + "grad_norm": 0.2201434804494754, + "learning_rate": 2.285341693755832e-06, + "loss": 0.7907, + "num_tokens": 64865356282.0, + "step": 15519 + }, + { + "epoch": 1.8443256090314915, + "grad_norm": 0.22400875318863964, + "learning_rate": 2.284909196177063e-06, + "loss": 0.8359, + "num_tokens": 64869545364.0, + "step": 15520 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 0.219108981743025, + "learning_rate": 2.2844770213475726e-06, + "loss": 0.8008, + "num_tokens": 64873734684.0, + "step": 15521 + }, + { + "epoch": 1.8445632798573977, + "grad_norm": 0.21759954238700605, + "learning_rate": 2.284045169283367e-06, + "loss": 0.817, + "num_tokens": 64877923879.0, + "step": 15522 + }, + { + "epoch": 1.8446821152703505, + "grad_norm": 0.2532256891760306, + "learning_rate": 2.283613640000437e-06, + "loss": 0.8227, + "num_tokens": 64882113528.0, + "step": 15523 + }, + { + "epoch": 1.8448009506833036, + "grad_norm": 0.22231049895483165, + "learning_rate": 2.2831824335147666e-06, + "loss": 0.8042, + "num_tokens": 64886302009.0, + "step": 15524 + }, + { + "epoch": 1.8449197860962567, + "grad_norm": 0.2185684530951455, + "learning_rate": 2.2827515498423204e-06, + "loss": 0.793, + "num_tokens": 64890491545.0, + "step": 15525 + }, + { + "epoch": 1.8450386215092096, + "grad_norm": 0.24673626671791532, + "learning_rate": 2.282320988999059e-06, + "loss": 0.7931, + "num_tokens": 64894655375.0, + "step": 15526 + }, + { + "epoch": 1.8451574569221627, + "grad_norm": 0.23217632539146066, + "learning_rate": 2.281890751000927e-06, + "loss": 0.8037, + "num_tokens": 64898845471.0, + "step": 15527 + }, + { + "epoch": 1.8452762923351158, + "grad_norm": 0.22340758472739902, + "learning_rate": 2.2814608358638567e-06, + "loss": 0.826, + "num_tokens": 64903014171.0, + "step": 15528 + }, + { + "epoch": 1.845395127748069, + "grad_norm": 0.23883663736820412, + "learning_rate": 2.281031243603771e-06, + "loss": 0.771, + "num_tokens": 64907203880.0, + "step": 15529 + }, + { + "epoch": 1.845513963161022, + "grad_norm": 0.24919847492336084, + "learning_rate": 2.2806019742365763e-06, + "loss": 0.8285, + "num_tokens": 64911393421.0, + "step": 15530 + }, + { + "epoch": 1.845632798573975, + "grad_norm": 0.21626431078724562, + "learning_rate": 2.2801730277781716e-06, + "loss": 0.7993, + "num_tokens": 64915555800.0, + "step": 15531 + }, + { + "epoch": 1.8457516339869282, + "grad_norm": 0.23891594183228887, + "learning_rate": 2.2797444042444415e-06, + "loss": 0.8253, + "num_tokens": 64919715962.0, + "step": 15532 + }, + { + "epoch": 1.8458704693998813, + "grad_norm": 0.24274417361750125, + "learning_rate": 2.2793161036512608e-06, + "loss": 0.8219, + "num_tokens": 64923893749.0, + "step": 15533 + }, + { + "epoch": 1.8459893048128342, + "grad_norm": 0.22998964150632564, + "learning_rate": 2.278888126014492e-06, + "loss": 0.7938, + "num_tokens": 64928082859.0, + "step": 15534 + }, + { + "epoch": 1.8461081402257873, + "grad_norm": 0.22219044678151162, + "learning_rate": 2.27846047134998e-06, + "loss": 0.8307, + "num_tokens": 64932270381.0, + "step": 15535 + }, + { + "epoch": 1.8462269756387404, + "grad_norm": 0.21138427644131896, + "learning_rate": 2.278033139673566e-06, + "loss": 0.8127, + "num_tokens": 64936436689.0, + "step": 15536 + }, + { + "epoch": 1.8463458110516933, + "grad_norm": 0.21261280735893642, + "learning_rate": 2.277606131001074e-06, + "loss": 0.7805, + "num_tokens": 64940625213.0, + "step": 15537 + }, + { + "epoch": 1.8464646464646464, + "grad_norm": 0.22509563404615482, + "learning_rate": 2.277179445348318e-06, + "loss": 0.805, + "num_tokens": 64944815643.0, + "step": 15538 + }, + { + "epoch": 1.8465834818775995, + "grad_norm": 0.21033546102012637, + "learning_rate": 2.2767530827311015e-06, + "loss": 0.7909, + "num_tokens": 64948980879.0, + "step": 15539 + }, + { + "epoch": 1.8467023172905526, + "grad_norm": 0.21867322933681219, + "learning_rate": 2.276327043165211e-06, + "loss": 0.7619, + "num_tokens": 64953170273.0, + "step": 15540 + }, + { + "epoch": 1.8468211527035057, + "grad_norm": 0.20799439850048818, + "learning_rate": 2.275901326666425e-06, + "loss": 0.7983, + "num_tokens": 64957300531.0, + "step": 15541 + }, + { + "epoch": 1.8469399881164588, + "grad_norm": 0.21231347799392059, + "learning_rate": 2.2754759332505103e-06, + "loss": 0.8485, + "num_tokens": 64961489215.0, + "step": 15542 + }, + { + "epoch": 1.8470588235294119, + "grad_norm": 0.2147762231456455, + "learning_rate": 2.2750508629332205e-06, + "loss": 0.8182, + "num_tokens": 64965589067.0, + "step": 15543 + }, + { + "epoch": 1.847177658942365, + "grad_norm": 0.2244682349968702, + "learning_rate": 2.274626115730296e-06, + "loss": 0.816, + "num_tokens": 64969778525.0, + "step": 15544 + }, + { + "epoch": 1.8472964943553178, + "grad_norm": 0.20755473279619305, + "learning_rate": 2.274201691657469e-06, + "loss": 0.8109, + "num_tokens": 64973917879.0, + "step": 15545 + }, + { + "epoch": 1.847415329768271, + "grad_norm": 0.229138086752845, + "learning_rate": 2.2737775907304545e-06, + "loss": 0.8063, + "num_tokens": 64978088422.0, + "step": 15546 + }, + { + "epoch": 1.847534165181224, + "grad_norm": 0.24008692305092025, + "learning_rate": 2.2733538129649614e-06, + "loss": 0.7755, + "num_tokens": 64982239340.0, + "step": 15547 + }, + { + "epoch": 1.847653000594177, + "grad_norm": 0.21893663824667559, + "learning_rate": 2.272930358376681e-06, + "loss": 0.8117, + "num_tokens": 64986429290.0, + "step": 15548 + }, + { + "epoch": 1.84777183600713, + "grad_norm": 0.2267379879385304, + "learning_rate": 2.2725072269812966e-06, + "loss": 0.7865, + "num_tokens": 64990619702.0, + "step": 15549 + }, + { + "epoch": 1.8478906714200831, + "grad_norm": 0.2590341667897927, + "learning_rate": 2.2720844187944773e-06, + "loss": 0.8085, + "num_tokens": 64994730185.0, + "step": 15550 + }, + { + "epoch": 1.8480095068330362, + "grad_norm": 0.21919571252684286, + "learning_rate": 2.271661933831883e-06, + "loss": 0.7944, + "num_tokens": 64998920326.0, + "step": 15551 + }, + { + "epoch": 1.8481283422459893, + "grad_norm": 0.24001317298767133, + "learning_rate": 2.2712397721091574e-06, + "loss": 0.8093, + "num_tokens": 65003110669.0, + "step": 15552 + }, + { + "epoch": 1.8482471776589424, + "grad_norm": 0.24825318945220273, + "learning_rate": 2.2708179336419363e-06, + "loss": 0.8096, + "num_tokens": 65007299405.0, + "step": 15553 + }, + { + "epoch": 1.8483660130718955, + "grad_norm": 0.22203973019426276, + "learning_rate": 2.270396418445841e-06, + "loss": 0.7851, + "num_tokens": 65011488618.0, + "step": 15554 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.2583617575009825, + "learning_rate": 2.269975226536483e-06, + "loss": 0.8073, + "num_tokens": 65015664402.0, + "step": 15555 + }, + { + "epoch": 1.8486036838978015, + "grad_norm": 0.23533993299681846, + "learning_rate": 2.2695543579294566e-06, + "loss": 0.8307, + "num_tokens": 65019854752.0, + "step": 15556 + }, + { + "epoch": 1.8487225193107546, + "grad_norm": 0.2246258878296649, + "learning_rate": 2.2691338126403534e-06, + "loss": 0.7844, + "num_tokens": 65024028876.0, + "step": 15557 + }, + { + "epoch": 1.8488413547237077, + "grad_norm": 0.23835456600643704, + "learning_rate": 2.2687135906847435e-06, + "loss": 0.7844, + "num_tokens": 65028156146.0, + "step": 15558 + }, + { + "epoch": 1.8489601901366606, + "grad_norm": 0.22769438491371163, + "learning_rate": 2.268293692078191e-06, + "loss": 0.8314, + "num_tokens": 65032345405.0, + "step": 15559 + }, + { + "epoch": 1.8490790255496137, + "grad_norm": 0.22186009522865086, + "learning_rate": 2.2678741168362456e-06, + "loss": 0.7786, + "num_tokens": 65036489375.0, + "step": 15560 + }, + { + "epoch": 1.8491978609625668, + "grad_norm": 0.21107705919305944, + "learning_rate": 2.267454864974445e-06, + "loss": 0.7685, + "num_tokens": 65040676795.0, + "step": 15561 + }, + { + "epoch": 1.8493166963755199, + "grad_norm": 0.22117304094047183, + "learning_rate": 2.2670359365083167e-06, + "loss": 0.7624, + "num_tokens": 65044865355.0, + "step": 15562 + }, + { + "epoch": 1.849435531788473, + "grad_norm": 0.24075582692840308, + "learning_rate": 2.266617331453374e-06, + "loss": 0.8373, + "num_tokens": 65049005212.0, + "step": 15563 + }, + { + "epoch": 1.849554367201426, + "grad_norm": 0.2193547525749503, + "learning_rate": 2.26619904982512e-06, + "loss": 0.8121, + "num_tokens": 65053145166.0, + "step": 15564 + }, + { + "epoch": 1.8496732026143792, + "grad_norm": 0.21075152726568347, + "learning_rate": 2.2657810916390462e-06, + "loss": 0.7781, + "num_tokens": 65057313047.0, + "step": 15565 + }, + { + "epoch": 1.8497920380273323, + "grad_norm": 0.22878026600099147, + "learning_rate": 2.265363456910629e-06, + "loss": 0.8068, + "num_tokens": 65061501811.0, + "step": 15566 + }, + { + "epoch": 1.8499108734402852, + "grad_norm": 0.21554647853579975, + "learning_rate": 2.264946145655335e-06, + "loss": 0.8229, + "num_tokens": 65065690146.0, + "step": 15567 + }, + { + "epoch": 1.8500297088532383, + "grad_norm": 0.2120095200512824, + "learning_rate": 2.264529157888619e-06, + "loss": 0.8146, + "num_tokens": 65069877417.0, + "step": 15568 + }, + { + "epoch": 1.8501485442661914, + "grad_norm": 0.20957024492790652, + "learning_rate": 2.2641124936259226e-06, + "loss": 0.837, + "num_tokens": 65074031240.0, + "step": 15569 + }, + { + "epoch": 1.8502673796791442, + "grad_norm": 0.23535522380426457, + "learning_rate": 2.2636961528826795e-06, + "loss": 0.7918, + "num_tokens": 65078182315.0, + "step": 15570 + }, + { + "epoch": 1.8503862150920973, + "grad_norm": 0.22404831414143397, + "learning_rate": 2.263280135674305e-06, + "loss": 0.8482, + "num_tokens": 65082370398.0, + "step": 15571 + }, + { + "epoch": 1.8505050505050504, + "grad_norm": 0.21686958793374833, + "learning_rate": 2.2628644420162066e-06, + "loss": 0.8179, + "num_tokens": 65086561363.0, + "step": 15572 + }, + { + "epoch": 1.8506238859180035, + "grad_norm": 0.2200111478221303, + "learning_rate": 2.262449071923779e-06, + "loss": 0.8155, + "num_tokens": 65090749423.0, + "step": 15573 + }, + { + "epoch": 1.8507427213309566, + "grad_norm": 0.21658801835375818, + "learning_rate": 2.2620340254124044e-06, + "loss": 0.7886, + "num_tokens": 65094939595.0, + "step": 15574 + }, + { + "epoch": 1.8508615567439097, + "grad_norm": 0.2225622686266278, + "learning_rate": 2.261619302497454e-06, + "loss": 0.7971, + "num_tokens": 65099130072.0, + "step": 15575 + }, + { + "epoch": 1.8509803921568628, + "grad_norm": 0.21567934005095754, + "learning_rate": 2.2612049031942854e-06, + "loss": 0.8041, + "num_tokens": 65103318209.0, + "step": 15576 + }, + { + "epoch": 1.851099227569816, + "grad_norm": 0.21970169518554872, + "learning_rate": 2.2607908275182473e-06, + "loss": 0.8077, + "num_tokens": 65107493848.0, + "step": 15577 + }, + { + "epoch": 1.851218062982769, + "grad_norm": 0.22195978545737693, + "learning_rate": 2.2603770754846705e-06, + "loss": 0.7867, + "num_tokens": 65111675888.0, + "step": 15578 + }, + { + "epoch": 1.851336898395722, + "grad_norm": 0.23496996845473075, + "learning_rate": 2.2599636471088813e-06, + "loss": 0.8361, + "num_tokens": 65115863201.0, + "step": 15579 + }, + { + "epoch": 1.851455733808675, + "grad_norm": 0.24378821626371516, + "learning_rate": 2.259550542406189e-06, + "loss": 0.8311, + "num_tokens": 65120051137.0, + "step": 15580 + }, + { + "epoch": 1.8515745692216279, + "grad_norm": 0.22222940545286224, + "learning_rate": 2.2591377613918923e-06, + "loss": 0.8142, + "num_tokens": 65124240633.0, + "step": 15581 + }, + { + "epoch": 1.851693404634581, + "grad_norm": 0.22341487974721597, + "learning_rate": 2.258725304081278e-06, + "loss": 0.7645, + "num_tokens": 65128406456.0, + "step": 15582 + }, + { + "epoch": 1.851812240047534, + "grad_norm": 0.22273943105165872, + "learning_rate": 2.2583131704896195e-06, + "loss": 0.8039, + "num_tokens": 65132596753.0, + "step": 15583 + }, + { + "epoch": 1.8519310754604872, + "grad_norm": 0.21601320016760972, + "learning_rate": 2.257901360632182e-06, + "loss": 0.8171, + "num_tokens": 65136754299.0, + "step": 15584 + }, + { + "epoch": 1.8520499108734403, + "grad_norm": 0.21646338626657322, + "learning_rate": 2.2574898745242138e-06, + "loss": 0.8041, + "num_tokens": 65140944418.0, + "step": 15585 + }, + { + "epoch": 1.8521687462863934, + "grad_norm": 0.2087810333065738, + "learning_rate": 2.2570787121809553e-06, + "loss": 0.8002, + "num_tokens": 65145133278.0, + "step": 15586 + }, + { + "epoch": 1.8522875816993465, + "grad_norm": 0.21891148237955713, + "learning_rate": 2.256667873617634e-06, + "loss": 0.8092, + "num_tokens": 65149322102.0, + "step": 15587 + }, + { + "epoch": 1.8524064171122996, + "grad_norm": 0.21274336616479023, + "learning_rate": 2.25625735884946e-06, + "loss": 0.8119, + "num_tokens": 65153509768.0, + "step": 15588 + }, + { + "epoch": 1.8525252525252527, + "grad_norm": 0.21940302265018616, + "learning_rate": 2.2558471678916417e-06, + "loss": 0.7839, + "num_tokens": 65157698761.0, + "step": 15589 + }, + { + "epoch": 1.8526440879382056, + "grad_norm": 0.22311802923629045, + "learning_rate": 2.255437300759366e-06, + "loss": 0.7974, + "num_tokens": 65161888939.0, + "step": 15590 + }, + { + "epoch": 1.8527629233511587, + "grad_norm": 0.2185119668504012, + "learning_rate": 2.2550277574678144e-06, + "loss": 0.8213, + "num_tokens": 65166077935.0, + "step": 15591 + }, + { + "epoch": 1.8528817587641115, + "grad_norm": 0.21967229884887143, + "learning_rate": 2.2546185380321526e-06, + "loss": 0.8052, + "num_tokens": 65170267930.0, + "step": 15592 + }, + { + "epoch": 1.8530005941770646, + "grad_norm": 0.21181706795161429, + "learning_rate": 2.2542096424675342e-06, + "loss": 0.8029, + "num_tokens": 65174456145.0, + "step": 15593 + }, + { + "epoch": 1.8531194295900177, + "grad_norm": 0.21448127966198977, + "learning_rate": 2.2538010707891037e-06, + "loss": 0.7876, + "num_tokens": 65178623684.0, + "step": 15594 + }, + { + "epoch": 1.8532382650029708, + "grad_norm": 0.22327222756103773, + "learning_rate": 2.2533928230119895e-06, + "loss": 0.8054, + "num_tokens": 65182786634.0, + "step": 15595 + }, + { + "epoch": 1.853357100415924, + "grad_norm": 0.20882683531261412, + "learning_rate": 2.2529848991513138e-06, + "loss": 0.8289, + "num_tokens": 65186929783.0, + "step": 15596 + }, + { + "epoch": 1.853475935828877, + "grad_norm": 0.21822739841342545, + "learning_rate": 2.252577299222182e-06, + "loss": 0.8109, + "num_tokens": 65191104962.0, + "step": 15597 + }, + { + "epoch": 1.8535947712418301, + "grad_norm": 0.21551415147941033, + "learning_rate": 2.252170023239688e-06, + "loss": 0.7737, + "num_tokens": 65195287698.0, + "step": 15598 + }, + { + "epoch": 1.8537136066547832, + "grad_norm": 0.21038463854746134, + "learning_rate": 2.251763071218914e-06, + "loss": 0.7804, + "num_tokens": 65199474477.0, + "step": 15599 + }, + { + "epoch": 1.8538324420677363, + "grad_norm": 0.2126089365888026, + "learning_rate": 2.251356443174933e-06, + "loss": 0.835, + "num_tokens": 65203664540.0, + "step": 15600 + }, + { + "epoch": 1.8539512774806892, + "grad_norm": 0.22495960234316498, + "learning_rate": 2.250950139122801e-06, + "loss": 0.7784, + "num_tokens": 65207855159.0, + "step": 15601 + }, + { + "epoch": 1.8540701128936423, + "grad_norm": 0.22046511089753815, + "learning_rate": 2.2505441590775692e-06, + "loss": 0.8148, + "num_tokens": 65212014089.0, + "step": 15602 + }, + { + "epoch": 1.8541889483065954, + "grad_norm": 0.21242280239336037, + "learning_rate": 2.250138503054269e-06, + "loss": 0.8279, + "num_tokens": 65216204503.0, + "step": 15603 + }, + { + "epoch": 1.8543077837195483, + "grad_norm": 0.23849604335758415, + "learning_rate": 2.249733171067924e-06, + "loss": 0.7824, + "num_tokens": 65220394167.0, + "step": 15604 + }, + { + "epoch": 1.8544266191325014, + "grad_norm": 0.22532748371740904, + "learning_rate": 2.249328163133544e-06, + "loss": 0.8056, + "num_tokens": 65224584941.0, + "step": 15605 + }, + { + "epoch": 1.8545454545454545, + "grad_norm": 0.22717579773176966, + "learning_rate": 2.2489234792661296e-06, + "loss": 0.7972, + "num_tokens": 65228734635.0, + "step": 15606 + }, + { + "epoch": 1.8546642899584076, + "grad_norm": 0.24430469253203282, + "learning_rate": 2.2485191194806672e-06, + "loss": 0.8181, + "num_tokens": 65232924226.0, + "step": 15607 + }, + { + "epoch": 1.8547831253713607, + "grad_norm": 0.22488202620875353, + "learning_rate": 2.248115083792131e-06, + "loss": 0.8299, + "num_tokens": 65237087994.0, + "step": 15608 + }, + { + "epoch": 1.8549019607843138, + "grad_norm": 0.23075905651827625, + "learning_rate": 2.2477113722154838e-06, + "loss": 0.788, + "num_tokens": 65241266830.0, + "step": 15609 + }, + { + "epoch": 1.855020796197267, + "grad_norm": 0.22355587336191313, + "learning_rate": 2.2473079847656765e-06, + "loss": 0.7892, + "num_tokens": 65245433549.0, + "step": 15610 + }, + { + "epoch": 1.85513963161022, + "grad_norm": 0.2057490022371423, + "learning_rate": 2.2469049214576485e-06, + "loss": 0.7994, + "num_tokens": 65249590602.0, + "step": 15611 + }, + { + "epoch": 1.8552584670231729, + "grad_norm": 0.21132272714607445, + "learning_rate": 2.246502182306326e-06, + "loss": 0.8182, + "num_tokens": 65253780715.0, + "step": 15612 + }, + { + "epoch": 1.855377302436126, + "grad_norm": 0.2146287742930452, + "learning_rate": 2.2460997673266253e-06, + "loss": 0.8202, + "num_tokens": 65257969887.0, + "step": 15613 + }, + { + "epoch": 1.855496137849079, + "grad_norm": 0.22487313928155164, + "learning_rate": 2.245697676533446e-06, + "loss": 0.7898, + "num_tokens": 65262159567.0, + "step": 15614 + }, + { + "epoch": 1.855614973262032, + "grad_norm": 0.22372866029440805, + "learning_rate": 2.2452959099416806e-06, + "loss": 0.8843, + "num_tokens": 65266348684.0, + "step": 15615 + }, + { + "epoch": 1.855733808674985, + "grad_norm": 0.23088478165082313, + "learning_rate": 2.2448944675662087e-06, + "loss": 0.8371, + "num_tokens": 65270537489.0, + "step": 15616 + }, + { + "epoch": 1.8558526440879382, + "grad_norm": 0.20917629934770907, + "learning_rate": 2.244493349421897e-06, + "loss": 0.802, + "num_tokens": 65274726240.0, + "step": 15617 + }, + { + "epoch": 1.8559714795008913, + "grad_norm": 0.241542925916078, + "learning_rate": 2.2440925555236002e-06, + "loss": 0.8398, + "num_tokens": 65278912228.0, + "step": 15618 + }, + { + "epoch": 1.8560903149138444, + "grad_norm": 0.22673677985427318, + "learning_rate": 2.243692085886159e-06, + "loss": 0.7901, + "num_tokens": 65283061780.0, + "step": 15619 + }, + { + "epoch": 1.8562091503267975, + "grad_norm": 0.23047891598059822, + "learning_rate": 2.2432919405244055e-06, + "loss": 0.7587, + "num_tokens": 65287234966.0, + "step": 15620 + }, + { + "epoch": 1.8563279857397506, + "grad_norm": 0.24109005197320268, + "learning_rate": 2.242892119453159e-06, + "loss": 0.7796, + "num_tokens": 65291416704.0, + "step": 15621 + }, + { + "epoch": 1.8564468211527037, + "grad_norm": 0.2248665170017497, + "learning_rate": 2.2424926226872256e-06, + "loss": 0.8136, + "num_tokens": 65295584522.0, + "step": 15622 + }, + { + "epoch": 1.8565656565656565, + "grad_norm": 0.21244064841386032, + "learning_rate": 2.242093450241401e-06, + "loss": 0.8022, + "num_tokens": 65299749794.0, + "step": 15623 + }, + { + "epoch": 1.8566844919786096, + "grad_norm": 0.23991534650351296, + "learning_rate": 2.2416946021304665e-06, + "loss": 0.7991, + "num_tokens": 65303939514.0, + "step": 15624 + }, + { + "epoch": 1.8568033273915627, + "grad_norm": 0.2123749281696992, + "learning_rate": 2.241296078369194e-06, + "loss": 0.7701, + "num_tokens": 65308130177.0, + "step": 15625 + }, + { + "epoch": 1.8569221628045156, + "grad_norm": 0.22418867748013005, + "learning_rate": 2.24089787897234e-06, + "loss": 0.7881, + "num_tokens": 65312318138.0, + "step": 15626 + }, + { + "epoch": 1.8570409982174687, + "grad_norm": 0.2331164477167347, + "learning_rate": 2.240500003954655e-06, + "loss": 0.7811, + "num_tokens": 65316491711.0, + "step": 15627 + }, + { + "epoch": 1.8571598336304218, + "grad_norm": 0.22205669241821982, + "learning_rate": 2.240102453330871e-06, + "loss": 0.7928, + "num_tokens": 65320681585.0, + "step": 15628 + }, + { + "epoch": 1.857278669043375, + "grad_norm": 0.2372200809293523, + "learning_rate": 2.23970522711571e-06, + "loss": 0.8306, + "num_tokens": 65324839535.0, + "step": 15629 + }, + { + "epoch": 1.857397504456328, + "grad_norm": 0.23501840055109083, + "learning_rate": 2.2393083253238833e-06, + "loss": 0.7809, + "num_tokens": 65329004160.0, + "step": 15630 + }, + { + "epoch": 1.857516339869281, + "grad_norm": 0.22094088810721524, + "learning_rate": 2.2389117479700914e-06, + "loss": 0.8209, + "num_tokens": 65333184489.0, + "step": 15631 + }, + { + "epoch": 1.8576351752822342, + "grad_norm": 0.22585951342553612, + "learning_rate": 2.238515495069019e-06, + "loss": 0.7849, + "num_tokens": 65337366428.0, + "step": 15632 + }, + { + "epoch": 1.8577540106951873, + "grad_norm": 0.23108948173792446, + "learning_rate": 2.23811956663534e-06, + "loss": 0.8679, + "num_tokens": 65341554145.0, + "step": 15633 + }, + { + "epoch": 1.8578728461081402, + "grad_norm": 0.22902818004815892, + "learning_rate": 2.237723962683719e-06, + "loss": 0.8425, + "num_tokens": 65345728043.0, + "step": 15634 + }, + { + "epoch": 1.8579916815210933, + "grad_norm": 0.22778656199494263, + "learning_rate": 2.237328683228806e-06, + "loss": 0.7916, + "num_tokens": 65349919066.0, + "step": 15635 + }, + { + "epoch": 1.8581105169340464, + "grad_norm": 0.2363305076903537, + "learning_rate": 2.2369337282852384e-06, + "loss": 0.7991, + "num_tokens": 65354106910.0, + "step": 15636 + }, + { + "epoch": 1.8582293523469993, + "grad_norm": 0.21054599713789024, + "learning_rate": 2.236539097867644e-06, + "loss": 0.7972, + "num_tokens": 65358294389.0, + "step": 15637 + }, + { + "epoch": 1.8583481877599524, + "grad_norm": 0.21074414162913388, + "learning_rate": 2.236144791990637e-06, + "loss": 0.7741, + "num_tokens": 65362478800.0, + "step": 15638 + }, + { + "epoch": 1.8584670231729055, + "grad_norm": 0.23131526415499465, + "learning_rate": 2.2357508106688192e-06, + "loss": 0.8151, + "num_tokens": 65366645482.0, + "step": 15639 + }, + { + "epoch": 1.8585858585858586, + "grad_norm": 0.22118124800857866, + "learning_rate": 2.235357153916781e-06, + "loss": 0.801, + "num_tokens": 65370815679.0, + "step": 15640 + }, + { + "epoch": 1.8587046939988117, + "grad_norm": 0.21453647452755248, + "learning_rate": 2.234963821749102e-06, + "loss": 0.8045, + "num_tokens": 65375003437.0, + "step": 15641 + }, + { + "epoch": 1.8588235294117648, + "grad_norm": 0.2375292142319136, + "learning_rate": 2.2345708141803473e-06, + "loss": 0.845, + "num_tokens": 65379192765.0, + "step": 15642 + }, + { + "epoch": 1.8589423648247179, + "grad_norm": 0.2163848349464273, + "learning_rate": 2.234178131225073e-06, + "loss": 0.8081, + "num_tokens": 65383359075.0, + "step": 15643 + }, + { + "epoch": 1.859061200237671, + "grad_norm": 0.22260352291975707, + "learning_rate": 2.2337857728978208e-06, + "loss": 0.8145, + "num_tokens": 65387539728.0, + "step": 15644 + }, + { + "epoch": 1.8591800356506238, + "grad_norm": 0.21959801738007947, + "learning_rate": 2.2333937392131185e-06, + "loss": 0.7797, + "num_tokens": 65391728916.0, + "step": 15645 + }, + { + "epoch": 1.859298871063577, + "grad_norm": 0.22563326326556302, + "learning_rate": 2.2330020301854875e-06, + "loss": 0.829, + "num_tokens": 65395908127.0, + "step": 15646 + }, + { + "epoch": 1.85941770647653, + "grad_norm": 0.23533273994002452, + "learning_rate": 2.2326106458294346e-06, + "loss": 0.825, + "num_tokens": 65400097076.0, + "step": 15647 + }, + { + "epoch": 1.859536541889483, + "grad_norm": 0.21128162576317147, + "learning_rate": 2.232219586159451e-06, + "loss": 0.7536, + "num_tokens": 65404281206.0, + "step": 15648 + }, + { + "epoch": 1.859655377302436, + "grad_norm": 0.22923877228246545, + "learning_rate": 2.231828851190023e-06, + "loss": 0.8095, + "num_tokens": 65408470772.0, + "step": 15649 + }, + { + "epoch": 1.8597742127153891, + "grad_norm": 0.22169712405943381, + "learning_rate": 2.231438440935617e-06, + "loss": 0.7961, + "num_tokens": 65412621662.0, + "step": 15650 + }, + { + "epoch": 1.8598930481283422, + "grad_norm": 0.22277608328871643, + "learning_rate": 2.2310483554106925e-06, + "loss": 0.7606, + "num_tokens": 65416812276.0, + "step": 15651 + }, + { + "epoch": 1.8600118835412953, + "grad_norm": 0.21872557975165663, + "learning_rate": 2.2306585946296953e-06, + "loss": 0.8294, + "num_tokens": 65420959587.0, + "step": 15652 + }, + { + "epoch": 1.8601307189542484, + "grad_norm": 0.23785028382941192, + "learning_rate": 2.230269158607062e-06, + "loss": 0.7891, + "num_tokens": 65425150086.0, + "step": 15653 + }, + { + "epoch": 1.8602495543672015, + "grad_norm": 0.23010491082813042, + "learning_rate": 2.2298800473572126e-06, + "loss": 0.8135, + "num_tokens": 65429309012.0, + "step": 15654 + }, + { + "epoch": 1.8603683897801546, + "grad_norm": 0.20749153811134133, + "learning_rate": 2.2294912608945566e-06, + "loss": 0.7899, + "num_tokens": 65433479190.0, + "step": 15655 + }, + { + "epoch": 1.8604872251931075, + "grad_norm": 0.21480542430900873, + "learning_rate": 2.2291027992334936e-06, + "loss": 0.8309, + "num_tokens": 65437666878.0, + "step": 15656 + }, + { + "epoch": 1.8606060606060606, + "grad_norm": 0.2123671309586476, + "learning_rate": 2.228714662388408e-06, + "loss": 0.8086, + "num_tokens": 65441856739.0, + "step": 15657 + }, + { + "epoch": 1.8607248960190137, + "grad_norm": 0.22219898687987932, + "learning_rate": 2.2283268503736755e-06, + "loss": 0.7909, + "num_tokens": 65446019863.0, + "step": 15658 + }, + { + "epoch": 1.8608437314319666, + "grad_norm": 0.22625066398131508, + "learning_rate": 2.227939363203658e-06, + "loss": 0.7814, + "num_tokens": 65450209181.0, + "step": 15659 + }, + { + "epoch": 1.8609625668449197, + "grad_norm": 0.20948855497825203, + "learning_rate": 2.227552200892704e-06, + "loss": 0.8305, + "num_tokens": 65454380183.0, + "step": 15660 + }, + { + "epoch": 1.8610814022578728, + "grad_norm": 0.2270183412266021, + "learning_rate": 2.227165363455154e-06, + "loss": 0.8117, + "num_tokens": 65458569892.0, + "step": 15661 + }, + { + "epoch": 1.8612002376708259, + "grad_norm": 0.22611718832944536, + "learning_rate": 2.2267788509053306e-06, + "loss": 0.832, + "num_tokens": 65462756633.0, + "step": 15662 + }, + { + "epoch": 1.861319073083779, + "grad_norm": 0.2166152899196071, + "learning_rate": 2.22639266325755e-06, + "loss": 0.8203, + "num_tokens": 65466946074.0, + "step": 15663 + }, + { + "epoch": 1.861437908496732, + "grad_norm": 0.23996100351193134, + "learning_rate": 2.226006800526113e-06, + "loss": 0.8225, + "num_tokens": 65471136434.0, + "step": 15664 + }, + { + "epoch": 1.8615567439096852, + "grad_norm": 0.22924734080299916, + "learning_rate": 2.22562126272531e-06, + "loss": 0.8417, + "num_tokens": 65475324366.0, + "step": 15665 + }, + { + "epoch": 1.8616755793226383, + "grad_norm": 0.2063496004712455, + "learning_rate": 2.2252360498694184e-06, + "loss": 0.7921, + "num_tokens": 65479513199.0, + "step": 15666 + }, + { + "epoch": 1.8617944147355912, + "grad_norm": 0.22557625958237013, + "learning_rate": 2.2248511619727045e-06, + "loss": 0.7796, + "num_tokens": 65483702802.0, + "step": 15667 + }, + { + "epoch": 1.8619132501485443, + "grad_norm": 0.24055240928948346, + "learning_rate": 2.2244665990494207e-06, + "loss": 0.8178, + "num_tokens": 65487893724.0, + "step": 15668 + }, + { + "epoch": 1.8620320855614974, + "grad_norm": 0.2217223758364161, + "learning_rate": 2.224082361113811e-06, + "loss": 0.8269, + "num_tokens": 65492082000.0, + "step": 15669 + }, + { + "epoch": 1.8621509209744502, + "grad_norm": 0.2318994966772772, + "learning_rate": 2.223698448180103e-06, + "loss": 0.8353, + "num_tokens": 65496271893.0, + "step": 15670 + }, + { + "epoch": 1.8622697563874033, + "grad_norm": 0.21552618201788495, + "learning_rate": 2.223314860262514e-06, + "loss": 0.8279, + "num_tokens": 65500461346.0, + "step": 15671 + }, + { + "epoch": 1.8623885918003564, + "grad_norm": 0.2233817209667979, + "learning_rate": 2.22293159737525e-06, + "loss": 0.7771, + "num_tokens": 65504631108.0, + "step": 15672 + }, + { + "epoch": 1.8625074272133095, + "grad_norm": 0.23518096186280843, + "learning_rate": 2.2225486595325062e-06, + "loss": 0.8172, + "num_tokens": 65508807783.0, + "step": 15673 + }, + { + "epoch": 1.8626262626262626, + "grad_norm": 0.24058934417655756, + "learning_rate": 2.222166046748462e-06, + "loss": 0.8077, + "num_tokens": 65512996836.0, + "step": 15674 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 0.21549099597666702, + "learning_rate": 2.2217837590372886e-06, + "loss": 0.7613, + "num_tokens": 65517171286.0, + "step": 15675 + }, + { + "epoch": 1.8628639334521688, + "grad_norm": 0.2363025512886852, + "learning_rate": 2.221401796413142e-06, + "loss": 0.8315, + "num_tokens": 65521360532.0, + "step": 15676 + }, + { + "epoch": 1.862982768865122, + "grad_norm": 0.23702347146641792, + "learning_rate": 2.2210201588901677e-06, + "loss": 0.7867, + "num_tokens": 65525550145.0, + "step": 15677 + }, + { + "epoch": 1.863101604278075, + "grad_norm": 0.2243148653727606, + "learning_rate": 2.2206388464824992e-06, + "loss": 0.825, + "num_tokens": 65529740407.0, + "step": 15678 + }, + { + "epoch": 1.863220439691028, + "grad_norm": 0.22824555639806185, + "learning_rate": 2.220257859204258e-06, + "loss": 0.7797, + "num_tokens": 65533930733.0, + "step": 15679 + }, + { + "epoch": 1.863339275103981, + "grad_norm": 0.2384256720481413, + "learning_rate": 2.2198771970695544e-06, + "loss": 0.7957, + "num_tokens": 65538119499.0, + "step": 15680 + }, + { + "epoch": 1.8634581105169339, + "grad_norm": 0.21442522264484312, + "learning_rate": 2.2194968600924834e-06, + "loss": 0.7653, + "num_tokens": 65542266737.0, + "step": 15681 + }, + { + "epoch": 1.863576945929887, + "grad_norm": 0.23699890980037913, + "learning_rate": 2.2191168482871312e-06, + "loss": 0.808, + "num_tokens": 65546456048.0, + "step": 15682 + }, + { + "epoch": 1.86369578134284, + "grad_norm": 0.22408070828322085, + "learning_rate": 2.2187371616675713e-06, + "loss": 0.8012, + "num_tokens": 65550630101.0, + "step": 15683 + }, + { + "epoch": 1.8638146167557932, + "grad_norm": 0.2214453691383917, + "learning_rate": 2.2183578002478635e-06, + "loss": 0.8145, + "num_tokens": 65554820511.0, + "step": 15684 + }, + { + "epoch": 1.8639334521687463, + "grad_norm": 0.22152786178090214, + "learning_rate": 2.2179787640420605e-06, + "loss": 0.7981, + "num_tokens": 65559010418.0, + "step": 15685 + }, + { + "epoch": 1.8640522875816994, + "grad_norm": 0.2159382952585287, + "learning_rate": 2.217600053064194e-06, + "loss": 0.8301, + "num_tokens": 65563146865.0, + "step": 15686 + }, + { + "epoch": 1.8641711229946525, + "grad_norm": 0.23962880284490676, + "learning_rate": 2.2172216673282927e-06, + "loss": 0.7973, + "num_tokens": 65567330019.0, + "step": 15687 + }, + { + "epoch": 1.8642899584076056, + "grad_norm": 0.21016890977208846, + "learning_rate": 2.216843606848368e-06, + "loss": 0.7857, + "num_tokens": 65571518364.0, + "step": 15688 + }, + { + "epoch": 1.8644087938205587, + "grad_norm": 0.2193688328672178, + "learning_rate": 2.216465871638421e-06, + "loss": 0.8213, + "num_tokens": 65575706787.0, + "step": 15689 + }, + { + "epoch": 1.8645276292335116, + "grad_norm": 0.2169210733233253, + "learning_rate": 2.2160884617124405e-06, + "loss": 0.8144, + "num_tokens": 65579896697.0, + "step": 15690 + }, + { + "epoch": 1.8646464646464647, + "grad_norm": 0.2208945351537006, + "learning_rate": 2.215711377084404e-06, + "loss": 0.8057, + "num_tokens": 65584061395.0, + "step": 15691 + }, + { + "epoch": 1.8647653000594175, + "grad_norm": 0.2213169175690466, + "learning_rate": 2.215334617768275e-06, + "loss": 0.826, + "num_tokens": 65588250907.0, + "step": 15692 + }, + { + "epoch": 1.8648841354723706, + "grad_norm": 0.23217823427790946, + "learning_rate": 2.214958183778007e-06, + "loss": 0.7948, + "num_tokens": 65592412938.0, + "step": 15693 + }, + { + "epoch": 1.8650029708853237, + "grad_norm": 0.21619855166177102, + "learning_rate": 2.2145820751275397e-06, + "loss": 0.8281, + "num_tokens": 65596602055.0, + "step": 15694 + }, + { + "epoch": 1.8651218062982768, + "grad_norm": 0.2279243255745225, + "learning_rate": 2.214206291830803e-06, + "loss": 0.7963, + "num_tokens": 65600773111.0, + "step": 15695 + }, + { + "epoch": 1.86524064171123, + "grad_norm": 0.22481517549303567, + "learning_rate": 2.213830833901713e-06, + "loss": 0.7886, + "num_tokens": 65604914747.0, + "step": 15696 + }, + { + "epoch": 1.865359477124183, + "grad_norm": 0.23495458227568514, + "learning_rate": 2.2134557013541735e-06, + "loss": 0.788, + "num_tokens": 65609104294.0, + "step": 15697 + }, + { + "epoch": 1.8654783125371361, + "grad_norm": 0.22472660875578315, + "learning_rate": 2.2130808942020775e-06, + "loss": 0.7841, + "num_tokens": 65613274910.0, + "step": 15698 + }, + { + "epoch": 1.8655971479500892, + "grad_norm": 0.2099228801710291, + "learning_rate": 2.2127064124593053e-06, + "loss": 0.7618, + "num_tokens": 65617463990.0, + "step": 15699 + }, + { + "epoch": 1.8657159833630423, + "grad_norm": 0.23510209559763473, + "learning_rate": 2.2123322561397254e-06, + "loss": 0.8116, + "num_tokens": 65621621136.0, + "step": 15700 + }, + { + "epoch": 1.8658348187759952, + "grad_norm": 0.22769015026974027, + "learning_rate": 2.211958425257194e-06, + "loss": 0.7902, + "num_tokens": 65625810633.0, + "step": 15701 + }, + { + "epoch": 1.8659536541889483, + "grad_norm": 0.2120174853159274, + "learning_rate": 2.2115849198255553e-06, + "loss": 0.799, + "num_tokens": 65629979727.0, + "step": 15702 + }, + { + "epoch": 1.8660724896019014, + "grad_norm": 0.20422724301239384, + "learning_rate": 2.2112117398586403e-06, + "loss": 0.7876, + "num_tokens": 65634168809.0, + "step": 15703 + }, + { + "epoch": 1.8661913250148543, + "grad_norm": 0.21285688730551264, + "learning_rate": 2.21083888537027e-06, + "loss": 0.8299, + "num_tokens": 65638357952.0, + "step": 15704 + }, + { + "epoch": 1.8663101604278074, + "grad_norm": 0.2250828459355047, + "learning_rate": 2.210466356374254e-06, + "loss": 0.7772, + "num_tokens": 65642510962.0, + "step": 15705 + }, + { + "epoch": 1.8664289958407605, + "grad_norm": 0.21618519722864007, + "learning_rate": 2.210094152884386e-06, + "loss": 0.7806, + "num_tokens": 65646670881.0, + "step": 15706 + }, + { + "epoch": 1.8665478312537136, + "grad_norm": 0.21188252544082892, + "learning_rate": 2.209722274914452e-06, + "loss": 0.7888, + "num_tokens": 65650857043.0, + "step": 15707 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.2049370358884699, + "learning_rate": 2.2093507224782222e-06, + "loss": 0.8023, + "num_tokens": 65655028795.0, + "step": 15708 + }, + { + "epoch": 1.8667855020796198, + "grad_norm": 0.22687282321473082, + "learning_rate": 2.208979495589456e-06, + "loss": 0.8182, + "num_tokens": 65659217567.0, + "step": 15709 + }, + { + "epoch": 1.866904337492573, + "grad_norm": 0.2115487846015412, + "learning_rate": 2.208608594261903e-06, + "loss": 0.7844, + "num_tokens": 65663391177.0, + "step": 15710 + }, + { + "epoch": 1.867023172905526, + "grad_norm": 0.20644530338293507, + "learning_rate": 2.2082380185092992e-06, + "loss": 0.7771, + "num_tokens": 65667580963.0, + "step": 15711 + }, + { + "epoch": 1.8671420083184789, + "grad_norm": 0.22144467878752747, + "learning_rate": 2.207867768345367e-06, + "loss": 0.8397, + "num_tokens": 65671729749.0, + "step": 15712 + }, + { + "epoch": 1.867260843731432, + "grad_norm": 0.22081287302233915, + "learning_rate": 2.207497843783817e-06, + "loss": 0.8059, + "num_tokens": 65675902699.0, + "step": 15713 + }, + { + "epoch": 1.867379679144385, + "grad_norm": 0.22361189265915224, + "learning_rate": 2.2071282448383513e-06, + "loss": 0.774, + "num_tokens": 65680091763.0, + "step": 15714 + }, + { + "epoch": 1.867498514557338, + "grad_norm": 0.2176683605298392, + "learning_rate": 2.2067589715226538e-06, + "loss": 0.797, + "num_tokens": 65684269979.0, + "step": 15715 + }, + { + "epoch": 1.867617349970291, + "grad_norm": 0.21295053366434158, + "learning_rate": 2.2063900238504042e-06, + "loss": 0.7917, + "num_tokens": 65688459609.0, + "step": 15716 + }, + { + "epoch": 1.8677361853832442, + "grad_norm": 0.21646800314555714, + "learning_rate": 2.206021401835264e-06, + "loss": 0.7852, + "num_tokens": 65692647751.0, + "step": 15717 + }, + { + "epoch": 1.8678550207961973, + "grad_norm": 0.22308781688093887, + "learning_rate": 2.2056531054908843e-06, + "loss": 0.805, + "num_tokens": 65696819261.0, + "step": 15718 + }, + { + "epoch": 1.8679738562091504, + "grad_norm": 0.20814668302316527, + "learning_rate": 2.2052851348309037e-06, + "loss": 0.8346, + "num_tokens": 65701005810.0, + "step": 15719 + }, + { + "epoch": 1.8680926916221035, + "grad_norm": 0.2199944757403906, + "learning_rate": 2.204917489868951e-06, + "loss": 0.7956, + "num_tokens": 65705194248.0, + "step": 15720 + }, + { + "epoch": 1.8682115270350566, + "grad_norm": 0.23611287797787806, + "learning_rate": 2.2045501706186393e-06, + "loss": 0.7946, + "num_tokens": 65709371859.0, + "step": 15721 + }, + { + "epoch": 1.8683303624480097, + "grad_norm": 0.21452182944463286, + "learning_rate": 2.204183177093574e-06, + "loss": 0.8374, + "num_tokens": 65713540884.0, + "step": 15722 + }, + { + "epoch": 1.8684491978609625, + "grad_norm": 0.22545494234132613, + "learning_rate": 2.203816509307346e-06, + "loss": 0.8083, + "num_tokens": 65717730374.0, + "step": 15723 + }, + { + "epoch": 1.8685680332739156, + "grad_norm": 0.22707898866927798, + "learning_rate": 2.203450167273531e-06, + "loss": 0.788, + "num_tokens": 65721919511.0, + "step": 15724 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 0.23460772686365222, + "learning_rate": 2.2030841510057e-06, + "loss": 0.7946, + "num_tokens": 65726109552.0, + "step": 15725 + }, + { + "epoch": 1.8688057040998216, + "grad_norm": 0.2175724119867059, + "learning_rate": 2.202718460517405e-06, + "loss": 0.8124, + "num_tokens": 65730257482.0, + "step": 15726 + }, + { + "epoch": 1.8689245395127747, + "grad_norm": 0.2168796186615508, + "learning_rate": 2.2023530958221896e-06, + "loss": 0.7952, + "num_tokens": 65734423775.0, + "step": 15727 + }, + { + "epoch": 1.8690433749257278, + "grad_norm": 0.21963658333930977, + "learning_rate": 2.201988056933585e-06, + "loss": 0.8326, + "num_tokens": 65738612441.0, + "step": 15728 + }, + { + "epoch": 1.869162210338681, + "grad_norm": 0.2126399422612002, + "learning_rate": 2.201623343865109e-06, + "loss": 0.7905, + "num_tokens": 65742801550.0, + "step": 15729 + }, + { + "epoch": 1.869281045751634, + "grad_norm": 0.23415501560108742, + "learning_rate": 2.2012589566302694e-06, + "loss": 0.7959, + "num_tokens": 65746990254.0, + "step": 15730 + }, + { + "epoch": 1.869399881164587, + "grad_norm": 0.23020794932172242, + "learning_rate": 2.2008948952425587e-06, + "loss": 0.8021, + "num_tokens": 65751180531.0, + "step": 15731 + }, + { + "epoch": 1.8695187165775402, + "grad_norm": 0.240767903467301, + "learning_rate": 2.2005311597154616e-06, + "loss": 0.8094, + "num_tokens": 65755368765.0, + "step": 15732 + }, + { + "epoch": 1.8696375519904933, + "grad_norm": 0.23266862911678662, + "learning_rate": 2.2001677500624473e-06, + "loss": 0.796, + "num_tokens": 65759556498.0, + "step": 15733 + }, + { + "epoch": 1.8697563874034462, + "grad_norm": 0.2276844215711736, + "learning_rate": 2.199804666296973e-06, + "loss": 0.8102, + "num_tokens": 65763745947.0, + "step": 15734 + }, + { + "epoch": 1.8698752228163993, + "grad_norm": 0.22950344098569897, + "learning_rate": 2.199441908432487e-06, + "loss": 0.8031, + "num_tokens": 65767936454.0, + "step": 15735 + }, + { + "epoch": 1.8699940582293524, + "grad_norm": 0.2249553434899033, + "learning_rate": 2.199079476482422e-06, + "loss": 0.8141, + "num_tokens": 65772123129.0, + "step": 15736 + }, + { + "epoch": 1.8701128936423053, + "grad_norm": 0.2167838696511727, + "learning_rate": 2.1987173704602014e-06, + "loss": 0.8183, + "num_tokens": 65776311636.0, + "step": 15737 + }, + { + "epoch": 1.8702317290552584, + "grad_norm": 0.2267435036866733, + "learning_rate": 2.1983555903792353e-06, + "loss": 0.8317, + "num_tokens": 65780500231.0, + "step": 15738 + }, + { + "epoch": 1.8703505644682115, + "grad_norm": 0.21277771933038708, + "learning_rate": 2.1979941362529197e-06, + "loss": 0.7675, + "num_tokens": 65784689855.0, + "step": 15739 + }, + { + "epoch": 1.8704693998811646, + "grad_norm": 0.2228741882212284, + "learning_rate": 2.1976330080946405e-06, + "loss": 0.7903, + "num_tokens": 65788879206.0, + "step": 15740 + }, + { + "epoch": 1.8705882352941177, + "grad_norm": 0.20976161110832894, + "learning_rate": 2.1972722059177733e-06, + "loss": 0.7887, + "num_tokens": 65793069140.0, + "step": 15741 + }, + { + "epoch": 1.8707070707070708, + "grad_norm": 0.21183522941221944, + "learning_rate": 2.1969117297356792e-06, + "loss": 0.8133, + "num_tokens": 65797259309.0, + "step": 15742 + }, + { + "epoch": 1.8708259061200239, + "grad_norm": 0.23005439419628504, + "learning_rate": 2.1965515795617074e-06, + "loss": 0.7839, + "num_tokens": 65801448203.0, + "step": 15743 + }, + { + "epoch": 1.870944741532977, + "grad_norm": 0.20812645637351096, + "learning_rate": 2.1961917554091966e-06, + "loss": 0.7886, + "num_tokens": 65805637154.0, + "step": 15744 + }, + { + "epoch": 1.8710635769459298, + "grad_norm": 0.22836809732777433, + "learning_rate": 2.195832257291471e-06, + "loss": 0.8009, + "num_tokens": 65809825879.0, + "step": 15745 + }, + { + "epoch": 1.871182412358883, + "grad_norm": 0.23943723020656166, + "learning_rate": 2.195473085221844e-06, + "loss": 0.8281, + "num_tokens": 65813982466.0, + "step": 15746 + }, + { + "epoch": 1.871301247771836, + "grad_norm": 0.21178942343435042, + "learning_rate": 2.1951142392136167e-06, + "loss": 0.8082, + "num_tokens": 65818156689.0, + "step": 15747 + }, + { + "epoch": 1.871420083184789, + "grad_norm": 0.21335193124153215, + "learning_rate": 2.1947557192800805e-06, + "loss": 0.7853, + "num_tokens": 65822296629.0, + "step": 15748 + }, + { + "epoch": 1.871538918597742, + "grad_norm": 0.2237940950664832, + "learning_rate": 2.1943975254345113e-06, + "loss": 0.7741, + "num_tokens": 65826485888.0, + "step": 15749 + }, + { + "epoch": 1.8716577540106951, + "grad_norm": 0.23277177645643315, + "learning_rate": 2.194039657690174e-06, + "loss": 0.8178, + "num_tokens": 65830674088.0, + "step": 15750 + }, + { + "epoch": 1.8717765894236482, + "grad_norm": 0.22799260949045091, + "learning_rate": 2.193682116060322e-06, + "loss": 0.8112, + "num_tokens": 65834837032.0, + "step": 15751 + }, + { + "epoch": 1.8718954248366013, + "grad_norm": 0.21490412264659525, + "learning_rate": 2.193324900558196e-06, + "loss": 0.7871, + "num_tokens": 65839025386.0, + "step": 15752 + }, + { + "epoch": 1.8720142602495544, + "grad_norm": 0.2481922668501916, + "learning_rate": 2.1929680111970242e-06, + "loss": 0.8066, + "num_tokens": 65843214728.0, + "step": 15753 + }, + { + "epoch": 1.8721330956625075, + "grad_norm": 0.20811127133574941, + "learning_rate": 2.192611447990025e-06, + "loss": 0.8457, + "num_tokens": 65847404680.0, + "step": 15754 + }, + { + "epoch": 1.8722519310754606, + "grad_norm": 0.22668479894160956, + "learning_rate": 2.192255210950402e-06, + "loss": 0.7986, + "num_tokens": 65851562182.0, + "step": 15755 + }, + { + "epoch": 1.8723707664884135, + "grad_norm": 0.21853837652697566, + "learning_rate": 2.1918993000913484e-06, + "loss": 0.8413, + "num_tokens": 65855750779.0, + "step": 15756 + }, + { + "epoch": 1.8724896019013666, + "grad_norm": 0.21107344828038613, + "learning_rate": 2.191543715426045e-06, + "loss": 0.8179, + "num_tokens": 65859940975.0, + "step": 15757 + }, + { + "epoch": 1.8726084373143197, + "grad_norm": 0.21595897870867511, + "learning_rate": 2.19118845696766e-06, + "loss": 0.8218, + "num_tokens": 65864126874.0, + "step": 15758 + }, + { + "epoch": 1.8727272727272726, + "grad_norm": 0.2426018474577931, + "learning_rate": 2.19083352472935e-06, + "loss": 0.7868, + "num_tokens": 65868316375.0, + "step": 15759 + }, + { + "epoch": 1.8728461081402257, + "grad_norm": 0.216249653713998, + "learning_rate": 2.1904789187242572e-06, + "loss": 0.7604, + "num_tokens": 65872468744.0, + "step": 15760 + }, + { + "epoch": 1.8729649435531788, + "grad_norm": 0.22226586330863363, + "learning_rate": 2.1901246389655174e-06, + "loss": 0.7988, + "num_tokens": 65876657750.0, + "step": 15761 + }, + { + "epoch": 1.8730837789661319, + "grad_norm": 0.2194380926889901, + "learning_rate": 2.189770685466248e-06, + "loss": 0.8088, + "num_tokens": 65880846505.0, + "step": 15762 + }, + { + "epoch": 1.873202614379085, + "grad_norm": 0.20671896122111574, + "learning_rate": 2.189417058239559e-06, + "loss": 0.8313, + "num_tokens": 65885035240.0, + "step": 15763 + }, + { + "epoch": 1.873321449792038, + "grad_norm": 0.2318231388074324, + "learning_rate": 2.189063757298546e-06, + "loss": 0.8132, + "num_tokens": 65889225244.0, + "step": 15764 + }, + { + "epoch": 1.8734402852049912, + "grad_norm": 0.20231408453058303, + "learning_rate": 2.1887107826562924e-06, + "loss": 0.8037, + "num_tokens": 65893384923.0, + "step": 15765 + }, + { + "epoch": 1.8735591206179443, + "grad_norm": 0.2113442572066319, + "learning_rate": 2.1883581343258693e-06, + "loss": 0.8368, + "num_tokens": 65897563705.0, + "step": 15766 + }, + { + "epoch": 1.8736779560308974, + "grad_norm": 0.21333103725609354, + "learning_rate": 2.188005812320338e-06, + "loss": 0.8026, + "num_tokens": 65901751193.0, + "step": 15767 + }, + { + "epoch": 1.8737967914438503, + "grad_norm": 0.2064677159287605, + "learning_rate": 2.1876538166527443e-06, + "loss": 0.8093, + "num_tokens": 65905940167.0, + "step": 15768 + }, + { + "epoch": 1.8739156268568034, + "grad_norm": 0.22019791025502236, + "learning_rate": 2.1873021473361273e-06, + "loss": 0.8276, + "num_tokens": 65910119473.0, + "step": 15769 + }, + { + "epoch": 1.8740344622697562, + "grad_norm": 0.2253419356154758, + "learning_rate": 2.186950804383507e-06, + "loss": 0.8, + "num_tokens": 65914279899.0, + "step": 15770 + }, + { + "epoch": 1.8741532976827093, + "grad_norm": 0.2235680887962197, + "learning_rate": 2.186599787807896e-06, + "loss": 0.8017, + "num_tokens": 65918455916.0, + "step": 15771 + }, + { + "epoch": 1.8742721330956624, + "grad_norm": 0.219815933257369, + "learning_rate": 2.186249097622294e-06, + "loss": 0.8138, + "num_tokens": 65922645093.0, + "step": 15772 + }, + { + "epoch": 1.8743909685086155, + "grad_norm": 0.2423741133395913, + "learning_rate": 2.1858987338396863e-06, + "loss": 0.7988, + "num_tokens": 65926835449.0, + "step": 15773 + }, + { + "epoch": 1.8745098039215686, + "grad_norm": 0.2307709787698782, + "learning_rate": 2.1855486964730524e-06, + "loss": 0.8287, + "num_tokens": 65931009548.0, + "step": 15774 + }, + { + "epoch": 1.8746286393345217, + "grad_norm": 0.22105409069105378, + "learning_rate": 2.185198985535351e-06, + "loss": 0.7962, + "num_tokens": 65935199445.0, + "step": 15775 + }, + { + "epoch": 1.8747474747474748, + "grad_norm": 0.2369760507277689, + "learning_rate": 2.1848496010395358e-06, + "loss": 0.8225, + "num_tokens": 65939387373.0, + "step": 15776 + }, + { + "epoch": 1.874866310160428, + "grad_norm": 0.2220244299920446, + "learning_rate": 2.184500542998543e-06, + "loss": 0.8241, + "num_tokens": 65943576233.0, + "step": 15777 + }, + { + "epoch": 1.874985145573381, + "grad_norm": 0.22917861178855395, + "learning_rate": 2.1841518114253024e-06, + "loss": 0.8001, + "num_tokens": 65947764470.0, + "step": 15778 + }, + { + "epoch": 1.875103980986334, + "grad_norm": 0.21524760690009334, + "learning_rate": 2.1838034063327264e-06, + "loss": 0.7912, + "num_tokens": 65951953133.0, + "step": 15779 + }, + { + "epoch": 1.875222816399287, + "grad_norm": 0.23448292018533445, + "learning_rate": 2.1834553277337183e-06, + "loss": 0.7739, + "num_tokens": 65956143281.0, + "step": 15780 + }, + { + "epoch": 1.8753416518122399, + "grad_norm": 0.2452917795508713, + "learning_rate": 2.18310757564117e-06, + "loss": 0.8236, + "num_tokens": 65960332647.0, + "step": 15781 + }, + { + "epoch": 1.875460487225193, + "grad_norm": 0.22645656836670042, + "learning_rate": 2.1827601500679576e-06, + "loss": 0.7881, + "num_tokens": 65964514357.0, + "step": 15782 + }, + { + "epoch": 1.875579322638146, + "grad_norm": 0.24702425382297796, + "learning_rate": 2.1824130510269493e-06, + "loss": 0.7971, + "num_tokens": 65968673280.0, + "step": 15783 + }, + { + "epoch": 1.8756981580510992, + "grad_norm": 0.21608327912516595, + "learning_rate": 2.182066278530999e-06, + "loss": 0.8227, + "num_tokens": 65972841091.0, + "step": 15784 + }, + { + "epoch": 1.8758169934640523, + "grad_norm": 0.2379236780942103, + "learning_rate": 2.1817198325929474e-06, + "loss": 0.7828, + "num_tokens": 65977030969.0, + "step": 15785 + }, + { + "epoch": 1.8759358288770054, + "grad_norm": 0.22986683521180026, + "learning_rate": 2.1813737132256274e-06, + "loss": 0.8085, + "num_tokens": 65981219969.0, + "step": 15786 + }, + { + "epoch": 1.8760546642899585, + "grad_norm": 0.21745511960709527, + "learning_rate": 2.1810279204418535e-06, + "loss": 0.7922, + "num_tokens": 65985409506.0, + "step": 15787 + }, + { + "epoch": 1.8761734997029116, + "grad_norm": 0.23563218193852992, + "learning_rate": 2.1806824542544345e-06, + "loss": 0.813, + "num_tokens": 65989597803.0, + "step": 15788 + }, + { + "epoch": 1.8762923351158647, + "grad_norm": 0.22124151503999034, + "learning_rate": 2.1803373146761624e-06, + "loss": 0.7915, + "num_tokens": 65993762314.0, + "step": 15789 + }, + { + "epoch": 1.8764111705288176, + "grad_norm": 0.2243221898656714, + "learning_rate": 2.1799925017198206e-06, + "loss": 0.817, + "num_tokens": 65997950903.0, + "step": 15790 + }, + { + "epoch": 1.8765300059417707, + "grad_norm": 0.21699985365563576, + "learning_rate": 2.179648015398177e-06, + "loss": 0.7898, + "num_tokens": 66002125685.0, + "step": 15791 + }, + { + "epoch": 1.8766488413547238, + "grad_norm": 0.21917880772548767, + "learning_rate": 2.179303855723989e-06, + "loss": 0.833, + "num_tokens": 66006315102.0, + "step": 15792 + }, + { + "epoch": 1.8767676767676766, + "grad_norm": 0.2278200331835411, + "learning_rate": 2.1789600227100036e-06, + "loss": 0.784, + "num_tokens": 66010506108.0, + "step": 15793 + }, + { + "epoch": 1.8768865121806297, + "grad_norm": 0.20906092418030886, + "learning_rate": 2.1786165163689527e-06, + "loss": 0.7912, + "num_tokens": 66014696089.0, + "step": 15794 + }, + { + "epoch": 1.8770053475935828, + "grad_norm": 0.23471946239607963, + "learning_rate": 2.1782733367135586e-06, + "loss": 0.8043, + "num_tokens": 66018885870.0, + "step": 15795 + }, + { + "epoch": 1.877124183006536, + "grad_norm": 0.2293561670952064, + "learning_rate": 2.1779304837565297e-06, + "loss": 0.8088, + "num_tokens": 66023074477.0, + "step": 15796 + }, + { + "epoch": 1.877243018419489, + "grad_norm": 0.21766844790396797, + "learning_rate": 2.177587957510563e-06, + "loss": 0.798, + "num_tokens": 66027263341.0, + "step": 15797 + }, + { + "epoch": 1.8773618538324421, + "grad_norm": 0.2212693717651192, + "learning_rate": 2.1772457579883426e-06, + "loss": 0.8166, + "num_tokens": 66031452467.0, + "step": 15798 + }, + { + "epoch": 1.8774806892453952, + "grad_norm": 0.22035456237355794, + "learning_rate": 2.176903885202543e-06, + "loss": 0.7915, + "num_tokens": 66035632768.0, + "step": 15799 + }, + { + "epoch": 1.8775995246583483, + "grad_norm": 0.22863201321418214, + "learning_rate": 2.176562339165825e-06, + "loss": 0.784, + "num_tokens": 66039821931.0, + "step": 15800 + }, + { + "epoch": 1.8777183600713012, + "grad_norm": 0.20832657507273417, + "learning_rate": 2.176221119890835e-06, + "loss": 0.8252, + "num_tokens": 66044010522.0, + "step": 15801 + }, + { + "epoch": 1.8778371954842543, + "grad_norm": 0.2241596039926404, + "learning_rate": 2.175880227390211e-06, + "loss": 0.8153, + "num_tokens": 66048200521.0, + "step": 15802 + }, + { + "epoch": 1.8779560308972074, + "grad_norm": 0.21830418456097284, + "learning_rate": 2.1755396616765777e-06, + "loss": 0.7699, + "num_tokens": 66052370952.0, + "step": 15803 + }, + { + "epoch": 1.8780748663101603, + "grad_norm": 0.20548941471890686, + "learning_rate": 2.1751994227625462e-06, + "loss": 0.8078, + "num_tokens": 66056559133.0, + "step": 15804 + }, + { + "epoch": 1.8781937017231134, + "grad_norm": 0.21677867054999062, + "learning_rate": 2.1748595106607187e-06, + "loss": 0.8291, + "num_tokens": 66060747095.0, + "step": 15805 + }, + { + "epoch": 1.8783125371360665, + "grad_norm": 0.2046909534634066, + "learning_rate": 2.1745199253836812e-06, + "loss": 0.782, + "num_tokens": 66064935892.0, + "step": 15806 + }, + { + "epoch": 1.8784313725490196, + "grad_norm": 0.2243402197306413, + "learning_rate": 2.17418066694401e-06, + "loss": 0.8121, + "num_tokens": 66069125791.0, + "step": 15807 + }, + { + "epoch": 1.8785502079619727, + "grad_norm": 0.21925318420416084, + "learning_rate": 2.1738417353542704e-06, + "loss": 0.8134, + "num_tokens": 66073314010.0, + "step": 15808 + }, + { + "epoch": 1.8786690433749258, + "grad_norm": 0.2026793974424486, + "learning_rate": 2.1735031306270133e-06, + "loss": 0.8079, + "num_tokens": 66077503748.0, + "step": 15809 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 0.20887698179014777, + "learning_rate": 2.1731648527747774e-06, + "loss": 0.768, + "num_tokens": 66081693289.0, + "step": 15810 + }, + { + "epoch": 1.878906714200832, + "grad_norm": 0.20726888667385082, + "learning_rate": 2.1728269018100916e-06, + "loss": 0.8221, + "num_tokens": 66085879136.0, + "step": 15811 + }, + { + "epoch": 1.8790255496137849, + "grad_norm": 0.21298375862053134, + "learning_rate": 2.172489277745472e-06, + "loss": 0.8256, + "num_tokens": 66090067329.0, + "step": 15812 + }, + { + "epoch": 1.879144385026738, + "grad_norm": 0.20522138125548692, + "learning_rate": 2.1721519805934203e-06, + "loss": 0.794, + "num_tokens": 66094256354.0, + "step": 15813 + }, + { + "epoch": 1.879263220439691, + "grad_norm": 0.21842684854852887, + "learning_rate": 2.1718150103664283e-06, + "loss": 0.8124, + "num_tokens": 66098421405.0, + "step": 15814 + }, + { + "epoch": 1.879382055852644, + "grad_norm": 0.2090229292766794, + "learning_rate": 2.171478367076975e-06, + "loss": 0.8107, + "num_tokens": 66102591481.0, + "step": 15815 + }, + { + "epoch": 1.879500891265597, + "grad_norm": 0.21536985989835278, + "learning_rate": 2.1711420507375282e-06, + "loss": 0.7846, + "num_tokens": 66106764639.0, + "step": 15816 + }, + { + "epoch": 1.8796197266785502, + "grad_norm": 0.21978073257221456, + "learning_rate": 2.1708060613605424e-06, + "loss": 0.7918, + "num_tokens": 66110926756.0, + "step": 15817 + }, + { + "epoch": 1.8797385620915033, + "grad_norm": 0.2045941613323207, + "learning_rate": 2.1704703989584603e-06, + "loss": 0.8021, + "num_tokens": 66115111779.0, + "step": 15818 + }, + { + "epoch": 1.8798573975044564, + "grad_norm": 0.21714955073100983, + "learning_rate": 2.170135063543712e-06, + "loss": 0.8192, + "num_tokens": 66119273462.0, + "step": 15819 + }, + { + "epoch": 1.8799762329174095, + "grad_norm": 0.21570888655819076, + "learning_rate": 2.169800055128717e-06, + "loss": 0.7762, + "num_tokens": 66123462298.0, + "step": 15820 + }, + { + "epoch": 1.8800950683303626, + "grad_norm": 0.2066477930380587, + "learning_rate": 2.1694653737258825e-06, + "loss": 0.7744, + "num_tokens": 66127649805.0, + "step": 15821 + }, + { + "epoch": 1.8802139037433157, + "grad_norm": 0.2162027392641292, + "learning_rate": 2.1691310193476025e-06, + "loss": 0.8452, + "num_tokens": 66131839745.0, + "step": 15822 + }, + { + "epoch": 1.8803327391562685, + "grad_norm": 0.2108839325610062, + "learning_rate": 2.168796992006257e-06, + "loss": 0.7521, + "num_tokens": 66136030079.0, + "step": 15823 + }, + { + "epoch": 1.8804515745692216, + "grad_norm": 0.2090370501759334, + "learning_rate": 2.1684632917142178e-06, + "loss": 0.824, + "num_tokens": 66140194342.0, + "step": 15824 + }, + { + "epoch": 1.8805704099821747, + "grad_norm": 0.22041529566792403, + "learning_rate": 2.168129918483844e-06, + "loss": 0.7961, + "num_tokens": 66144383762.0, + "step": 15825 + }, + { + "epoch": 1.8806892453951276, + "grad_norm": 0.2092271818319218, + "learning_rate": 2.1677968723274806e-06, + "loss": 0.8254, + "num_tokens": 66148573399.0, + "step": 15826 + }, + { + "epoch": 1.8808080808080807, + "grad_norm": 0.22478736348392234, + "learning_rate": 2.167464153257461e-06, + "loss": 0.8277, + "num_tokens": 66152753219.0, + "step": 15827 + }, + { + "epoch": 1.8809269162210338, + "grad_norm": 0.21073726351158154, + "learning_rate": 2.167131761286107e-06, + "loss": 0.7876, + "num_tokens": 66156942650.0, + "step": 15828 + }, + { + "epoch": 1.881045751633987, + "grad_norm": 0.2324907855024493, + "learning_rate": 2.1667996964257293e-06, + "loss": 0.8288, + "num_tokens": 66161129958.0, + "step": 15829 + }, + { + "epoch": 1.88116458704694, + "grad_norm": 0.20423228499077745, + "learning_rate": 2.1664679586886238e-06, + "loss": 0.7955, + "num_tokens": 66165310397.0, + "step": 15830 + }, + { + "epoch": 1.881283422459893, + "grad_norm": 0.20587580798082922, + "learning_rate": 2.166136548087078e-06, + "loss": 0.8072, + "num_tokens": 66169487600.0, + "step": 15831 + }, + { + "epoch": 1.8814022578728462, + "grad_norm": 0.22079843906513105, + "learning_rate": 2.165805464633362e-06, + "loss": 0.804, + "num_tokens": 66173676229.0, + "step": 15832 + }, + { + "epoch": 1.8815210932857993, + "grad_norm": 0.20900714850721036, + "learning_rate": 2.1654747083397403e-06, + "loss": 0.8444, + "num_tokens": 66177855742.0, + "step": 15833 + }, + { + "epoch": 1.8816399286987522, + "grad_norm": 0.21515581375838946, + "learning_rate": 2.1651442792184596e-06, + "loss": 0.8135, + "num_tokens": 66182044273.0, + "step": 15834 + }, + { + "epoch": 1.8817587641117053, + "grad_norm": 0.21392849671526754, + "learning_rate": 2.1648141772817575e-06, + "loss": 0.7562, + "num_tokens": 66186212274.0, + "step": 15835 + }, + { + "epoch": 1.8818775995246584, + "grad_norm": 0.2068020912037141, + "learning_rate": 2.16448440254186e-06, + "loss": 0.8094, + "num_tokens": 66190389223.0, + "step": 15836 + }, + { + "epoch": 1.8819964349376113, + "grad_norm": 0.22192828332490347, + "learning_rate": 2.164154955010977e-06, + "loss": 0.7977, + "num_tokens": 66194572455.0, + "step": 15837 + }, + { + "epoch": 1.8821152703505644, + "grad_norm": 0.21536005218920332, + "learning_rate": 2.1638258347013114e-06, + "loss": 0.8625, + "num_tokens": 66198760714.0, + "step": 15838 + }, + { + "epoch": 1.8822341057635175, + "grad_norm": 0.22328751421878784, + "learning_rate": 2.1634970416250513e-06, + "loss": 0.7958, + "num_tokens": 66202930021.0, + "step": 15839 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.2292549345423151, + "learning_rate": 2.163168575794373e-06, + "loss": 0.7987, + "num_tokens": 66207118928.0, + "step": 15840 + }, + { + "epoch": 1.8824717765894237, + "grad_norm": 0.22057207536163806, + "learning_rate": 2.162840437221439e-06, + "loss": 0.8081, + "num_tokens": 66211272289.0, + "step": 15841 + }, + { + "epoch": 1.8825906120023768, + "grad_norm": 0.22558104352711986, + "learning_rate": 2.1625126259184035e-06, + "loss": 0.8336, + "num_tokens": 66215461716.0, + "step": 15842 + }, + { + "epoch": 1.8827094474153299, + "grad_norm": 0.2111662880428688, + "learning_rate": 2.1621851418974064e-06, + "loss": 0.8074, + "num_tokens": 66219612033.0, + "step": 15843 + }, + { + "epoch": 1.882828282828283, + "grad_norm": 0.22698456541773154, + "learning_rate": 2.161857985170574e-06, + "loss": 0.8295, + "num_tokens": 66223797392.0, + "step": 15844 + }, + { + "epoch": 1.8829471182412358, + "grad_norm": 0.2284461002107288, + "learning_rate": 2.161531155750023e-06, + "loss": 0.7984, + "num_tokens": 66227937504.0, + "step": 15845 + }, + { + "epoch": 1.883065953654189, + "grad_norm": 0.2121684816875642, + "learning_rate": 2.1612046536478574e-06, + "loss": 0.7945, + "num_tokens": 66232126258.0, + "step": 15846 + }, + { + "epoch": 1.883184789067142, + "grad_norm": 0.21797655164596563, + "learning_rate": 2.160878478876167e-06, + "loss": 0.8084, + "num_tokens": 66236294887.0, + "step": 15847 + }, + { + "epoch": 1.883303624480095, + "grad_norm": 0.2145023243865174, + "learning_rate": 2.1605526314470343e-06, + "loss": 0.7963, + "num_tokens": 66240437097.0, + "step": 15848 + }, + { + "epoch": 1.883422459893048, + "grad_norm": 0.21912638963837558, + "learning_rate": 2.1602271113725224e-06, + "loss": 0.838, + "num_tokens": 66244615414.0, + "step": 15849 + }, + { + "epoch": 1.8835412953060011, + "grad_norm": 0.2136093409816148, + "learning_rate": 2.15990191866469e-06, + "loss": 0.8406, + "num_tokens": 66248803195.0, + "step": 15850 + }, + { + "epoch": 1.8836601307189542, + "grad_norm": 0.22171655932637357, + "learning_rate": 2.159577053335577e-06, + "loss": 0.7987, + "num_tokens": 66252992622.0, + "step": 15851 + }, + { + "epoch": 1.8837789661319073, + "grad_norm": 0.2223500172756283, + "learning_rate": 2.159252515397218e-06, + "loss": 0.8108, + "num_tokens": 66257180916.0, + "step": 15852 + }, + { + "epoch": 1.8838978015448604, + "grad_norm": 0.2139612916271829, + "learning_rate": 2.1589283048616286e-06, + "loss": 0.8415, + "num_tokens": 66261368323.0, + "step": 15853 + }, + { + "epoch": 1.8840166369578135, + "grad_norm": 0.2166608797831556, + "learning_rate": 2.1586044217408165e-06, + "loss": 0.7909, + "num_tokens": 66265559199.0, + "step": 15854 + }, + { + "epoch": 1.8841354723707666, + "grad_norm": 0.23154236338699374, + "learning_rate": 2.1582808660467753e-06, + "loss": 0.8227, + "num_tokens": 66269730519.0, + "step": 15855 + }, + { + "epoch": 1.8842543077837195, + "grad_norm": 0.2192736161980772, + "learning_rate": 2.157957637791489e-06, + "loss": 0.8199, + "num_tokens": 66273920245.0, + "step": 15856 + }, + { + "epoch": 1.8843731431966726, + "grad_norm": 0.21308666575889235, + "learning_rate": 2.1576347369869273e-06, + "loss": 0.8274, + "num_tokens": 66278080131.0, + "step": 15857 + }, + { + "epoch": 1.8844919786096257, + "grad_norm": 0.22962775861831441, + "learning_rate": 2.1573121636450476e-06, + "loss": 0.8046, + "num_tokens": 66282253965.0, + "step": 15858 + }, + { + "epoch": 1.8846108140225786, + "grad_norm": 0.23189917104998684, + "learning_rate": 2.1569899177777973e-06, + "loss": 0.8152, + "num_tokens": 66286443118.0, + "step": 15859 + }, + { + "epoch": 1.8847296494355317, + "grad_norm": 0.23031793889528823, + "learning_rate": 2.156667999397108e-06, + "loss": 0.826, + "num_tokens": 66290600091.0, + "step": 15860 + }, + { + "epoch": 1.8848484848484848, + "grad_norm": 0.22493074310844025, + "learning_rate": 2.156346408514903e-06, + "loss": 0.7518, + "num_tokens": 66294790804.0, + "step": 15861 + }, + { + "epoch": 1.8849673202614379, + "grad_norm": 0.2735750817344723, + "learning_rate": 2.156025145143091e-06, + "loss": 0.8048, + "num_tokens": 66298979633.0, + "step": 15862 + }, + { + "epoch": 1.885086155674391, + "grad_norm": 0.219225468456018, + "learning_rate": 2.1557042092935718e-06, + "loss": 0.7798, + "num_tokens": 66303166976.0, + "step": 15863 + }, + { + "epoch": 1.885204991087344, + "grad_norm": 0.21589604024311543, + "learning_rate": 2.1553836009782277e-06, + "loss": 0.8114, + "num_tokens": 66307356210.0, + "step": 15864 + }, + { + "epoch": 1.8853238265002972, + "grad_norm": 0.22041842124029387, + "learning_rate": 2.1550633202089336e-06, + "loss": 0.7998, + "num_tokens": 66311490709.0, + "step": 15865 + }, + { + "epoch": 1.8854426619132503, + "grad_norm": 0.2254007549093547, + "learning_rate": 2.1547433669975497e-06, + "loss": 0.8424, + "num_tokens": 66315679834.0, + "step": 15866 + }, + { + "epoch": 1.8855614973262034, + "grad_norm": 0.21358268846561576, + "learning_rate": 2.154423741355926e-06, + "loss": 0.8076, + "num_tokens": 66319829266.0, + "step": 15867 + }, + { + "epoch": 1.8856803327391563, + "grad_norm": 0.21693535292025629, + "learning_rate": 2.1541044432958986e-06, + "loss": 0.8106, + "num_tokens": 66324017709.0, + "step": 15868 + }, + { + "epoch": 1.8857991681521094, + "grad_norm": 0.21005046086290277, + "learning_rate": 2.153785472829292e-06, + "loss": 0.7949, + "num_tokens": 66328206889.0, + "step": 15869 + }, + { + "epoch": 1.8859180035650622, + "grad_norm": 0.2119230633679664, + "learning_rate": 2.1534668299679194e-06, + "loss": 0.7825, + "num_tokens": 66332395023.0, + "step": 15870 + }, + { + "epoch": 1.8860368389780153, + "grad_norm": 0.2167500566488188, + "learning_rate": 2.1531485147235815e-06, + "loss": 0.8103, + "num_tokens": 66336583295.0, + "step": 15871 + }, + { + "epoch": 1.8861556743909684, + "grad_norm": 0.20704978268921218, + "learning_rate": 2.1528305271080656e-06, + "loss": 0.7893, + "num_tokens": 66340768544.0, + "step": 15872 + }, + { + "epoch": 1.8862745098039215, + "grad_norm": 0.20411019864848626, + "learning_rate": 2.152512867133148e-06, + "loss": 0.8196, + "num_tokens": 66344925058.0, + "step": 15873 + }, + { + "epoch": 1.8863933452168746, + "grad_norm": 0.2175720100289981, + "learning_rate": 2.152195534810593e-06, + "loss": 0.8142, + "num_tokens": 66349061710.0, + "step": 15874 + }, + { + "epoch": 1.8865121806298277, + "grad_norm": 0.2083551372392971, + "learning_rate": 2.151878530152153e-06, + "loss": 0.7994, + "num_tokens": 66353222756.0, + "step": 15875 + }, + { + "epoch": 1.8866310160427808, + "grad_norm": 0.20784963130750791, + "learning_rate": 2.1515618531695665e-06, + "loss": 0.842, + "num_tokens": 66357411587.0, + "step": 15876 + }, + { + "epoch": 1.886749851455734, + "grad_norm": 0.21713536098721248, + "learning_rate": 2.1512455038745618e-06, + "loss": 0.826, + "num_tokens": 66361599235.0, + "step": 15877 + }, + { + "epoch": 1.886868686868687, + "grad_norm": 0.21873384605342713, + "learning_rate": 2.1509294822788547e-06, + "loss": 0.7811, + "num_tokens": 66365781668.0, + "step": 15878 + }, + { + "epoch": 1.88698752228164, + "grad_norm": 0.21071391026122885, + "learning_rate": 2.1506137883941492e-06, + "loss": 0.8047, + "num_tokens": 66369954133.0, + "step": 15879 + }, + { + "epoch": 1.887106357694593, + "grad_norm": 0.22700177647193873, + "learning_rate": 2.1502984222321353e-06, + "loss": 0.8016, + "num_tokens": 66374133121.0, + "step": 15880 + }, + { + "epoch": 1.8872251931075459, + "grad_norm": 0.2176279376217721, + "learning_rate": 2.1499833838044913e-06, + "loss": 0.7951, + "num_tokens": 66378321190.0, + "step": 15881 + }, + { + "epoch": 1.887344028520499, + "grad_norm": 0.2178202862048008, + "learning_rate": 2.1496686731228855e-06, + "loss": 0.7725, + "num_tokens": 66382508874.0, + "step": 15882 + }, + { + "epoch": 1.887462863933452, + "grad_norm": 0.21742747049100986, + "learning_rate": 2.1493542901989724e-06, + "loss": 0.789, + "num_tokens": 66386698893.0, + "step": 15883 + }, + { + "epoch": 1.8875816993464052, + "grad_norm": 0.20827053965034964, + "learning_rate": 2.1490402350443964e-06, + "loss": 0.7954, + "num_tokens": 66390870910.0, + "step": 15884 + }, + { + "epoch": 1.8877005347593583, + "grad_norm": 0.21301088589809267, + "learning_rate": 2.148726507670785e-06, + "loss": 0.7852, + "num_tokens": 66395061748.0, + "step": 15885 + }, + { + "epoch": 1.8878193701723114, + "grad_norm": 0.21039844617654854, + "learning_rate": 2.148413108089757e-06, + "loss": 0.8497, + "num_tokens": 66399226930.0, + "step": 15886 + }, + { + "epoch": 1.8879382055852645, + "grad_norm": 0.20956775517686765, + "learning_rate": 2.14810003631292e-06, + "loss": 0.8201, + "num_tokens": 66403416300.0, + "step": 15887 + }, + { + "epoch": 1.8880570409982176, + "grad_norm": 0.20608190481186772, + "learning_rate": 2.1477872923518673e-06, + "loss": 0.8138, + "num_tokens": 66407603658.0, + "step": 15888 + }, + { + "epoch": 1.8881758764111707, + "grad_norm": 0.2118357431027472, + "learning_rate": 2.1474748762181835e-06, + "loss": 0.8145, + "num_tokens": 66411793278.0, + "step": 15889 + }, + { + "epoch": 1.8882947118241236, + "grad_norm": 0.2105420203272982, + "learning_rate": 2.1471627879234342e-06, + "loss": 0.7828, + "num_tokens": 66415983983.0, + "step": 15890 + }, + { + "epoch": 1.8884135472370767, + "grad_norm": 0.2098462701526106, + "learning_rate": 2.146851027479179e-06, + "loss": 0.7898, + "num_tokens": 66420172167.0, + "step": 15891 + }, + { + "epoch": 1.8885323826500298, + "grad_norm": 0.21382651357294274, + "learning_rate": 2.146539594896964e-06, + "loss": 0.7975, + "num_tokens": 66424360260.0, + "step": 15892 + }, + { + "epoch": 1.8886512180629826, + "grad_norm": 0.2120850341147547, + "learning_rate": 2.1462284901883214e-06, + "loss": 0.8022, + "num_tokens": 66428545815.0, + "step": 15893 + }, + { + "epoch": 1.8887700534759357, + "grad_norm": 0.21258550066842236, + "learning_rate": 2.1459177133647737e-06, + "loss": 0.795, + "num_tokens": 66432707152.0, + "step": 15894 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.21423697351547447, + "learning_rate": 2.145607264437829e-06, + "loss": 0.8301, + "num_tokens": 66436897324.0, + "step": 15895 + }, + { + "epoch": 1.889007724301842, + "grad_norm": 0.2092755780890291, + "learning_rate": 2.145297143418985e-06, + "loss": 0.7818, + "num_tokens": 66441087020.0, + "step": 15896 + }, + { + "epoch": 1.889126559714795, + "grad_norm": 0.21143515546068645, + "learning_rate": 2.1449873503197264e-06, + "loss": 0.8006, + "num_tokens": 66445276150.0, + "step": 15897 + }, + { + "epoch": 1.8892453951277481, + "grad_norm": 0.22248824116705423, + "learning_rate": 2.1446778851515254e-06, + "loss": 0.801, + "num_tokens": 66449464876.0, + "step": 15898 + }, + { + "epoch": 1.8893642305407012, + "grad_norm": 0.21003081440363275, + "learning_rate": 2.1443687479258434e-06, + "loss": 0.7602, + "num_tokens": 66453653606.0, + "step": 15899 + }, + { + "epoch": 1.8894830659536543, + "grad_norm": 0.2173904659643525, + "learning_rate": 2.1440599386541285e-06, + "loss": 0.8056, + "num_tokens": 66457827518.0, + "step": 15900 + }, + { + "epoch": 1.8896019013666072, + "grad_norm": 0.2224678599494593, + "learning_rate": 2.143751457347815e-06, + "loss": 0.8009, + "num_tokens": 66461995782.0, + "step": 15901 + }, + { + "epoch": 1.8897207367795603, + "grad_norm": 0.21748965474689685, + "learning_rate": 2.1434433040183307e-06, + "loss": 0.8146, + "num_tokens": 66466161132.0, + "step": 15902 + }, + { + "epoch": 1.8898395721925134, + "grad_norm": 0.23135256122689532, + "learning_rate": 2.1431354786770847e-06, + "loss": 0.8201, + "num_tokens": 66470331828.0, + "step": 15903 + }, + { + "epoch": 1.8899584076054663, + "grad_norm": 0.23232978275898197, + "learning_rate": 2.142827981335477e-06, + "loss": 0.7853, + "num_tokens": 66474521359.0, + "step": 15904 + }, + { + "epoch": 1.8900772430184194, + "grad_norm": 0.22584787367135872, + "learning_rate": 2.142520812004898e-06, + "loss": 0.7655, + "num_tokens": 66478704298.0, + "step": 15905 + }, + { + "epoch": 1.8901960784313725, + "grad_norm": 0.24831969930744272, + "learning_rate": 2.142213970696719e-06, + "loss": 0.8073, + "num_tokens": 66482891122.0, + "step": 15906 + }, + { + "epoch": 1.8903149138443256, + "grad_norm": 0.2540560244226821, + "learning_rate": 2.1419074574223065e-06, + "loss": 0.7706, + "num_tokens": 66487082280.0, + "step": 15907 + }, + { + "epoch": 1.8904337492572787, + "grad_norm": 0.2427626758901731, + "learning_rate": 2.1416012721930108e-06, + "loss": 0.7923, + "num_tokens": 66491272343.0, + "step": 15908 + }, + { + "epoch": 1.8905525846702318, + "grad_norm": 0.23342698521427083, + "learning_rate": 2.1412954150201697e-06, + "loss": 0.7986, + "num_tokens": 66495447779.0, + "step": 15909 + }, + { + "epoch": 1.890671420083185, + "grad_norm": 0.24943375036751997, + "learning_rate": 2.140989885915114e-06, + "loss": 0.8235, + "num_tokens": 66499588978.0, + "step": 15910 + }, + { + "epoch": 1.890790255496138, + "grad_norm": 0.2274852083831456, + "learning_rate": 2.1406846848891542e-06, + "loss": 0.7846, + "num_tokens": 66503776712.0, + "step": 15911 + }, + { + "epoch": 1.8909090909090909, + "grad_norm": 0.22771910819690344, + "learning_rate": 2.1403798119535944e-06, + "loss": 0.8164, + "num_tokens": 66507967172.0, + "step": 15912 + }, + { + "epoch": 1.891027926322044, + "grad_norm": 0.22588434974624108, + "learning_rate": 2.140075267119725e-06, + "loss": 0.7812, + "num_tokens": 66512129142.0, + "step": 15913 + }, + { + "epoch": 1.891146761734997, + "grad_norm": 0.25405209962542696, + "learning_rate": 2.1397710503988255e-06, + "loss": 0.7931, + "num_tokens": 66516304940.0, + "step": 15914 + }, + { + "epoch": 1.89126559714795, + "grad_norm": 0.21368234126810381, + "learning_rate": 2.1394671618021607e-06, + "loss": 0.8262, + "num_tokens": 66520494181.0, + "step": 15915 + }, + { + "epoch": 1.891384432560903, + "grad_norm": 0.2458311196578673, + "learning_rate": 2.139163601340985e-06, + "loss": 0.7956, + "num_tokens": 66524647917.0, + "step": 15916 + }, + { + "epoch": 1.8915032679738562, + "grad_norm": 0.2277482438416937, + "learning_rate": 2.138860369026541e-06, + "loss": 0.7887, + "num_tokens": 66528836867.0, + "step": 15917 + }, + { + "epoch": 1.8916221033868093, + "grad_norm": 0.2121410654710994, + "learning_rate": 2.138557464870058e-06, + "loss": 0.7932, + "num_tokens": 66533006443.0, + "step": 15918 + }, + { + "epoch": 1.8917409387997624, + "grad_norm": 0.22518199272014872, + "learning_rate": 2.1382548888827517e-06, + "loss": 0.7893, + "num_tokens": 66537185137.0, + "step": 15919 + }, + { + "epoch": 1.8918597742127155, + "grad_norm": 0.21594055587053881, + "learning_rate": 2.13795264107583e-06, + "loss": 0.8304, + "num_tokens": 66541374415.0, + "step": 15920 + }, + { + "epoch": 1.8919786096256686, + "grad_norm": 0.22456080833486716, + "learning_rate": 2.1376507214604854e-06, + "loss": 0.8118, + "num_tokens": 66545563154.0, + "step": 15921 + }, + { + "epoch": 1.8920974450386217, + "grad_norm": 0.22025617128552344, + "learning_rate": 2.1373491300478984e-06, + "loss": 0.8247, + "num_tokens": 66549753276.0, + "step": 15922 + }, + { + "epoch": 1.8922162804515745, + "grad_norm": 0.22399753406225417, + "learning_rate": 2.137047866849239e-06, + "loss": 0.8014, + "num_tokens": 66553943858.0, + "step": 15923 + }, + { + "epoch": 1.8923351158645276, + "grad_norm": 0.220178359930885, + "learning_rate": 2.1367469318756634e-06, + "loss": 0.8161, + "num_tokens": 66558132845.0, + "step": 15924 + }, + { + "epoch": 1.8924539512774807, + "grad_norm": 0.21625507263997631, + "learning_rate": 2.136446325138316e-06, + "loss": 0.8333, + "num_tokens": 66562261939.0, + "step": 15925 + }, + { + "epoch": 1.8925727866904336, + "grad_norm": 0.2166293670993875, + "learning_rate": 2.13614604664833e-06, + "loss": 0.7937, + "num_tokens": 66566429400.0, + "step": 15926 + }, + { + "epoch": 1.8926916221033867, + "grad_norm": 0.22526805047939347, + "learning_rate": 2.135846096416825e-06, + "loss": 0.8097, + "num_tokens": 66570617319.0, + "step": 15927 + }, + { + "epoch": 1.8928104575163398, + "grad_norm": 0.22925758824435036, + "learning_rate": 2.135546474454909e-06, + "loss": 0.7911, + "num_tokens": 66574806829.0, + "step": 15928 + }, + { + "epoch": 1.892929292929293, + "grad_norm": 0.21912747514643643, + "learning_rate": 2.13524718077368e-06, + "loss": 0.7845, + "num_tokens": 66578996967.0, + "step": 15929 + }, + { + "epoch": 1.893048128342246, + "grad_norm": 0.2381102497090794, + "learning_rate": 2.134948215384219e-06, + "loss": 0.7997, + "num_tokens": 66583141997.0, + "step": 15930 + }, + { + "epoch": 1.893166963755199, + "grad_norm": 0.2175222073356891, + "learning_rate": 2.1346495782976003e-06, + "loss": 0.8018, + "num_tokens": 66587319846.0, + "step": 15931 + }, + { + "epoch": 1.8932857991681522, + "grad_norm": 0.23043974812941087, + "learning_rate": 2.13435126952488e-06, + "loss": 0.8105, + "num_tokens": 66591508029.0, + "step": 15932 + }, + { + "epoch": 1.8934046345811053, + "grad_norm": 0.21830694740631662, + "learning_rate": 2.1340532890771087e-06, + "loss": 0.8019, + "num_tokens": 66595698048.0, + "step": 15933 + }, + { + "epoch": 1.8935234699940582, + "grad_norm": 0.22493779689542698, + "learning_rate": 2.1337556369653205e-06, + "loss": 0.8661, + "num_tokens": 66599869246.0, + "step": 15934 + }, + { + "epoch": 1.8936423054070113, + "grad_norm": 0.2198838809801964, + "learning_rate": 2.133458313200539e-06, + "loss": 0.8107, + "num_tokens": 66604057224.0, + "step": 15935 + }, + { + "epoch": 1.8937611408199644, + "grad_norm": 0.2278295739295857, + "learning_rate": 2.1331613177937753e-06, + "loss": 0.85, + "num_tokens": 66608221313.0, + "step": 15936 + }, + { + "epoch": 1.8938799762329173, + "grad_norm": 0.23678033736490053, + "learning_rate": 2.1328646507560267e-06, + "loss": 0.8139, + "num_tokens": 66612383880.0, + "step": 15937 + }, + { + "epoch": 1.8939988116458704, + "grad_norm": 0.21960969155166296, + "learning_rate": 2.1325683120982803e-06, + "loss": 0.7609, + "num_tokens": 66616573720.0, + "step": 15938 + }, + { + "epoch": 1.8941176470588235, + "grad_norm": 0.22299180279527323, + "learning_rate": 2.1322723018315108e-06, + "loss": 0.7824, + "num_tokens": 66620737766.0, + "step": 15939 + }, + { + "epoch": 1.8942364824717766, + "grad_norm": 0.21944047804058667, + "learning_rate": 2.1319766199666802e-06, + "loss": 0.7949, + "num_tokens": 66624907039.0, + "step": 15940 + }, + { + "epoch": 1.8943553178847297, + "grad_norm": 0.22737804860551317, + "learning_rate": 2.1316812665147397e-06, + "loss": 0.7881, + "num_tokens": 66629096510.0, + "step": 15941 + }, + { + "epoch": 1.8944741532976828, + "grad_norm": 0.22386344898379792, + "learning_rate": 2.1313862414866264e-06, + "loss": 0.7825, + "num_tokens": 66633285443.0, + "step": 15942 + }, + { + "epoch": 1.8945929887106359, + "grad_norm": 0.2167708771759297, + "learning_rate": 2.131091544893266e-06, + "loss": 0.8053, + "num_tokens": 66637473536.0, + "step": 15943 + }, + { + "epoch": 1.894711824123589, + "grad_norm": 0.2220107510474609, + "learning_rate": 2.1307971767455706e-06, + "loss": 0.7995, + "num_tokens": 66641660731.0, + "step": 15944 + }, + { + "epoch": 1.8948306595365418, + "grad_norm": 0.22024741928266406, + "learning_rate": 2.130503137054445e-06, + "loss": 0.8015, + "num_tokens": 66645849817.0, + "step": 15945 + }, + { + "epoch": 1.894949494949495, + "grad_norm": 0.2176192969789804, + "learning_rate": 2.130209425830777e-06, + "loss": 0.8058, + "num_tokens": 66650019510.0, + "step": 15946 + }, + { + "epoch": 1.895068330362448, + "grad_norm": 0.209730470950664, + "learning_rate": 2.129916043085442e-06, + "loss": 0.7961, + "num_tokens": 66654209458.0, + "step": 15947 + }, + { + "epoch": 1.895187165775401, + "grad_norm": 0.23648362506040252, + "learning_rate": 2.129622988829307e-06, + "loss": 0.8153, + "num_tokens": 66658371020.0, + "step": 15948 + }, + { + "epoch": 1.895306001188354, + "grad_norm": 0.222927298899941, + "learning_rate": 2.129330263073224e-06, + "loss": 0.8115, + "num_tokens": 66662561200.0, + "step": 15949 + }, + { + "epoch": 1.8954248366013071, + "grad_norm": 0.2325089145448991, + "learning_rate": 2.129037865828035e-06, + "loss": 0.8321, + "num_tokens": 66666750455.0, + "step": 15950 + }, + { + "epoch": 1.8955436720142602, + "grad_norm": 0.21275545500916573, + "learning_rate": 2.128745797104566e-06, + "loss": 0.8219, + "num_tokens": 66670939297.0, + "step": 15951 + }, + { + "epoch": 1.8956625074272133, + "grad_norm": 0.21442951827713055, + "learning_rate": 2.1284540569136345e-06, + "loss": 0.8085, + "num_tokens": 66675128193.0, + "step": 15952 + }, + { + "epoch": 1.8957813428401664, + "grad_norm": 0.22181200313612207, + "learning_rate": 2.128162645266046e-06, + "loss": 0.8328, + "num_tokens": 66679306451.0, + "step": 15953 + }, + { + "epoch": 1.8959001782531195, + "grad_norm": 0.20791023731953573, + "learning_rate": 2.1278715621725897e-06, + "loss": 0.8401, + "num_tokens": 66683497064.0, + "step": 15954 + }, + { + "epoch": 1.8960190136660726, + "grad_norm": 0.21992352597005593, + "learning_rate": 2.1275808076440475e-06, + "loss": 0.807, + "num_tokens": 66687657751.0, + "step": 15955 + }, + { + "epoch": 1.8961378490790255, + "grad_norm": 0.21312468880611485, + "learning_rate": 2.1272903816911853e-06, + "loss": 0.7881, + "num_tokens": 66691822657.0, + "step": 15956 + }, + { + "epoch": 1.8962566844919786, + "grad_norm": 0.20656896700148678, + "learning_rate": 2.1270002843247604e-06, + "loss": 0.8056, + "num_tokens": 66696012657.0, + "step": 15957 + }, + { + "epoch": 1.8963755199049317, + "grad_norm": 0.20705198512832831, + "learning_rate": 2.1267105155555164e-06, + "loss": 0.7901, + "num_tokens": 66700202370.0, + "step": 15958 + }, + { + "epoch": 1.8964943553178846, + "grad_norm": 0.22214041263715864, + "learning_rate": 2.126421075394182e-06, + "loss": 0.771, + "num_tokens": 66704318216.0, + "step": 15959 + }, + { + "epoch": 1.8966131907308377, + "grad_norm": 0.21779301296753856, + "learning_rate": 2.1261319638514776e-06, + "loss": 0.7811, + "num_tokens": 66708509557.0, + "step": 15960 + }, + { + "epoch": 1.8967320261437908, + "grad_norm": 0.2262564399554972, + "learning_rate": 2.12584318093811e-06, + "loss": 0.8085, + "num_tokens": 66712689721.0, + "step": 15961 + }, + { + "epoch": 1.8968508615567439, + "grad_norm": 0.2191418722545569, + "learning_rate": 2.1255547266647742e-06, + "loss": 0.7939, + "num_tokens": 66716876574.0, + "step": 15962 + }, + { + "epoch": 1.896969696969697, + "grad_norm": 0.21267803207299432, + "learning_rate": 2.1252666010421514e-06, + "loss": 0.7828, + "num_tokens": 66721066229.0, + "step": 15963 + }, + { + "epoch": 1.89708853238265, + "grad_norm": 0.21823042764236272, + "learning_rate": 2.1249788040809117e-06, + "loss": 0.8217, + "num_tokens": 66725254983.0, + "step": 15964 + }, + { + "epoch": 1.8972073677956032, + "grad_norm": 0.22607779058060967, + "learning_rate": 2.124691335791715e-06, + "loss": 0.8112, + "num_tokens": 66729425940.0, + "step": 15965 + }, + { + "epoch": 1.8973262032085563, + "grad_norm": 0.2182105511300625, + "learning_rate": 2.124404196185207e-06, + "loss": 0.7896, + "num_tokens": 66733562717.0, + "step": 15966 + }, + { + "epoch": 1.8974450386215094, + "grad_norm": 0.21225672879222604, + "learning_rate": 2.12411738527202e-06, + "loss": 0.7804, + "num_tokens": 66737733530.0, + "step": 15967 + }, + { + "epoch": 1.8975638740344623, + "grad_norm": 0.22249871820203707, + "learning_rate": 2.123830903062777e-06, + "loss": 0.8261, + "num_tokens": 66741923201.0, + "step": 15968 + }, + { + "epoch": 1.8976827094474154, + "grad_norm": 0.23553623566218151, + "learning_rate": 2.123544749568086e-06, + "loss": 0.8248, + "num_tokens": 66746112436.0, + "step": 15969 + }, + { + "epoch": 1.8978015448603682, + "grad_norm": 0.21972449913673767, + "learning_rate": 2.1232589247985434e-06, + "loss": 0.7412, + "num_tokens": 66750287521.0, + "step": 15970 + }, + { + "epoch": 1.8979203802733213, + "grad_norm": 0.21987576547438747, + "learning_rate": 2.1229734287647378e-06, + "loss": 0.8137, + "num_tokens": 66754476155.0, + "step": 15971 + }, + { + "epoch": 1.8980392156862744, + "grad_norm": 0.2230906697604457, + "learning_rate": 2.1226882614772396e-06, + "loss": 0.8066, + "num_tokens": 66758665952.0, + "step": 15972 + }, + { + "epoch": 1.8981580510992275, + "grad_norm": 0.21661801008103265, + "learning_rate": 2.1224034229466107e-06, + "loss": 0.7987, + "num_tokens": 66762799119.0, + "step": 15973 + }, + { + "epoch": 1.8982768865121806, + "grad_norm": 0.2165982389116012, + "learning_rate": 2.122118913183398e-06, + "loss": 0.8251, + "num_tokens": 66766987030.0, + "step": 15974 + }, + { + "epoch": 1.8983957219251337, + "grad_norm": 0.21201960106110515, + "learning_rate": 2.1218347321981383e-06, + "loss": 0.7868, + "num_tokens": 66771176372.0, + "step": 15975 + }, + { + "epoch": 1.8985145573380868, + "grad_norm": 0.21080237064417445, + "learning_rate": 2.1215508800013572e-06, + "loss": 0.7942, + "num_tokens": 66775357771.0, + "step": 15976 + }, + { + "epoch": 1.89863339275104, + "grad_norm": 0.21920581196417338, + "learning_rate": 2.121267356603565e-06, + "loss": 0.7965, + "num_tokens": 66779547879.0, + "step": 15977 + }, + { + "epoch": 1.898752228163993, + "grad_norm": 0.20621648975925014, + "learning_rate": 2.120984162015264e-06, + "loss": 0.7947, + "num_tokens": 66783737939.0, + "step": 15978 + }, + { + "epoch": 1.898871063576946, + "grad_norm": 0.21062591503229153, + "learning_rate": 2.1207012962469384e-06, + "loss": 0.789, + "num_tokens": 66787911457.0, + "step": 15979 + }, + { + "epoch": 1.898989898989899, + "grad_norm": 0.21781053705015888, + "learning_rate": 2.120418759309066e-06, + "loss": 0.8061, + "num_tokens": 66792074955.0, + "step": 15980 + }, + { + "epoch": 1.8991087344028519, + "grad_norm": 0.2108292907628564, + "learning_rate": 2.1201365512121097e-06, + "loss": 0.807, + "num_tokens": 66796264716.0, + "step": 15981 + }, + { + "epoch": 1.899227569815805, + "grad_norm": 0.22403975316883273, + "learning_rate": 2.1198546719665186e-06, + "loss": 0.7912, + "num_tokens": 66800454108.0, + "step": 15982 + }, + { + "epoch": 1.899346405228758, + "grad_norm": 0.23134780645672162, + "learning_rate": 2.1195731215827364e-06, + "loss": 0.7959, + "num_tokens": 66804633545.0, + "step": 15983 + }, + { + "epoch": 1.8994652406417112, + "grad_norm": 0.21707564556001505, + "learning_rate": 2.1192919000711847e-06, + "loss": 0.8303, + "num_tokens": 66808821460.0, + "step": 15984 + }, + { + "epoch": 1.8995840760546643, + "grad_norm": 0.21759822621237, + "learning_rate": 2.1190110074422822e-06, + "loss": 0.8086, + "num_tokens": 66812980689.0, + "step": 15985 + }, + { + "epoch": 1.8997029114676174, + "grad_norm": 0.22614707477684634, + "learning_rate": 2.118730443706428e-06, + "loss": 0.8002, + "num_tokens": 66817122989.0, + "step": 15986 + }, + { + "epoch": 1.8998217468805705, + "grad_norm": 0.20850460861213674, + "learning_rate": 2.118450208874014e-06, + "loss": 0.8083, + "num_tokens": 66821305411.0, + "step": 15987 + }, + { + "epoch": 1.8999405822935236, + "grad_norm": 0.21226416355956548, + "learning_rate": 2.118170302955419e-06, + "loss": 0.8122, + "num_tokens": 66825487777.0, + "step": 15988 + }, + { + "epoch": 1.9000594177064767, + "grad_norm": 0.21764349000778996, + "learning_rate": 2.1178907259610065e-06, + "loss": 0.8118, + "num_tokens": 66829678133.0, + "step": 15989 + }, + { + "epoch": 1.9001782531194296, + "grad_norm": 0.2205574223761649, + "learning_rate": 2.117611477901132e-06, + "loss": 0.8234, + "num_tokens": 66833842604.0, + "step": 15990 + }, + { + "epoch": 1.9002970885323827, + "grad_norm": 0.222089346678245, + "learning_rate": 2.1173325587861366e-06, + "loss": 0.8318, + "num_tokens": 66838004175.0, + "step": 15991 + }, + { + "epoch": 1.9004159239453358, + "grad_norm": 0.20854157411824967, + "learning_rate": 2.117053968626349e-06, + "loss": 0.8178, + "num_tokens": 66842194374.0, + "step": 15992 + }, + { + "epoch": 1.9005347593582886, + "grad_norm": 0.2260031052509804, + "learning_rate": 2.116775707432088e-06, + "loss": 0.8035, + "num_tokens": 66846356385.0, + "step": 15993 + }, + { + "epoch": 1.9006535947712417, + "grad_norm": 0.21958777304304564, + "learning_rate": 2.1164977752136578e-06, + "loss": 0.7885, + "num_tokens": 66850546544.0, + "step": 15994 + }, + { + "epoch": 1.9007724301841948, + "grad_norm": 0.22164167647842542, + "learning_rate": 2.1162201719813493e-06, + "loss": 0.7971, + "num_tokens": 66854733534.0, + "step": 15995 + }, + { + "epoch": 1.900891265597148, + "grad_norm": 0.21500434993729076, + "learning_rate": 2.115942897745445e-06, + "loss": 0.8022, + "num_tokens": 66858923142.0, + "step": 15996 + }, + { + "epoch": 1.901010101010101, + "grad_norm": 0.23586834589888464, + "learning_rate": 2.1156659525162133e-06, + "loss": 0.8257, + "num_tokens": 66863083803.0, + "step": 15997 + }, + { + "epoch": 1.9011289364230541, + "grad_norm": 0.2171160909652168, + "learning_rate": 2.1153893363039095e-06, + "loss": 0.8151, + "num_tokens": 66867265722.0, + "step": 15998 + }, + { + "epoch": 1.9012477718360072, + "grad_norm": 0.21632575005428456, + "learning_rate": 2.1151130491187795e-06, + "loss": 0.7807, + "num_tokens": 66871456380.0, + "step": 15999 + }, + { + "epoch": 1.9013666072489603, + "grad_norm": 0.23673053605018468, + "learning_rate": 2.1148370909710527e-06, + "loss": 0.8198, + "num_tokens": 66875645999.0, + "step": 16000 + }, + { + "epoch": 1.9014854426619132, + "grad_norm": 0.23121056511017835, + "learning_rate": 2.1145614618709497e-06, + "loss": 0.7753, + "num_tokens": 66879836364.0, + "step": 16001 + }, + { + "epoch": 1.9016042780748663, + "grad_norm": 0.22931531403833136, + "learning_rate": 2.1142861618286776e-06, + "loss": 0.8278, + "num_tokens": 66884003934.0, + "step": 16002 + }, + { + "epoch": 1.9017231134878194, + "grad_norm": 0.2245086342080649, + "learning_rate": 2.114011190854434e-06, + "loss": 0.7628, + "num_tokens": 66888193299.0, + "step": 16003 + }, + { + "epoch": 1.9018419489007723, + "grad_norm": 0.2366956359112473, + "learning_rate": 2.1137365489583994e-06, + "loss": 0.8255, + "num_tokens": 66892332858.0, + "step": 16004 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 0.22465589620420004, + "learning_rate": 2.1134622361507456e-06, + "loss": 0.8331, + "num_tokens": 66896523178.0, + "step": 16005 + }, + { + "epoch": 1.9020796197266785, + "grad_norm": 0.23373089134923875, + "learning_rate": 2.1131882524416312e-06, + "loss": 0.8149, + "num_tokens": 66900713119.0, + "step": 16006 + }, + { + "epoch": 1.9021984551396316, + "grad_norm": 0.23791106788450656, + "learning_rate": 2.1129145978412033e-06, + "loss": 0.8454, + "num_tokens": 66904903724.0, + "step": 16007 + }, + { + "epoch": 1.9023172905525847, + "grad_norm": 0.20836472520262334, + "learning_rate": 2.1126412723595944e-06, + "loss": 0.8098, + "num_tokens": 66909094522.0, + "step": 16008 + }, + { + "epoch": 1.9024361259655378, + "grad_norm": 0.2243399938335725, + "learning_rate": 2.1123682760069296e-06, + "loss": 0.7842, + "num_tokens": 66913255840.0, + "step": 16009 + }, + { + "epoch": 1.902554961378491, + "grad_norm": 0.23495923881960845, + "learning_rate": 2.112095608793317e-06, + "loss": 0.7992, + "num_tokens": 66917423359.0, + "step": 16010 + }, + { + "epoch": 1.902673796791444, + "grad_norm": 0.20971307002948936, + "learning_rate": 2.1118232707288542e-06, + "loss": 0.8079, + "num_tokens": 66921612417.0, + "step": 16011 + }, + { + "epoch": 1.9027926322043969, + "grad_norm": 0.21749566526587136, + "learning_rate": 2.1115512618236273e-06, + "loss": 0.796, + "num_tokens": 66925788749.0, + "step": 16012 + }, + { + "epoch": 1.90291146761735, + "grad_norm": 0.22172222151173845, + "learning_rate": 2.11127958208771e-06, + "loss": 0.7642, + "num_tokens": 66929978598.0, + "step": 16013 + }, + { + "epoch": 1.903030303030303, + "grad_norm": 0.22502441920003002, + "learning_rate": 2.1110082315311634e-06, + "loss": 0.7604, + "num_tokens": 66934140922.0, + "step": 16014 + }, + { + "epoch": 1.903149138443256, + "grad_norm": 0.22557831688521351, + "learning_rate": 2.110737210164036e-06, + "loss": 0.7884, + "num_tokens": 66938330671.0, + "step": 16015 + }, + { + "epoch": 1.903267973856209, + "grad_norm": 0.22820387308952217, + "learning_rate": 2.1104665179963652e-06, + "loss": 0.8452, + "num_tokens": 66942518167.0, + "step": 16016 + }, + { + "epoch": 1.9033868092691621, + "grad_norm": 0.22686098786771497, + "learning_rate": 2.1101961550381756e-06, + "loss": 0.8314, + "num_tokens": 66946708592.0, + "step": 16017 + }, + { + "epoch": 1.9035056446821152, + "grad_norm": 0.2114108878111789, + "learning_rate": 2.109926121299479e-06, + "loss": 0.7851, + "num_tokens": 66950899548.0, + "step": 16018 + }, + { + "epoch": 1.9036244800950683, + "grad_norm": 0.21926787690802324, + "learning_rate": 2.1096564167902773e-06, + "loss": 0.7693, + "num_tokens": 66955088606.0, + "step": 16019 + }, + { + "epoch": 1.9037433155080214, + "grad_norm": 0.22166598921155356, + "learning_rate": 2.1093870415205568e-06, + "loss": 0.7785, + "num_tokens": 66959255688.0, + "step": 16020 + }, + { + "epoch": 1.9038621509209745, + "grad_norm": 0.2130507188696818, + "learning_rate": 2.1091179955002937e-06, + "loss": 0.8167, + "num_tokens": 66963440759.0, + "step": 16021 + }, + { + "epoch": 1.9039809863339276, + "grad_norm": 0.22770092025545113, + "learning_rate": 2.108849278739453e-06, + "loss": 0.818, + "num_tokens": 66967630125.0, + "step": 16022 + }, + { + "epoch": 1.9040998217468805, + "grad_norm": 0.22484412495731784, + "learning_rate": 2.1085808912479836e-06, + "loss": 0.8272, + "num_tokens": 66971819243.0, + "step": 16023 + }, + { + "epoch": 1.9042186571598336, + "grad_norm": 0.2078488354965303, + "learning_rate": 2.1083128330358274e-06, + "loss": 0.834, + "num_tokens": 66975978593.0, + "step": 16024 + }, + { + "epoch": 1.9043374925727867, + "grad_norm": 0.22516057765119651, + "learning_rate": 2.108045104112911e-06, + "loss": 0.8066, + "num_tokens": 66980168561.0, + "step": 16025 + }, + { + "epoch": 1.9044563279857396, + "grad_norm": 0.21547173155787958, + "learning_rate": 2.1077777044891484e-06, + "loss": 0.8086, + "num_tokens": 66984356152.0, + "step": 16026 + }, + { + "epoch": 1.9045751633986927, + "grad_norm": 0.22945584900696495, + "learning_rate": 2.107510634174442e-06, + "loss": 0.8187, + "num_tokens": 66988544545.0, + "step": 16027 + }, + { + "epoch": 1.9046939988116458, + "grad_norm": 0.2098015970516306, + "learning_rate": 2.1072438931786835e-06, + "loss": 0.7942, + "num_tokens": 66992730454.0, + "step": 16028 + }, + { + "epoch": 1.904812834224599, + "grad_norm": 0.21941631968125733, + "learning_rate": 2.106977481511751e-06, + "loss": 0.801, + "num_tokens": 66996920382.0, + "step": 16029 + }, + { + "epoch": 1.904931669637552, + "grad_norm": 0.2411575433931395, + "learning_rate": 2.106711399183511e-06, + "loss": 0.7875, + "num_tokens": 67001105824.0, + "step": 16030 + }, + { + "epoch": 1.905050505050505, + "grad_norm": 0.21977070827742612, + "learning_rate": 2.1064456462038157e-06, + "loss": 0.8065, + "num_tokens": 67005295638.0, + "step": 16031 + }, + { + "epoch": 1.9051693404634582, + "grad_norm": 0.21130142330512855, + "learning_rate": 2.106180222582508e-06, + "loss": 0.8099, + "num_tokens": 67009482184.0, + "step": 16032 + }, + { + "epoch": 1.9052881758764113, + "grad_norm": 0.21858326525188218, + "learning_rate": 2.105915128329417e-06, + "loss": 0.8281, + "num_tokens": 67013666067.0, + "step": 16033 + }, + { + "epoch": 1.9054070112893642, + "grad_norm": 0.22335953424251526, + "learning_rate": 2.1056503634543617e-06, + "loss": 0.7625, + "num_tokens": 67017854831.0, + "step": 16034 + }, + { + "epoch": 1.9055258467023173, + "grad_norm": 0.21451931665171722, + "learning_rate": 2.1053859279671453e-06, + "loss": 0.8128, + "num_tokens": 67022027466.0, + "step": 16035 + }, + { + "epoch": 1.9056446821152704, + "grad_norm": 0.23403649266901846, + "learning_rate": 2.105121821877561e-06, + "loss": 0.8108, + "num_tokens": 67026191343.0, + "step": 16036 + }, + { + "epoch": 1.9057635175282233, + "grad_norm": 0.22574802762606488, + "learning_rate": 2.1048580451953907e-06, + "loss": 0.8289, + "num_tokens": 67030380870.0, + "step": 16037 + }, + { + "epoch": 1.9058823529411764, + "grad_norm": 0.22831114282914047, + "learning_rate": 2.1045945979304012e-06, + "loss": 0.7711, + "num_tokens": 67034570614.0, + "step": 16038 + }, + { + "epoch": 1.9060011883541295, + "grad_norm": 0.2197774803672414, + "learning_rate": 2.104331480092351e-06, + "loss": 0.8346, + "num_tokens": 67038757782.0, + "step": 16039 + }, + { + "epoch": 1.9061200237670826, + "grad_norm": 0.2231289992603484, + "learning_rate": 2.1040686916909823e-06, + "loss": 0.8106, + "num_tokens": 67042905110.0, + "step": 16040 + }, + { + "epoch": 1.9062388591800357, + "grad_norm": 0.21780688792564798, + "learning_rate": 2.1038062327360283e-06, + "loss": 0.8037, + "num_tokens": 67047093593.0, + "step": 16041 + }, + { + "epoch": 1.9063576945929888, + "grad_norm": 0.22126976735898968, + "learning_rate": 2.103544103237208e-06, + "loss": 0.7857, + "num_tokens": 67051284246.0, + "step": 16042 + }, + { + "epoch": 1.9064765300059419, + "grad_norm": 0.21923794657902265, + "learning_rate": 2.1032823032042287e-06, + "loss": 0.8112, + "num_tokens": 67055474516.0, + "step": 16043 + }, + { + "epoch": 1.906595365418895, + "grad_norm": 0.2202264230788187, + "learning_rate": 2.103020832646786e-06, + "loss": 0.8193, + "num_tokens": 67059662065.0, + "step": 16044 + }, + { + "epoch": 1.9067142008318478, + "grad_norm": 0.2157014814128213, + "learning_rate": 2.102759691574565e-06, + "loss": 0.8036, + "num_tokens": 67063852501.0, + "step": 16045 + }, + { + "epoch": 1.906833036244801, + "grad_norm": 0.21076431983553884, + "learning_rate": 2.1024988799972335e-06, + "loss": 0.8212, + "num_tokens": 67068041774.0, + "step": 16046 + }, + { + "epoch": 1.906951871657754, + "grad_norm": 0.21977725183877017, + "learning_rate": 2.1022383979244516e-06, + "loss": 0.8423, + "num_tokens": 67072207171.0, + "step": 16047 + }, + { + "epoch": 1.907070707070707, + "grad_norm": 0.2168073317294761, + "learning_rate": 2.101978245365866e-06, + "loss": 0.8179, + "num_tokens": 67076369955.0, + "step": 16048 + }, + { + "epoch": 1.90718954248366, + "grad_norm": 0.22253221091437508, + "learning_rate": 2.1017184223311114e-06, + "loss": 0.8054, + "num_tokens": 67080560923.0, + "step": 16049 + }, + { + "epoch": 1.9073083778966131, + "grad_norm": 0.20830407928295377, + "learning_rate": 2.101458928829809e-06, + "loss": 0.7833, + "num_tokens": 67084723877.0, + "step": 16050 + }, + { + "epoch": 1.9074272133095662, + "grad_norm": 0.21661725644746582, + "learning_rate": 2.10119976487157e-06, + "loss": 0.7877, + "num_tokens": 67088912479.0, + "step": 16051 + }, + { + "epoch": 1.9075460487225193, + "grad_norm": 0.22663244456052842, + "learning_rate": 2.10094093046599e-06, + "loss": 0.8279, + "num_tokens": 67093100980.0, + "step": 16052 + }, + { + "epoch": 1.9076648841354724, + "grad_norm": 0.21030554301496715, + "learning_rate": 2.100682425622656e-06, + "loss": 0.8032, + "num_tokens": 67097291849.0, + "step": 16053 + }, + { + "epoch": 1.9077837195484255, + "grad_norm": 0.21707915179588202, + "learning_rate": 2.1004242503511417e-06, + "loss": 0.8122, + "num_tokens": 67101475140.0, + "step": 16054 + }, + { + "epoch": 1.9079025549613786, + "grad_norm": 0.2223852049746097, + "learning_rate": 2.1001664046610075e-06, + "loss": 0.7593, + "num_tokens": 67105665086.0, + "step": 16055 + }, + { + "epoch": 1.9080213903743317, + "grad_norm": 0.20418091966851898, + "learning_rate": 2.099908888561803e-06, + "loss": 0.7807, + "num_tokens": 67109853296.0, + "step": 16056 + }, + { + "epoch": 1.9081402257872846, + "grad_norm": 0.23563650987466225, + "learning_rate": 2.099651702063063e-06, + "loss": 0.8185, + "num_tokens": 67114041628.0, + "step": 16057 + }, + { + "epoch": 1.9082590612002377, + "grad_norm": 0.2178536592758944, + "learning_rate": 2.099394845174314e-06, + "loss": 0.7935, + "num_tokens": 67118204475.0, + "step": 16058 + }, + { + "epoch": 1.9083778966131906, + "grad_norm": 0.21179848659224104, + "learning_rate": 2.099138317905066e-06, + "loss": 0.8354, + "num_tokens": 67122393123.0, + "step": 16059 + }, + { + "epoch": 1.9084967320261437, + "grad_norm": 0.21695725446527908, + "learning_rate": 2.0988821202648226e-06, + "loss": 0.819, + "num_tokens": 67126581784.0, + "step": 16060 + }, + { + "epoch": 1.9086155674390968, + "grad_norm": 0.20761354913666738, + "learning_rate": 2.098626252263069e-06, + "loss": 0.8091, + "num_tokens": 67130757369.0, + "step": 16061 + }, + { + "epoch": 1.9087344028520499, + "grad_norm": 0.22324432434181732, + "learning_rate": 2.098370713909282e-06, + "loss": 0.7656, + "num_tokens": 67134947822.0, + "step": 16062 + }, + { + "epoch": 1.908853238265003, + "grad_norm": 0.21564696386762144, + "learning_rate": 2.098115505212924e-06, + "loss": 0.8211, + "num_tokens": 67139136688.0, + "step": 16063 + }, + { + "epoch": 1.908972073677956, + "grad_norm": 0.22450655283031457, + "learning_rate": 2.097860626183447e-06, + "loss": 0.8049, + "num_tokens": 67143325267.0, + "step": 16064 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.21487079767996944, + "learning_rate": 2.0976060768302893e-06, + "loss": 0.8737, + "num_tokens": 67147497575.0, + "step": 16065 + }, + { + "epoch": 1.9092097445038623, + "grad_norm": 0.2442328673928579, + "learning_rate": 2.097351857162879e-06, + "loss": 0.823, + "num_tokens": 67151688154.0, + "step": 16066 + }, + { + "epoch": 1.9093285799168154, + "grad_norm": 0.21477008795808047, + "learning_rate": 2.097097967190629e-06, + "loss": 0.7727, + "num_tokens": 67155848932.0, + "step": 16067 + }, + { + "epoch": 1.9094474153297682, + "grad_norm": 0.23496199733314366, + "learning_rate": 2.096844406922944e-06, + "loss": 0.8064, + "num_tokens": 67160037460.0, + "step": 16068 + }, + { + "epoch": 1.9095662507427213, + "grad_norm": 0.21879281463411654, + "learning_rate": 2.0965911763692125e-06, + "loss": 0.7994, + "num_tokens": 67164226680.0, + "step": 16069 + }, + { + "epoch": 1.9096850861556742, + "grad_norm": 0.21378014338042256, + "learning_rate": 2.096338275538812e-06, + "loss": 0.8136, + "num_tokens": 67168415986.0, + "step": 16070 + }, + { + "epoch": 1.9098039215686273, + "grad_norm": 0.2329021118659406, + "learning_rate": 2.0960857044411106e-06, + "loss": 0.7959, + "num_tokens": 67172604644.0, + "step": 16071 + }, + { + "epoch": 1.9099227569815804, + "grad_norm": 0.20752129656997936, + "learning_rate": 2.095833463085459e-06, + "loss": 0.7724, + "num_tokens": 67176795030.0, + "step": 16072 + }, + { + "epoch": 1.9100415923945335, + "grad_norm": 0.2108728296909378, + "learning_rate": 2.0955815514812015e-06, + "loss": 0.8047, + "num_tokens": 67180982591.0, + "step": 16073 + }, + { + "epoch": 1.9101604278074866, + "grad_norm": 0.20835006409563533, + "learning_rate": 2.095329969637665e-06, + "loss": 0.7893, + "num_tokens": 67185172476.0, + "step": 16074 + }, + { + "epoch": 1.9102792632204397, + "grad_norm": 0.22301799925128651, + "learning_rate": 2.095078717564166e-06, + "loss": 0.7885, + "num_tokens": 67189360920.0, + "step": 16075 + }, + { + "epoch": 1.9103980986333928, + "grad_norm": 0.22223357199804397, + "learning_rate": 2.094827795270011e-06, + "loss": 0.8136, + "num_tokens": 67193528868.0, + "step": 16076 + }, + { + "epoch": 1.910516934046346, + "grad_norm": 0.2158136728406519, + "learning_rate": 2.0945772027644933e-06, + "loss": 0.8067, + "num_tokens": 67197719503.0, + "step": 16077 + }, + { + "epoch": 1.910635769459299, + "grad_norm": 0.21496900736772875, + "learning_rate": 2.09432694005689e-06, + "loss": 0.797, + "num_tokens": 67201873758.0, + "step": 16078 + }, + { + "epoch": 1.910754604872252, + "grad_norm": 0.20483723655489705, + "learning_rate": 2.0940770071564712e-06, + "loss": 0.815, + "num_tokens": 67206056324.0, + "step": 16079 + }, + { + "epoch": 1.910873440285205, + "grad_norm": 0.2336926227357826, + "learning_rate": 2.0938274040724934e-06, + "loss": 0.7979, + "num_tokens": 67210244501.0, + "step": 16080 + }, + { + "epoch": 1.910992275698158, + "grad_norm": 0.225428972973788, + "learning_rate": 2.093578130814199e-06, + "loss": 0.8137, + "num_tokens": 67214434273.0, + "step": 16081 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 0.23909606860246607, + "learning_rate": 2.093329187390818e-06, + "loss": 0.7901, + "num_tokens": 67218621972.0, + "step": 16082 + }, + { + "epoch": 1.911229946524064, + "grad_norm": 0.23107754730203045, + "learning_rate": 2.093080573811574e-06, + "loss": 0.7742, + "num_tokens": 67222756991.0, + "step": 16083 + }, + { + "epoch": 1.9113487819370172, + "grad_norm": 0.2220255880108292, + "learning_rate": 2.0928322900856703e-06, + "loss": 0.8008, + "num_tokens": 67226920908.0, + "step": 16084 + }, + { + "epoch": 1.9114676173499703, + "grad_norm": 0.2251972428543778, + "learning_rate": 2.0925843362223013e-06, + "loss": 0.8204, + "num_tokens": 67231108860.0, + "step": 16085 + }, + { + "epoch": 1.9115864527629234, + "grad_norm": 0.21358552804066586, + "learning_rate": 2.092336712230653e-06, + "loss": 0.7857, + "num_tokens": 67235293990.0, + "step": 16086 + }, + { + "epoch": 1.9117052881758765, + "grad_norm": 0.2319921628104363, + "learning_rate": 2.092089418119893e-06, + "loss": 0.8146, + "num_tokens": 67239480223.0, + "step": 16087 + }, + { + "epoch": 1.9118241235888296, + "grad_norm": 0.2290321121527336, + "learning_rate": 2.0918424538991803e-06, + "loss": 0.8059, + "num_tokens": 67243670037.0, + "step": 16088 + }, + { + "epoch": 1.9119429590017827, + "grad_norm": 0.2200217642646916, + "learning_rate": 2.0915958195776614e-06, + "loss": 0.772, + "num_tokens": 67247858933.0, + "step": 16089 + }, + { + "epoch": 1.9120617944147356, + "grad_norm": 0.22897960054833172, + "learning_rate": 2.0913495151644687e-06, + "loss": 0.8049, + "num_tokens": 67252039758.0, + "step": 16090 + }, + { + "epoch": 1.9121806298276887, + "grad_norm": 0.2314643015167144, + "learning_rate": 2.0911035406687227e-06, + "loss": 0.7988, + "num_tokens": 67256219504.0, + "step": 16091 + }, + { + "epoch": 1.9122994652406418, + "grad_norm": 0.24124506084379418, + "learning_rate": 2.0908578960995364e-06, + "loss": 0.8099, + "num_tokens": 67260409062.0, + "step": 16092 + }, + { + "epoch": 1.9124183006535946, + "grad_norm": 0.2258988270329641, + "learning_rate": 2.0906125814660042e-06, + "loss": 0.8072, + "num_tokens": 67264600108.0, + "step": 16093 + }, + { + "epoch": 1.9125371360665477, + "grad_norm": 0.21818087266019684, + "learning_rate": 2.0903675967772117e-06, + "loss": 0.8516, + "num_tokens": 67268788614.0, + "step": 16094 + }, + { + "epoch": 1.9126559714795008, + "grad_norm": 0.2094305637276754, + "learning_rate": 2.090122942042231e-06, + "loss": 0.8004, + "num_tokens": 67272974356.0, + "step": 16095 + }, + { + "epoch": 1.912774806892454, + "grad_norm": 0.22541633600747168, + "learning_rate": 2.089878617270123e-06, + "loss": 0.7748, + "num_tokens": 67277164814.0, + "step": 16096 + }, + { + "epoch": 1.912893642305407, + "grad_norm": 0.22592526008322394, + "learning_rate": 2.089634622469935e-06, + "loss": 0.8208, + "num_tokens": 67281352463.0, + "step": 16097 + }, + { + "epoch": 1.9130124777183601, + "grad_norm": 0.21522880727708618, + "learning_rate": 2.0893909576507042e-06, + "loss": 0.8158, + "num_tokens": 67285521059.0, + "step": 16098 + }, + { + "epoch": 1.9131313131313132, + "grad_norm": 0.22526065521661112, + "learning_rate": 2.089147622821454e-06, + "loss": 0.7517, + "num_tokens": 67289709945.0, + "step": 16099 + }, + { + "epoch": 1.9132501485442663, + "grad_norm": 0.22800082522809248, + "learning_rate": 2.0889046179911947e-06, + "loss": 0.7644, + "num_tokens": 67293900234.0, + "step": 16100 + }, + { + "epoch": 1.9133689839572192, + "grad_norm": 0.2113047339295729, + "learning_rate": 2.0886619431689265e-06, + "loss": 0.8296, + "num_tokens": 67298088767.0, + "step": 16101 + }, + { + "epoch": 1.9134878193701723, + "grad_norm": 0.22290612510942165, + "learning_rate": 2.0884195983636367e-06, + "loss": 0.8124, + "num_tokens": 67302247640.0, + "step": 16102 + }, + { + "epoch": 1.9136066547831254, + "grad_norm": 0.21389740380482958, + "learning_rate": 2.088177583584301e-06, + "loss": 0.8178, + "num_tokens": 67306435703.0, + "step": 16103 + }, + { + "epoch": 1.9137254901960783, + "grad_norm": 0.21567029468697996, + "learning_rate": 2.0879358988398805e-06, + "loss": 0.778, + "num_tokens": 67310625122.0, + "step": 16104 + }, + { + "epoch": 1.9138443256090314, + "grad_norm": 0.22087201218928892, + "learning_rate": 2.087694544139325e-06, + "loss": 0.8045, + "num_tokens": 67314789582.0, + "step": 16105 + }, + { + "epoch": 1.9139631610219845, + "grad_norm": 0.23164183486919146, + "learning_rate": 2.0874535194915747e-06, + "loss": 0.7896, + "num_tokens": 67318971995.0, + "step": 16106 + }, + { + "epoch": 1.9140819964349376, + "grad_norm": 0.22683717373404183, + "learning_rate": 2.0872128249055545e-06, + "loss": 0.8084, + "num_tokens": 67323150356.0, + "step": 16107 + }, + { + "epoch": 1.9142008318478907, + "grad_norm": 0.21480954812287173, + "learning_rate": 2.0869724603901782e-06, + "loss": 0.8059, + "num_tokens": 67327338604.0, + "step": 16108 + }, + { + "epoch": 1.9143196672608438, + "grad_norm": 0.21703165855260975, + "learning_rate": 2.0867324259543474e-06, + "loss": 0.8519, + "num_tokens": 67331513920.0, + "step": 16109 + }, + { + "epoch": 1.914438502673797, + "grad_norm": 0.21707704125888316, + "learning_rate": 2.0864927216069512e-06, + "loss": 0.7944, + "num_tokens": 67335702466.0, + "step": 16110 + }, + { + "epoch": 1.91455733808675, + "grad_norm": 0.20765567502988885, + "learning_rate": 2.0862533473568665e-06, + "loss": 0.8013, + "num_tokens": 67339882583.0, + "step": 16111 + }, + { + "epoch": 1.9146761734997029, + "grad_norm": 0.22437883638947584, + "learning_rate": 2.086014303212958e-06, + "loss": 0.8487, + "num_tokens": 67344047186.0, + "step": 16112 + }, + { + "epoch": 1.914795008912656, + "grad_norm": 0.21617619511336886, + "learning_rate": 2.0857755891840804e-06, + "loss": 0.8023, + "num_tokens": 67348183801.0, + "step": 16113 + }, + { + "epoch": 1.914913844325609, + "grad_norm": 0.21990016901027606, + "learning_rate": 2.0855372052790715e-06, + "loss": 0.7922, + "num_tokens": 67352372827.0, + "step": 16114 + }, + { + "epoch": 1.915032679738562, + "grad_norm": 0.23156101290079575, + "learning_rate": 2.085299151506761e-06, + "loss": 0.7794, + "num_tokens": 67356561480.0, + "step": 16115 + }, + { + "epoch": 1.915151515151515, + "grad_norm": 0.21822118079273056, + "learning_rate": 2.085061427875962e-06, + "loss": 0.8234, + "num_tokens": 67360750659.0, + "step": 16116 + }, + { + "epoch": 1.9152703505644681, + "grad_norm": 0.21531756035978342, + "learning_rate": 2.084824034395483e-06, + "loss": 0.8236, + "num_tokens": 67364941221.0, + "step": 16117 + }, + { + "epoch": 1.9153891859774212, + "grad_norm": 0.21847063256185784, + "learning_rate": 2.0845869710741118e-06, + "loss": 0.806, + "num_tokens": 67369130270.0, + "step": 16118 + }, + { + "epoch": 1.9155080213903743, + "grad_norm": 0.22632504259208716, + "learning_rate": 2.0843502379206296e-06, + "loss": 0.7953, + "num_tokens": 67373287229.0, + "step": 16119 + }, + { + "epoch": 1.9156268568033274, + "grad_norm": 0.21782059039772647, + "learning_rate": 2.0841138349438023e-06, + "loss": 0.8291, + "num_tokens": 67377449191.0, + "step": 16120 + }, + { + "epoch": 1.9157456922162805, + "grad_norm": 0.21233559602366267, + "learning_rate": 2.083877762152385e-06, + "loss": 0.763, + "num_tokens": 67381639894.0, + "step": 16121 + }, + { + "epoch": 1.9158645276292336, + "grad_norm": 0.21868443917087194, + "learning_rate": 2.0836420195551186e-06, + "loss": 0.8302, + "num_tokens": 67385829825.0, + "step": 16122 + }, + { + "epoch": 1.9159833630421865, + "grad_norm": 0.21845219948746797, + "learning_rate": 2.0834066071607374e-06, + "loss": 0.8513, + "num_tokens": 67390019919.0, + "step": 16123 + }, + { + "epoch": 1.9161021984551396, + "grad_norm": 0.21788114723179305, + "learning_rate": 2.0831715249779566e-06, + "loss": 0.7857, + "num_tokens": 67394198818.0, + "step": 16124 + }, + { + "epoch": 1.9162210338680927, + "grad_norm": 0.22532369711464867, + "learning_rate": 2.082936773015483e-06, + "loss": 0.8423, + "num_tokens": 67398388723.0, + "step": 16125 + }, + { + "epoch": 1.9163398692810456, + "grad_norm": 0.2259341556450288, + "learning_rate": 2.0827023512820096e-06, + "loss": 0.7802, + "num_tokens": 67402577657.0, + "step": 16126 + }, + { + "epoch": 1.9164587046939987, + "grad_norm": 0.21515323680822523, + "learning_rate": 2.082468259786218e-06, + "loss": 0.8375, + "num_tokens": 67406754505.0, + "step": 16127 + }, + { + "epoch": 1.9165775401069518, + "grad_norm": 0.22454229587473562, + "learning_rate": 2.0822344985367776e-06, + "loss": 0.8003, + "num_tokens": 67410945195.0, + "step": 16128 + }, + { + "epoch": 1.916696375519905, + "grad_norm": 0.22682737469450398, + "learning_rate": 2.0820010675423453e-06, + "loss": 0.7965, + "num_tokens": 67415136032.0, + "step": 16129 + }, + { + "epoch": 1.916815210932858, + "grad_norm": 0.21980705002133918, + "learning_rate": 2.0817679668115666e-06, + "loss": 0.7715, + "num_tokens": 67419324897.0, + "step": 16130 + }, + { + "epoch": 1.916934046345811, + "grad_norm": 0.22170559418528044, + "learning_rate": 2.0815351963530735e-06, + "loss": 0.8214, + "num_tokens": 67423513618.0, + "step": 16131 + }, + { + "epoch": 1.9170528817587642, + "grad_norm": 0.2195759829723847, + "learning_rate": 2.081302756175485e-06, + "loss": 0.7915, + "num_tokens": 67427670466.0, + "step": 16132 + }, + { + "epoch": 1.9171717171717173, + "grad_norm": 0.21558497406928312, + "learning_rate": 2.0810706462874113e-06, + "loss": 0.819, + "num_tokens": 67431841620.0, + "step": 16133 + }, + { + "epoch": 1.9172905525846702, + "grad_norm": 0.20621504328927454, + "learning_rate": 2.0808388666974467e-06, + "loss": 0.773, + "num_tokens": 67436030060.0, + "step": 16134 + }, + { + "epoch": 1.9174093879976233, + "grad_norm": 0.21265252631272352, + "learning_rate": 2.0806074174141755e-06, + "loss": 0.8005, + "num_tokens": 67440220418.0, + "step": 16135 + }, + { + "epoch": 1.9175282234105764, + "grad_norm": 0.21572794687438268, + "learning_rate": 2.080376298446169e-06, + "loss": 0.8062, + "num_tokens": 67444409231.0, + "step": 16136 + }, + { + "epoch": 1.9176470588235293, + "grad_norm": 0.21638544268307414, + "learning_rate": 2.0801455098019856e-06, + "loss": 0.7999, + "num_tokens": 67448575573.0, + "step": 16137 + }, + { + "epoch": 1.9177658942364824, + "grad_norm": 0.21858098817821517, + "learning_rate": 2.0799150514901736e-06, + "loss": 0.8181, + "num_tokens": 67452753810.0, + "step": 16138 + }, + { + "epoch": 1.9178847296494355, + "grad_norm": 0.21089617602341532, + "learning_rate": 2.0796849235192657e-06, + "loss": 0.7999, + "num_tokens": 67456916249.0, + "step": 16139 + }, + { + "epoch": 1.9180035650623886, + "grad_norm": 0.2125280019909066, + "learning_rate": 2.0794551258977862e-06, + "loss": 0.7945, + "num_tokens": 67461091856.0, + "step": 16140 + }, + { + "epoch": 1.9181224004753417, + "grad_norm": 0.21950257416417338, + "learning_rate": 2.079225658634244e-06, + "loss": 0.8055, + "num_tokens": 67465281151.0, + "step": 16141 + }, + { + "epoch": 1.9182412358882948, + "grad_norm": 0.21702669828826573, + "learning_rate": 2.0789965217371367e-06, + "loss": 0.8099, + "num_tokens": 67469470434.0, + "step": 16142 + }, + { + "epoch": 1.9183600713012479, + "grad_norm": 0.22043640150415314, + "learning_rate": 2.0787677152149518e-06, + "loss": 0.8362, + "num_tokens": 67473659465.0, + "step": 16143 + }, + { + "epoch": 1.918478906714201, + "grad_norm": 0.21041479843884414, + "learning_rate": 2.078539239076162e-06, + "loss": 0.7801, + "num_tokens": 67477825758.0, + "step": 16144 + }, + { + "epoch": 1.9185977421271538, + "grad_norm": 0.21817188086598596, + "learning_rate": 2.078311093329229e-06, + "loss": 0.769, + "num_tokens": 67482015966.0, + "step": 16145 + }, + { + "epoch": 1.918716577540107, + "grad_norm": 0.2216676374261654, + "learning_rate": 2.0780832779825996e-06, + "loss": 0.7922, + "num_tokens": 67486205131.0, + "step": 16146 + }, + { + "epoch": 1.91883541295306, + "grad_norm": 0.22922616425459147, + "learning_rate": 2.077855793044713e-06, + "loss": 0.7738, + "num_tokens": 67490393339.0, + "step": 16147 + }, + { + "epoch": 1.918954248366013, + "grad_norm": 0.2253612097241727, + "learning_rate": 2.0776286385239925e-06, + "loss": 0.8229, + "num_tokens": 67494582132.0, + "step": 16148 + }, + { + "epoch": 1.919073083778966, + "grad_norm": 0.2296326405538575, + "learning_rate": 2.07740181442885e-06, + "loss": 0.8277, + "num_tokens": 67498740972.0, + "step": 16149 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 0.2375818853258047, + "learning_rate": 2.0771753207676876e-06, + "loss": 0.8571, + "num_tokens": 67502929781.0, + "step": 16150 + }, + { + "epoch": 1.9193107546048722, + "grad_norm": 0.22469456696072687, + "learning_rate": 2.076949157548892e-06, + "loss": 0.8147, + "num_tokens": 67507116591.0, + "step": 16151 + }, + { + "epoch": 1.9194295900178253, + "grad_norm": 0.23445202602014448, + "learning_rate": 2.076723324780838e-06, + "loss": 0.79, + "num_tokens": 67511306891.0, + "step": 16152 + }, + { + "epoch": 1.9195484254307784, + "grad_norm": 0.24784139058585725, + "learning_rate": 2.07649782247189e-06, + "loss": 0.8116, + "num_tokens": 67515471517.0, + "step": 16153 + }, + { + "epoch": 1.9196672608437315, + "grad_norm": 0.21313339772160872, + "learning_rate": 2.0762726506303975e-06, + "loss": 0.8267, + "num_tokens": 67519659045.0, + "step": 16154 + }, + { + "epoch": 1.9197860962566846, + "grad_norm": 0.2228603554361658, + "learning_rate": 2.076047809264702e-06, + "loss": 0.8019, + "num_tokens": 67523826879.0, + "step": 16155 + }, + { + "epoch": 1.9199049316696377, + "grad_norm": 0.2520212496170649, + "learning_rate": 2.075823298383128e-06, + "loss": 0.8144, + "num_tokens": 67528016089.0, + "step": 16156 + }, + { + "epoch": 1.9200237670825906, + "grad_norm": 0.2142977487429079, + "learning_rate": 2.0755991179939916e-06, + "loss": 0.797, + "num_tokens": 67532176457.0, + "step": 16157 + }, + { + "epoch": 1.9201426024955437, + "grad_norm": 0.2247823037777697, + "learning_rate": 2.0753752681055934e-06, + "loss": 0.7957, + "num_tokens": 67536366153.0, + "step": 16158 + }, + { + "epoch": 1.9202614379084966, + "grad_norm": 0.21807838058261636, + "learning_rate": 2.075151748726223e-06, + "loss": 0.815, + "num_tokens": 67540556233.0, + "step": 16159 + }, + { + "epoch": 1.9203802733214497, + "grad_norm": 0.20199524204537647, + "learning_rate": 2.07492855986416e-06, + "loss": 0.8013, + "num_tokens": 67544739766.0, + "step": 16160 + }, + { + "epoch": 1.9204991087344028, + "grad_norm": 0.22287679199967747, + "learning_rate": 2.074705701527668e-06, + "loss": 0.7664, + "num_tokens": 67548929629.0, + "step": 16161 + }, + { + "epoch": 1.9206179441473559, + "grad_norm": 0.22322776619437207, + "learning_rate": 2.0744831737250022e-06, + "loss": 0.7746, + "num_tokens": 67553119042.0, + "step": 16162 + }, + { + "epoch": 1.920736779560309, + "grad_norm": 0.20505323323310948, + "learning_rate": 2.0742609764644016e-06, + "loss": 0.7778, + "num_tokens": 67557309073.0, + "step": 16163 + }, + { + "epoch": 1.920855614973262, + "grad_norm": 0.22081623837387787, + "learning_rate": 2.074039109754096e-06, + "loss": 0.8149, + "num_tokens": 67561497647.0, + "step": 16164 + }, + { + "epoch": 1.9209744503862152, + "grad_norm": 0.23462591675344607, + "learning_rate": 2.0738175736023006e-06, + "loss": 0.7973, + "num_tokens": 67565686571.0, + "step": 16165 + }, + { + "epoch": 1.9210932857991683, + "grad_norm": 0.21230820200999717, + "learning_rate": 2.073596368017222e-06, + "loss": 0.7962, + "num_tokens": 67569843160.0, + "step": 16166 + }, + { + "epoch": 1.9212121212121214, + "grad_norm": 0.2196158217448848, + "learning_rate": 2.0733754930070497e-06, + "loss": 0.8294, + "num_tokens": 67574033006.0, + "step": 16167 + }, + { + "epoch": 1.9213309566250742, + "grad_norm": 0.22502275298693544, + "learning_rate": 2.0731549485799647e-06, + "loss": 0.8401, + "num_tokens": 67578222799.0, + "step": 16168 + }, + { + "epoch": 1.9214497920380273, + "grad_norm": 0.22166312965403626, + "learning_rate": 2.072934734744134e-06, + "loss": 0.8199, + "num_tokens": 67582411860.0, + "step": 16169 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 0.2168768268226899, + "learning_rate": 2.0727148515077144e-06, + "loss": 0.7842, + "num_tokens": 67586575028.0, + "step": 16170 + }, + { + "epoch": 1.9216874628639333, + "grad_norm": 0.21452095545192845, + "learning_rate": 2.0724952988788472e-06, + "loss": 0.7677, + "num_tokens": 67590742782.0, + "step": 16171 + }, + { + "epoch": 1.9218062982768864, + "grad_norm": 0.22808758448912106, + "learning_rate": 2.072276076865664e-06, + "loss": 0.7939, + "num_tokens": 67594932003.0, + "step": 16172 + }, + { + "epoch": 1.9219251336898395, + "grad_norm": 0.21082253096569697, + "learning_rate": 2.0720571854762816e-06, + "loss": 0.8084, + "num_tokens": 67599121185.0, + "step": 16173 + }, + { + "epoch": 1.9220439691027926, + "grad_norm": 0.22167068279305488, + "learning_rate": 2.0718386247188092e-06, + "loss": 0.8125, + "num_tokens": 67603304020.0, + "step": 16174 + }, + { + "epoch": 1.9221628045157457, + "grad_norm": 0.22592688758880267, + "learning_rate": 2.0716203946013386e-06, + "loss": 0.7865, + "num_tokens": 67607472155.0, + "step": 16175 + }, + { + "epoch": 1.9222816399286988, + "grad_norm": 0.2043589684921129, + "learning_rate": 2.0714024951319534e-06, + "loss": 0.8431, + "num_tokens": 67611662080.0, + "step": 16176 + }, + { + "epoch": 1.922400475341652, + "grad_norm": 0.23307571011506292, + "learning_rate": 2.071184926318721e-06, + "loss": 0.7941, + "num_tokens": 67615800824.0, + "step": 16177 + }, + { + "epoch": 1.922519310754605, + "grad_norm": 0.23593522470935563, + "learning_rate": 2.0709676881697004e-06, + "loss": 0.8227, + "num_tokens": 67619990251.0, + "step": 16178 + }, + { + "epoch": 1.922638146167558, + "grad_norm": 0.22253845270646447, + "learning_rate": 2.0707507806929355e-06, + "loss": 0.8317, + "num_tokens": 67624179197.0, + "step": 16179 + }, + { + "epoch": 1.922756981580511, + "grad_norm": 0.23749107965118027, + "learning_rate": 2.0705342038964594e-06, + "loss": 0.8313, + "num_tokens": 67628368412.0, + "step": 16180 + }, + { + "epoch": 1.922875816993464, + "grad_norm": 0.2209570830857045, + "learning_rate": 2.0703179577882947e-06, + "loss": 0.7972, + "num_tokens": 67632557797.0, + "step": 16181 + }, + { + "epoch": 1.922994652406417, + "grad_norm": 0.2278373503805289, + "learning_rate": 2.070102042376446e-06, + "loss": 0.7809, + "num_tokens": 67636746050.0, + "step": 16182 + }, + { + "epoch": 1.92311348781937, + "grad_norm": 0.22217147419135402, + "learning_rate": 2.069886457668913e-06, + "loss": 0.7971, + "num_tokens": 67640934842.0, + "step": 16183 + }, + { + "epoch": 1.9232323232323232, + "grad_norm": 0.23126573797225447, + "learning_rate": 2.0696712036736767e-06, + "loss": 0.8027, + "num_tokens": 67645097934.0, + "step": 16184 + }, + { + "epoch": 1.9233511586452763, + "grad_norm": 0.21467104681467913, + "learning_rate": 2.0694562803987108e-06, + "loss": 0.8011, + "num_tokens": 67649286341.0, + "step": 16185 + }, + { + "epoch": 1.9234699940582294, + "grad_norm": 0.22625799178395342, + "learning_rate": 2.0692416878519726e-06, + "loss": 0.7872, + "num_tokens": 67653476428.0, + "step": 16186 + }, + { + "epoch": 1.9235888294711825, + "grad_norm": 0.2159626547867117, + "learning_rate": 2.0690274260414108e-06, + "loss": 0.824, + "num_tokens": 67657646490.0, + "step": 16187 + }, + { + "epoch": 1.9237076648841356, + "grad_norm": 0.22312509535747893, + "learning_rate": 2.06881349497496e-06, + "loss": 0.8223, + "num_tokens": 67661836771.0, + "step": 16188 + }, + { + "epoch": 1.9238265002970887, + "grad_norm": 0.21913718203656707, + "learning_rate": 2.0685998946605428e-06, + "loss": 0.7822, + "num_tokens": 67665992539.0, + "step": 16189 + }, + { + "epoch": 1.9239453357100416, + "grad_norm": 0.21673068101972479, + "learning_rate": 2.0683866251060685e-06, + "loss": 0.7918, + "num_tokens": 67670162645.0, + "step": 16190 + }, + { + "epoch": 1.9240641711229947, + "grad_norm": 0.20468452006437182, + "learning_rate": 2.068173686319437e-06, + "loss": 0.8084, + "num_tokens": 67674351131.0, + "step": 16191 + }, + { + "epoch": 1.9241830065359478, + "grad_norm": 0.2134545814277279, + "learning_rate": 2.0679610783085326e-06, + "loss": 0.7994, + "num_tokens": 67678507024.0, + "step": 16192 + }, + { + "epoch": 1.9243018419489006, + "grad_norm": 0.21463709994602795, + "learning_rate": 2.067748801081229e-06, + "loss": 0.803, + "num_tokens": 67682665110.0, + "step": 16193 + }, + { + "epoch": 1.9244206773618537, + "grad_norm": 0.21330798642923443, + "learning_rate": 2.0675368546453885e-06, + "loss": 0.8231, + "num_tokens": 67686822855.0, + "step": 16194 + }, + { + "epoch": 1.9245395127748068, + "grad_norm": 0.2110541497826271, + "learning_rate": 2.0673252390088594e-06, + "loss": 0.7877, + "num_tokens": 67691012717.0, + "step": 16195 + }, + { + "epoch": 1.92465834818776, + "grad_norm": 0.2087997160707222, + "learning_rate": 2.0671139541794783e-06, + "loss": 0.7797, + "num_tokens": 67695201897.0, + "step": 16196 + }, + { + "epoch": 1.924777183600713, + "grad_norm": 0.24206178416750979, + "learning_rate": 2.0669030001650714e-06, + "loss": 0.8076, + "num_tokens": 67699389607.0, + "step": 16197 + }, + { + "epoch": 1.9248960190136661, + "grad_norm": 0.23140695705636866, + "learning_rate": 2.0666923769734496e-06, + "loss": 0.8284, + "num_tokens": 67703579624.0, + "step": 16198 + }, + { + "epoch": 1.9250148544266192, + "grad_norm": 0.22696282341895027, + "learning_rate": 2.066482084612412e-06, + "loss": 0.8442, + "num_tokens": 67707767321.0, + "step": 16199 + }, + { + "epoch": 1.9251336898395723, + "grad_norm": 0.22128545164438926, + "learning_rate": 2.066272123089749e-06, + "loss": 0.8001, + "num_tokens": 67711955270.0, + "step": 16200 + }, + { + "epoch": 1.9252525252525252, + "grad_norm": 0.22866426781196147, + "learning_rate": 2.0660624924132342e-06, + "loss": 0.7817, + "num_tokens": 67716144535.0, + "step": 16201 + }, + { + "epoch": 1.9253713606654783, + "grad_norm": 0.22328099161852058, + "learning_rate": 2.0658531925906327e-06, + "loss": 0.7673, + "num_tokens": 67720320685.0, + "step": 16202 + }, + { + "epoch": 1.9254901960784314, + "grad_norm": 0.20887732966841732, + "learning_rate": 2.0656442236296946e-06, + "loss": 0.8186, + "num_tokens": 67724510720.0, + "step": 16203 + }, + { + "epoch": 1.9256090314913843, + "grad_norm": 0.21159440323805406, + "learning_rate": 2.065435585538158e-06, + "loss": 0.8435, + "num_tokens": 67728700663.0, + "step": 16204 + }, + { + "epoch": 1.9257278669043374, + "grad_norm": 0.22186126659635122, + "learning_rate": 2.0652272783237493e-06, + "loss": 0.7899, + "num_tokens": 67732890646.0, + "step": 16205 + }, + { + "epoch": 1.9258467023172905, + "grad_norm": 0.21301207972096028, + "learning_rate": 2.0650193019941843e-06, + "loss": 0.8234, + "num_tokens": 67737057311.0, + "step": 16206 + }, + { + "epoch": 1.9259655377302436, + "grad_norm": 0.20937854708687617, + "learning_rate": 2.0648116565571656e-06, + "loss": 0.8136, + "num_tokens": 67741246261.0, + "step": 16207 + }, + { + "epoch": 1.9260843731431967, + "grad_norm": 0.21696396645165855, + "learning_rate": 2.064604342020381e-06, + "loss": 0.7666, + "num_tokens": 67745424612.0, + "step": 16208 + }, + { + "epoch": 1.9262032085561498, + "grad_norm": 0.2186163809861782, + "learning_rate": 2.064397358391509e-06, + "loss": 0.8221, + "num_tokens": 67749611242.0, + "step": 16209 + }, + { + "epoch": 1.926322043969103, + "grad_norm": 0.21780827602277564, + "learning_rate": 2.064190705678215e-06, + "loss": 0.8355, + "num_tokens": 67753800028.0, + "step": 16210 + }, + { + "epoch": 1.926440879382056, + "grad_norm": 0.21580745453622946, + "learning_rate": 2.0639843838881503e-06, + "loss": 0.7693, + "num_tokens": 67757991443.0, + "step": 16211 + }, + { + "epoch": 1.9265597147950089, + "grad_norm": 0.21914909237468266, + "learning_rate": 2.063778393028959e-06, + "loss": 0.7739, + "num_tokens": 67762179652.0, + "step": 16212 + }, + { + "epoch": 1.926678550207962, + "grad_norm": 0.223785654919924, + "learning_rate": 2.063572733108268e-06, + "loss": 0.7979, + "num_tokens": 67766368830.0, + "step": 16213 + }, + { + "epoch": 1.926797385620915, + "grad_norm": 0.2170918013395361, + "learning_rate": 2.063367404133693e-06, + "loss": 0.8218, + "num_tokens": 67770547754.0, + "step": 16214 + }, + { + "epoch": 1.926916221033868, + "grad_norm": 0.2283083293380313, + "learning_rate": 2.0631624061128373e-06, + "loss": 0.8288, + "num_tokens": 67774735732.0, + "step": 16215 + }, + { + "epoch": 1.927035056446821, + "grad_norm": 0.2128500517845567, + "learning_rate": 2.062957739053295e-06, + "loss": 0.8074, + "num_tokens": 67778922853.0, + "step": 16216 + }, + { + "epoch": 1.9271538918597741, + "grad_norm": 0.2135294326265269, + "learning_rate": 2.0627534029626446e-06, + "loss": 0.7718, + "num_tokens": 67783113876.0, + "step": 16217 + }, + { + "epoch": 1.9272727272727272, + "grad_norm": 0.22966003298594667, + "learning_rate": 2.0625493978484527e-06, + "loss": 0.8084, + "num_tokens": 67787283809.0, + "step": 16218 + }, + { + "epoch": 1.9273915626856803, + "grad_norm": 0.21821070152254385, + "learning_rate": 2.0623457237182756e-06, + "loss": 0.7767, + "num_tokens": 67791456267.0, + "step": 16219 + }, + { + "epoch": 1.9275103980986334, + "grad_norm": 0.21699407145051727, + "learning_rate": 2.062142380579655e-06, + "loss": 0.7935, + "num_tokens": 67795646290.0, + "step": 16220 + }, + { + "epoch": 1.9276292335115865, + "grad_norm": 0.22706351817285683, + "learning_rate": 2.061939368440121e-06, + "loss": 0.7949, + "num_tokens": 67799836380.0, + "step": 16221 + }, + { + "epoch": 1.9277480689245396, + "grad_norm": 0.225970772114222, + "learning_rate": 2.0617366873071933e-06, + "loss": 0.7802, + "num_tokens": 67803957538.0, + "step": 16222 + }, + { + "epoch": 1.9278669043374925, + "grad_norm": 0.21425542424299254, + "learning_rate": 2.061534337188377e-06, + "loss": 0.7854, + "num_tokens": 67808139007.0, + "step": 16223 + }, + { + "epoch": 1.9279857397504456, + "grad_norm": 0.24942197209069167, + "learning_rate": 2.0613323180911655e-06, + "loss": 0.8037, + "num_tokens": 67812325994.0, + "step": 16224 + }, + { + "epoch": 1.9281045751633987, + "grad_norm": 0.2199152309592917, + "learning_rate": 2.0611306300230406e-06, + "loss": 0.7933, + "num_tokens": 67816512303.0, + "step": 16225 + }, + { + "epoch": 1.9282234105763516, + "grad_norm": 0.23478797575696475, + "learning_rate": 2.0609292729914715e-06, + "loss": 0.7986, + "num_tokens": 67820701910.0, + "step": 16226 + }, + { + "epoch": 1.9283422459893047, + "grad_norm": 0.21373375178334086, + "learning_rate": 2.060728247003915e-06, + "loss": 0.8074, + "num_tokens": 67824826986.0, + "step": 16227 + }, + { + "epoch": 1.9284610814022578, + "grad_norm": 0.21528302777737454, + "learning_rate": 2.060527552067816e-06, + "loss": 0.8592, + "num_tokens": 67829014814.0, + "step": 16228 + }, + { + "epoch": 1.928579916815211, + "grad_norm": 0.224694778645608, + "learning_rate": 2.0603271881906078e-06, + "loss": 0.8034, + "num_tokens": 67833203509.0, + "step": 16229 + }, + { + "epoch": 1.928698752228164, + "grad_norm": 0.20780291504907913, + "learning_rate": 2.060127155379709e-06, + "loss": 0.8419, + "num_tokens": 67837392709.0, + "step": 16230 + }, + { + "epoch": 1.928817587641117, + "grad_norm": 0.21584658478179147, + "learning_rate": 2.0599274536425273e-06, + "loss": 0.8165, + "num_tokens": 67841580408.0, + "step": 16231 + }, + { + "epoch": 1.9289364230540702, + "grad_norm": 0.2151638243735762, + "learning_rate": 2.059728082986459e-06, + "loss": 0.7212, + "num_tokens": 67845769710.0, + "step": 16232 + }, + { + "epoch": 1.9290552584670233, + "grad_norm": 0.21482706254028858, + "learning_rate": 2.0595290434188882e-06, + "loss": 0.7698, + "num_tokens": 67849928405.0, + "step": 16233 + }, + { + "epoch": 1.9291740938799762, + "grad_norm": 0.23410042708500894, + "learning_rate": 2.059330334947186e-06, + "loss": 0.7799, + "num_tokens": 67854118163.0, + "step": 16234 + }, + { + "epoch": 1.9292929292929293, + "grad_norm": 0.2211086496641111, + "learning_rate": 2.05913195757871e-06, + "loss": 0.8096, + "num_tokens": 67858307360.0, + "step": 16235 + }, + { + "epoch": 1.9294117647058824, + "grad_norm": 0.21317757140683583, + "learning_rate": 2.058933911320808e-06, + "loss": 0.8122, + "num_tokens": 67862497574.0, + "step": 16236 + }, + { + "epoch": 1.9295306001188353, + "grad_norm": 0.21524265498418066, + "learning_rate": 2.0587361961808118e-06, + "loss": 0.8098, + "num_tokens": 67866686793.0, + "step": 16237 + }, + { + "epoch": 1.9296494355317884, + "grad_norm": 0.21549235217870052, + "learning_rate": 2.058538812166047e-06, + "loss": 0.819, + "num_tokens": 67870853095.0, + "step": 16238 + }, + { + "epoch": 1.9297682709447415, + "grad_norm": 0.21748984041693742, + "learning_rate": 2.0583417592838215e-06, + "loss": 0.7914, + "num_tokens": 67875041757.0, + "step": 16239 + }, + { + "epoch": 1.9298871063576946, + "grad_norm": 0.2130742417327027, + "learning_rate": 2.0581450375414337e-06, + "loss": 0.8337, + "num_tokens": 67879229265.0, + "step": 16240 + }, + { + "epoch": 1.9300059417706477, + "grad_norm": 0.215041031117656, + "learning_rate": 2.0579486469461674e-06, + "loss": 0.791, + "num_tokens": 67883407507.0, + "step": 16241 + }, + { + "epoch": 1.9301247771836008, + "grad_norm": 0.21207358351361827, + "learning_rate": 2.057752587505297e-06, + "loss": 0.8305, + "num_tokens": 67887575257.0, + "step": 16242 + }, + { + "epoch": 1.9302436125965539, + "grad_norm": 0.20968378954860142, + "learning_rate": 2.057556859226082e-06, + "loss": 0.8007, + "num_tokens": 67891743683.0, + "step": 16243 + }, + { + "epoch": 1.930362448009507, + "grad_norm": 0.20529445066276314, + "learning_rate": 2.0573614621157726e-06, + "loss": 0.8253, + "num_tokens": 67895933482.0, + "step": 16244 + }, + { + "epoch": 1.93048128342246, + "grad_norm": 0.22339605282843708, + "learning_rate": 2.0571663961816035e-06, + "loss": 0.7992, + "num_tokens": 67900118564.0, + "step": 16245 + }, + { + "epoch": 1.930600118835413, + "grad_norm": 0.20913210830702383, + "learning_rate": 2.056971661430799e-06, + "loss": 0.8068, + "num_tokens": 67904309617.0, + "step": 16246 + }, + { + "epoch": 1.930718954248366, + "grad_norm": 0.21093777941164474, + "learning_rate": 2.056777257870571e-06, + "loss": 0.7748, + "num_tokens": 67908497989.0, + "step": 16247 + }, + { + "epoch": 1.930837789661319, + "grad_norm": 0.22410330077869792, + "learning_rate": 2.0565831855081196e-06, + "loss": 0.8246, + "num_tokens": 67912687545.0, + "step": 16248 + }, + { + "epoch": 1.930956625074272, + "grad_norm": 0.22993338449619327, + "learning_rate": 2.0563894443506303e-06, + "loss": 0.8349, + "num_tokens": 67916875475.0, + "step": 16249 + }, + { + "epoch": 1.9310754604872251, + "grad_norm": 0.2227626667682998, + "learning_rate": 2.056196034405279e-06, + "loss": 0.8157, + "num_tokens": 67921041984.0, + "step": 16250 + }, + { + "epoch": 1.9311942959001782, + "grad_norm": 0.21188355419118537, + "learning_rate": 2.0560029556792286e-06, + "loss": 0.8096, + "num_tokens": 67925230343.0, + "step": 16251 + }, + { + "epoch": 1.9313131313131313, + "grad_norm": 0.2250158295135832, + "learning_rate": 2.0558102081796295e-06, + "loss": 0.8065, + "num_tokens": 67929386381.0, + "step": 16252 + }, + { + "epoch": 1.9314319667260844, + "grad_norm": 0.21297282896118486, + "learning_rate": 2.055617791913619e-06, + "loss": 0.7878, + "num_tokens": 67933576079.0, + "step": 16253 + }, + { + "epoch": 1.9315508021390375, + "grad_norm": 0.2116240049605663, + "learning_rate": 2.0554257068883234e-06, + "loss": 0.7997, + "num_tokens": 67937747645.0, + "step": 16254 + }, + { + "epoch": 1.9316696375519906, + "grad_norm": 0.2199721414666088, + "learning_rate": 2.055233953110857e-06, + "loss": 0.7966, + "num_tokens": 67941935827.0, + "step": 16255 + }, + { + "epoch": 1.9317884729649437, + "grad_norm": 0.2170759374667172, + "learning_rate": 2.055042530588319e-06, + "loss": 0.824, + "num_tokens": 67946124521.0, + "step": 16256 + }, + { + "epoch": 1.9319073083778966, + "grad_norm": 0.21984499384628123, + "learning_rate": 2.0548514393277998e-06, + "loss": 0.7716, + "num_tokens": 67950310418.0, + "step": 16257 + }, + { + "epoch": 1.9320261437908497, + "grad_norm": 0.21604626030476812, + "learning_rate": 2.054660679336377e-06, + "loss": 0.7686, + "num_tokens": 67954498633.0, + "step": 16258 + }, + { + "epoch": 1.9321449792038026, + "grad_norm": 0.21159599807705534, + "learning_rate": 2.0544702506211134e-06, + "loss": 0.8036, + "num_tokens": 67958655942.0, + "step": 16259 + }, + { + "epoch": 1.9322638146167557, + "grad_norm": 0.2117589821407976, + "learning_rate": 2.0542801531890627e-06, + "loss": 0.8205, + "num_tokens": 67962845979.0, + "step": 16260 + }, + { + "epoch": 1.9323826500297088, + "grad_norm": 0.22257767680485685, + "learning_rate": 2.0540903870472644e-06, + "loss": 0.795, + "num_tokens": 67967013693.0, + "step": 16261 + }, + { + "epoch": 1.9325014854426619, + "grad_norm": 0.21242352681006074, + "learning_rate": 2.053900952202744e-06, + "loss": 0.7922, + "num_tokens": 67971202535.0, + "step": 16262 + }, + { + "epoch": 1.932620320855615, + "grad_norm": 0.20909403223012632, + "learning_rate": 2.0537118486625205e-06, + "loss": 0.7823, + "num_tokens": 67975389860.0, + "step": 16263 + }, + { + "epoch": 1.932739156268568, + "grad_norm": 0.21453859256502178, + "learning_rate": 2.053523076433594e-06, + "loss": 0.8275, + "num_tokens": 67979549113.0, + "step": 16264 + }, + { + "epoch": 1.9328579916815212, + "grad_norm": 0.22365052852464617, + "learning_rate": 2.0533346355229584e-06, + "loss": 0.8132, + "num_tokens": 67983737264.0, + "step": 16265 + }, + { + "epoch": 1.9329768270944743, + "grad_norm": 0.2160138766577029, + "learning_rate": 2.05314652593759e-06, + "loss": 0.8008, + "num_tokens": 67987894465.0, + "step": 16266 + }, + { + "epoch": 1.9330956625074274, + "grad_norm": 0.2092928307555119, + "learning_rate": 2.052958747684455e-06, + "loss": 0.8201, + "num_tokens": 67992084351.0, + "step": 16267 + }, + { + "epoch": 1.9332144979203802, + "grad_norm": 0.2185958691656455, + "learning_rate": 2.0527713007705096e-06, + "loss": 0.7829, + "num_tokens": 67996274341.0, + "step": 16268 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.20666722221286676, + "learning_rate": 2.052584185202692e-06, + "loss": 0.8099, + "num_tokens": 68000462512.0, + "step": 16269 + }, + { + "epoch": 1.9334521687462864, + "grad_norm": 0.2155900513559602, + "learning_rate": 2.0523974009879364e-06, + "loss": 0.7872, + "num_tokens": 68004652163.0, + "step": 16270 + }, + { + "epoch": 1.9335710041592393, + "grad_norm": 0.2270992919056301, + "learning_rate": 2.0522109481331565e-06, + "loss": 0.7864, + "num_tokens": 68008828093.0, + "step": 16271 + }, + { + "epoch": 1.9336898395721924, + "grad_norm": 0.22829051263262773, + "learning_rate": 2.0520248266452588e-06, + "loss": 0.7929, + "num_tokens": 68013018776.0, + "step": 16272 + }, + { + "epoch": 1.9338086749851455, + "grad_norm": 0.2252694654015694, + "learning_rate": 2.0518390365311343e-06, + "loss": 0.8038, + "num_tokens": 68017207613.0, + "step": 16273 + }, + { + "epoch": 1.9339275103980986, + "grad_norm": 0.22621561678023017, + "learning_rate": 2.051653577797666e-06, + "loss": 0.8188, + "num_tokens": 68021395182.0, + "step": 16274 + }, + { + "epoch": 1.9340463458110517, + "grad_norm": 0.21045341141477458, + "learning_rate": 2.051468450451721e-06, + "loss": 0.8158, + "num_tokens": 68025584557.0, + "step": 16275 + }, + { + "epoch": 1.9341651812240048, + "grad_norm": 0.2384406760236546, + "learning_rate": 2.0512836545001543e-06, + "loss": 0.8537, + "num_tokens": 68029744474.0, + "step": 16276 + }, + { + "epoch": 1.934284016636958, + "grad_norm": 0.23145444542119106, + "learning_rate": 2.0510991899498103e-06, + "loss": 0.7825, + "num_tokens": 68033934720.0, + "step": 16277 + }, + { + "epoch": 1.934402852049911, + "grad_norm": 0.2312005722773256, + "learning_rate": 2.050915056807521e-06, + "loss": 0.8254, + "num_tokens": 68038125140.0, + "step": 16278 + }, + { + "epoch": 1.934521687462864, + "grad_norm": 0.22972792210661974, + "learning_rate": 2.0507312550801036e-06, + "loss": 0.835, + "num_tokens": 68042314516.0, + "step": 16279 + }, + { + "epoch": 1.934640522875817, + "grad_norm": 0.24641162369669226, + "learning_rate": 2.050547784774366e-06, + "loss": 0.8028, + "num_tokens": 68046503139.0, + "step": 16280 + }, + { + "epoch": 1.93475935828877, + "grad_norm": 0.2212803202838317, + "learning_rate": 2.0503646458971035e-06, + "loss": 0.8077, + "num_tokens": 68050693842.0, + "step": 16281 + }, + { + "epoch": 1.934878193701723, + "grad_norm": 0.2072177915094015, + "learning_rate": 2.0501818384550975e-06, + "loss": 0.8001, + "num_tokens": 68054882675.0, + "step": 16282 + }, + { + "epoch": 1.934997029114676, + "grad_norm": 0.21828674372730325, + "learning_rate": 2.0499993624551174e-06, + "loss": 0.8145, + "num_tokens": 68059042153.0, + "step": 16283 + }, + { + "epoch": 1.9351158645276292, + "grad_norm": 0.22183846521305792, + "learning_rate": 2.0498172179039226e-06, + "loss": 0.8307, + "num_tokens": 68063198936.0, + "step": 16284 + }, + { + "epoch": 1.9352346999405823, + "grad_norm": 0.22456492802588565, + "learning_rate": 2.0496354048082566e-06, + "loss": 0.8515, + "num_tokens": 68067387822.0, + "step": 16285 + }, + { + "epoch": 1.9353535353535354, + "grad_norm": 0.225651265750968, + "learning_rate": 2.0494539231748546e-06, + "loss": 0.7771, + "num_tokens": 68071576435.0, + "step": 16286 + }, + { + "epoch": 1.9354723707664885, + "grad_norm": 0.22002866084323644, + "learning_rate": 2.0492727730104344e-06, + "loss": 0.8085, + "num_tokens": 68075721961.0, + "step": 16287 + }, + { + "epoch": 1.9355912061794416, + "grad_norm": 0.2321918816557197, + "learning_rate": 2.0490919543217076e-06, + "loss": 0.8104, + "num_tokens": 68079911656.0, + "step": 16288 + }, + { + "epoch": 1.9357100415923947, + "grad_norm": 0.21636196433624627, + "learning_rate": 2.04891146711537e-06, + "loss": 0.8187, + "num_tokens": 68084100135.0, + "step": 16289 + }, + { + "epoch": 1.9358288770053476, + "grad_norm": 0.22191089764121458, + "learning_rate": 2.048731311398105e-06, + "loss": 0.8157, + "num_tokens": 68088288740.0, + "step": 16290 + }, + { + "epoch": 1.9359477124183007, + "grad_norm": 0.23306948660575869, + "learning_rate": 2.0485514871765844e-06, + "loss": 0.7951, + "num_tokens": 68092459059.0, + "step": 16291 + }, + { + "epoch": 1.9360665478312538, + "grad_norm": 0.2257600178932368, + "learning_rate": 2.0483719944574677e-06, + "loss": 0.7786, + "num_tokens": 68096648865.0, + "step": 16292 + }, + { + "epoch": 1.9361853832442066, + "grad_norm": 0.21914230060430556, + "learning_rate": 2.0481928332474017e-06, + "loss": 0.7879, + "num_tokens": 68100832589.0, + "step": 16293 + }, + { + "epoch": 1.9363042186571597, + "grad_norm": 0.21911155894271536, + "learning_rate": 2.048014003553022e-06, + "loss": 0.8351, + "num_tokens": 68105021706.0, + "step": 16294 + }, + { + "epoch": 1.9364230540701128, + "grad_norm": 0.23068998823930356, + "learning_rate": 2.047835505380951e-06, + "loss": 0.7989, + "num_tokens": 68109210160.0, + "step": 16295 + }, + { + "epoch": 1.936541889483066, + "grad_norm": 0.22207792929916007, + "learning_rate": 2.0476573387378004e-06, + "loss": 0.7915, + "num_tokens": 68113399517.0, + "step": 16296 + }, + { + "epoch": 1.936660724896019, + "grad_norm": 0.22890783471186052, + "learning_rate": 2.0474795036301664e-06, + "loss": 0.8204, + "num_tokens": 68117588184.0, + "step": 16297 + }, + { + "epoch": 1.9367795603089721, + "grad_norm": 0.23312032155449067, + "learning_rate": 2.0473020000646353e-06, + "loss": 0.7978, + "num_tokens": 68121761830.0, + "step": 16298 + }, + { + "epoch": 1.9368983957219252, + "grad_norm": 0.21823339523659238, + "learning_rate": 2.047124828047781e-06, + "loss": 0.7892, + "num_tokens": 68125929109.0, + "step": 16299 + }, + { + "epoch": 1.9370172311348783, + "grad_norm": 0.21988429016398123, + "learning_rate": 2.0469479875861645e-06, + "loss": 0.8361, + "num_tokens": 68130117774.0, + "step": 16300 + }, + { + "epoch": 1.9371360665478312, + "grad_norm": 0.21315016680054383, + "learning_rate": 2.046771478686336e-06, + "loss": 0.8129, + "num_tokens": 68134307841.0, + "step": 16301 + }, + { + "epoch": 1.9372549019607843, + "grad_norm": 0.21765552968758878, + "learning_rate": 2.046595301354831e-06, + "loss": 0.7681, + "num_tokens": 68138498303.0, + "step": 16302 + }, + { + "epoch": 1.9373737373737374, + "grad_norm": 0.20912077274546886, + "learning_rate": 2.046419455598173e-06, + "loss": 0.7769, + "num_tokens": 68142688090.0, + "step": 16303 + }, + { + "epoch": 1.9374925727866903, + "grad_norm": 0.22074606433601762, + "learning_rate": 2.0462439414228766e-06, + "loss": 0.8426, + "num_tokens": 68146877167.0, + "step": 16304 + }, + { + "epoch": 1.9376114081996434, + "grad_norm": 0.23267615173710837, + "learning_rate": 2.0460687588354393e-06, + "loss": 0.7646, + "num_tokens": 68151067580.0, + "step": 16305 + }, + { + "epoch": 1.9377302436125965, + "grad_norm": 0.2173766464085404, + "learning_rate": 2.0458939078423508e-06, + "loss": 0.8253, + "num_tokens": 68155254389.0, + "step": 16306 + }, + { + "epoch": 1.9378490790255496, + "grad_norm": 0.22503527846260551, + "learning_rate": 2.045719388450085e-06, + "loss": 0.8092, + "num_tokens": 68159443011.0, + "step": 16307 + }, + { + "epoch": 1.9379679144385027, + "grad_norm": 0.21471752475451752, + "learning_rate": 2.0455452006651057e-06, + "loss": 0.7756, + "num_tokens": 68163611462.0, + "step": 16308 + }, + { + "epoch": 1.9380867498514558, + "grad_norm": 0.2295927318371528, + "learning_rate": 2.045371344493863e-06, + "loss": 0.8134, + "num_tokens": 68167799002.0, + "step": 16309 + }, + { + "epoch": 1.938205585264409, + "grad_norm": 0.22175415071641766, + "learning_rate": 2.0451978199427957e-06, + "loss": 0.8343, + "num_tokens": 68171987308.0, + "step": 16310 + }, + { + "epoch": 1.938324420677362, + "grad_norm": 0.21869446708270474, + "learning_rate": 2.0450246270183298e-06, + "loss": 0.7494, + "num_tokens": 68176175073.0, + "step": 16311 + }, + { + "epoch": 1.9384432560903149, + "grad_norm": 0.2272610481873522, + "learning_rate": 2.04485176572688e-06, + "loss": 0.8361, + "num_tokens": 68180309990.0, + "step": 16312 + }, + { + "epoch": 1.938562091503268, + "grad_norm": 0.22086703153319742, + "learning_rate": 2.044679236074847e-06, + "loss": 0.8209, + "num_tokens": 68184499154.0, + "step": 16313 + }, + { + "epoch": 1.938680926916221, + "grad_norm": 0.2080325992644738, + "learning_rate": 2.044507038068621e-06, + "loss": 0.7955, + "num_tokens": 68188688495.0, + "step": 16314 + }, + { + "epoch": 1.938799762329174, + "grad_norm": 0.22446069854453957, + "learning_rate": 2.0443351717145784e-06, + "loss": 0.7867, + "num_tokens": 68192879013.0, + "step": 16315 + }, + { + "epoch": 1.938918597742127, + "grad_norm": 0.22650929888597537, + "learning_rate": 2.0441636370190836e-06, + "loss": 0.8355, + "num_tokens": 68197056660.0, + "step": 16316 + }, + { + "epoch": 1.9390374331550801, + "grad_norm": 0.21429518756246502, + "learning_rate": 2.0439924339884915e-06, + "loss": 0.8376, + "num_tokens": 68201246833.0, + "step": 16317 + }, + { + "epoch": 1.9391562685680332, + "grad_norm": 0.22349796301656755, + "learning_rate": 2.0438215626291387e-06, + "loss": 0.8605, + "num_tokens": 68205435100.0, + "step": 16318 + }, + { + "epoch": 1.9392751039809863, + "grad_norm": 0.21365781305971987, + "learning_rate": 2.0436510229473564e-06, + "loss": 0.802, + "num_tokens": 68209606831.0, + "step": 16319 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 0.21528566757719997, + "learning_rate": 2.043480814949457e-06, + "loss": 0.7804, + "num_tokens": 68213797576.0, + "step": 16320 + }, + { + "epoch": 1.9395127748068925, + "grad_norm": 0.23250905443036649, + "learning_rate": 2.0433109386417465e-06, + "loss": 0.7838, + "num_tokens": 68217985731.0, + "step": 16321 + }, + { + "epoch": 1.9396316102198456, + "grad_norm": 0.21570833873373652, + "learning_rate": 2.043141394030516e-06, + "loss": 0.77, + "num_tokens": 68222174403.0, + "step": 16322 + }, + { + "epoch": 1.9397504456327985, + "grad_norm": 0.21780189689164034, + "learning_rate": 2.042972181122043e-06, + "loss": 0.8019, + "num_tokens": 68226335697.0, + "step": 16323 + }, + { + "epoch": 1.9398692810457516, + "grad_norm": 0.21443451166010455, + "learning_rate": 2.042803299922595e-06, + "loss": 0.7917, + "num_tokens": 68230494185.0, + "step": 16324 + }, + { + "epoch": 1.9399881164587047, + "grad_norm": 0.2212484930603749, + "learning_rate": 2.0426347504384253e-06, + "loss": 0.8028, + "num_tokens": 68234684164.0, + "step": 16325 + }, + { + "epoch": 1.9401069518716576, + "grad_norm": 0.22159677865218153, + "learning_rate": 2.042466532675776e-06, + "loss": 0.7891, + "num_tokens": 68238874268.0, + "step": 16326 + }, + { + "epoch": 1.9402257872846107, + "grad_norm": 0.2166846694579942, + "learning_rate": 2.042298646640879e-06, + "loss": 0.8057, + "num_tokens": 68243062753.0, + "step": 16327 + }, + { + "epoch": 1.9403446226975638, + "grad_norm": 0.21283122471866148, + "learning_rate": 2.042131092339948e-06, + "loss": 0.8013, + "num_tokens": 68247253053.0, + "step": 16328 + }, + { + "epoch": 1.940463458110517, + "grad_norm": 0.21942358979067625, + "learning_rate": 2.0419638697791905e-06, + "loss": 0.7897, + "num_tokens": 68251441277.0, + "step": 16329 + }, + { + "epoch": 1.94058229352347, + "grad_norm": 0.22948603988741434, + "learning_rate": 2.0417969789647984e-06, + "loss": 0.8099, + "num_tokens": 68255604728.0, + "step": 16330 + }, + { + "epoch": 1.940701128936423, + "grad_norm": 0.2304694938068297, + "learning_rate": 2.0416304199029528e-06, + "loss": 0.7518, + "num_tokens": 68259773441.0, + "step": 16331 + }, + { + "epoch": 1.9408199643493762, + "grad_norm": 0.22446037801802754, + "learning_rate": 2.0414641925998213e-06, + "loss": 0.8155, + "num_tokens": 68263951571.0, + "step": 16332 + }, + { + "epoch": 1.9409387997623293, + "grad_norm": 0.22573366492476377, + "learning_rate": 2.041298297061561e-06, + "loss": 0.8027, + "num_tokens": 68268141694.0, + "step": 16333 + }, + { + "epoch": 1.9410576351752822, + "grad_norm": 0.2311677926767026, + "learning_rate": 2.041132733294315e-06, + "loss": 0.7884, + "num_tokens": 68272313636.0, + "step": 16334 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.2248755245241568, + "learning_rate": 2.0409675013042133e-06, + "loss": 0.777, + "num_tokens": 68276471949.0, + "step": 16335 + }, + { + "epoch": 1.9412953060011884, + "grad_norm": 0.219325592397031, + "learning_rate": 2.040802601097377e-06, + "loss": 0.7481, + "num_tokens": 68280639739.0, + "step": 16336 + }, + { + "epoch": 1.9414141414141413, + "grad_norm": 0.2147063938535436, + "learning_rate": 2.040638032679913e-06, + "loss": 0.8079, + "num_tokens": 68284800228.0, + "step": 16337 + }, + { + "epoch": 1.9415329768270944, + "grad_norm": 0.22238854248676163, + "learning_rate": 2.0404737960579133e-06, + "loss": 0.831, + "num_tokens": 68288942398.0, + "step": 16338 + }, + { + "epoch": 1.9416518122400475, + "grad_norm": 0.21194181374125573, + "learning_rate": 2.0403098912374625e-06, + "loss": 0.7733, + "num_tokens": 68293131813.0, + "step": 16339 + }, + { + "epoch": 1.9417706476530006, + "grad_norm": 0.22018690439998898, + "learning_rate": 2.040146318224631e-06, + "loss": 0.8113, + "num_tokens": 68297320180.0, + "step": 16340 + }, + { + "epoch": 1.9418894830659537, + "grad_norm": 0.21813883946579024, + "learning_rate": 2.039983077025474e-06, + "loss": 0.7994, + "num_tokens": 68301466588.0, + "step": 16341 + }, + { + "epoch": 1.9420083184789068, + "grad_norm": 0.2355494783572125, + "learning_rate": 2.0398201676460383e-06, + "loss": 0.8459, + "num_tokens": 68305655850.0, + "step": 16342 + }, + { + "epoch": 1.9421271538918599, + "grad_norm": 0.2099163510167057, + "learning_rate": 2.039657590092357e-06, + "loss": 0.8048, + "num_tokens": 68309844362.0, + "step": 16343 + }, + { + "epoch": 1.942245989304813, + "grad_norm": 0.22383570979563877, + "learning_rate": 2.0394953443704513e-06, + "loss": 0.8055, + "num_tokens": 68314034005.0, + "step": 16344 + }, + { + "epoch": 1.942364824717766, + "grad_norm": 0.2287403503652844, + "learning_rate": 2.0393334304863282e-06, + "loss": 0.8068, + "num_tokens": 68318223508.0, + "step": 16345 + }, + { + "epoch": 1.942483660130719, + "grad_norm": 0.22146638465351864, + "learning_rate": 2.0391718484459856e-06, + "loss": 0.8058, + "num_tokens": 68322400149.0, + "step": 16346 + }, + { + "epoch": 1.942602495543672, + "grad_norm": 0.21762999965235913, + "learning_rate": 2.0390105982554063e-06, + "loss": 0.7592, + "num_tokens": 68326589976.0, + "step": 16347 + }, + { + "epoch": 1.942721330956625, + "grad_norm": 0.2188642498513524, + "learning_rate": 2.0388496799205632e-06, + "loss": 0.8077, + "num_tokens": 68330778776.0, + "step": 16348 + }, + { + "epoch": 1.942840166369578, + "grad_norm": 0.21639407274423647, + "learning_rate": 2.038689093447414e-06, + "loss": 0.8285, + "num_tokens": 68334967128.0, + "step": 16349 + }, + { + "epoch": 1.9429590017825311, + "grad_norm": 0.23221783907088284, + "learning_rate": 2.0385288388419074e-06, + "loss": 0.8174, + "num_tokens": 68339157151.0, + "step": 16350 + }, + { + "epoch": 1.9430778371954842, + "grad_norm": 0.21086943154072332, + "learning_rate": 2.0383689161099764e-06, + "loss": 0.7988, + "num_tokens": 68343326612.0, + "step": 16351 + }, + { + "epoch": 1.9431966726084373, + "grad_norm": 0.22315881844964433, + "learning_rate": 2.038209325257545e-06, + "loss": 0.8048, + "num_tokens": 68347514561.0, + "step": 16352 + }, + { + "epoch": 1.9433155080213904, + "grad_norm": 0.21692263781817975, + "learning_rate": 2.0380500662905217e-06, + "loss": 0.8359, + "num_tokens": 68351694322.0, + "step": 16353 + }, + { + "epoch": 1.9434343434343435, + "grad_norm": 0.22125249321500204, + "learning_rate": 2.0378911392148073e-06, + "loss": 0.8155, + "num_tokens": 68355884366.0, + "step": 16354 + }, + { + "epoch": 1.9435531788472966, + "grad_norm": 0.22685865717567474, + "learning_rate": 2.037732544036285e-06, + "loss": 0.8183, + "num_tokens": 68360062325.0, + "step": 16355 + }, + { + "epoch": 1.9436720142602497, + "grad_norm": 0.22599302423247983, + "learning_rate": 2.0375742807608284e-06, + "loss": 0.807, + "num_tokens": 68364251176.0, + "step": 16356 + }, + { + "epoch": 1.9437908496732026, + "grad_norm": 0.21288796107753993, + "learning_rate": 2.037416349394299e-06, + "loss": 0.8188, + "num_tokens": 68368440079.0, + "step": 16357 + }, + { + "epoch": 1.9439096850861557, + "grad_norm": 0.22811806503291637, + "learning_rate": 2.037258749942545e-06, + "loss": 0.8229, + "num_tokens": 68372619555.0, + "step": 16358 + }, + { + "epoch": 1.9440285204991086, + "grad_norm": 0.2271651411202391, + "learning_rate": 2.037101482411404e-06, + "loss": 0.8014, + "num_tokens": 68376808093.0, + "step": 16359 + }, + { + "epoch": 1.9441473559120617, + "grad_norm": 0.22211064678236658, + "learning_rate": 2.036944546806699e-06, + "loss": 0.8212, + "num_tokens": 68380996273.0, + "step": 16360 + }, + { + "epoch": 1.9442661913250148, + "grad_norm": 0.24126571375615966, + "learning_rate": 2.0367879431342417e-06, + "loss": 0.8423, + "num_tokens": 68385183593.0, + "step": 16361 + }, + { + "epoch": 1.9443850267379679, + "grad_norm": 0.22394890726692349, + "learning_rate": 2.036631671399833e-06, + "loss": 0.7918, + "num_tokens": 68389355649.0, + "step": 16362 + }, + { + "epoch": 1.944503862150921, + "grad_norm": 0.2278039019491303, + "learning_rate": 2.0364757316092594e-06, + "loss": 0.8356, + "num_tokens": 68393544730.0, + "step": 16363 + }, + { + "epoch": 1.944622697563874, + "grad_norm": 0.21637512192903463, + "learning_rate": 2.036320123768295e-06, + "loss": 0.8288, + "num_tokens": 68397732770.0, + "step": 16364 + }, + { + "epoch": 1.9447415329768272, + "grad_norm": 0.2148353593721219, + "learning_rate": 2.0361648478827046e-06, + "loss": 0.7754, + "num_tokens": 68401923094.0, + "step": 16365 + }, + { + "epoch": 1.9448603683897803, + "grad_norm": 0.2161814614282371, + "learning_rate": 2.0360099039582366e-06, + "loss": 0.8058, + "num_tokens": 68406112325.0, + "step": 16366 + }, + { + "epoch": 1.9449792038027334, + "grad_norm": 0.2074102038400218, + "learning_rate": 2.03585529200063e-06, + "loss": 0.7832, + "num_tokens": 68410300024.0, + "step": 16367 + }, + { + "epoch": 1.9450980392156862, + "grad_norm": 0.2108265730607908, + "learning_rate": 2.0357010120156097e-06, + "loss": 0.7996, + "num_tokens": 68414490149.0, + "step": 16368 + }, + { + "epoch": 1.9452168746286393, + "grad_norm": 0.2166012323486805, + "learning_rate": 2.035547064008891e-06, + "loss": 0.7866, + "num_tokens": 68418679155.0, + "step": 16369 + }, + { + "epoch": 1.9453357100415924, + "grad_norm": 0.21142672065590726, + "learning_rate": 2.035393447986174e-06, + "loss": 0.7972, + "num_tokens": 68422850888.0, + "step": 16370 + }, + { + "epoch": 1.9454545454545453, + "grad_norm": 0.2180099684567734, + "learning_rate": 2.0352401639531464e-06, + "loss": 0.7883, + "num_tokens": 68427040109.0, + "step": 16371 + }, + { + "epoch": 1.9455733808674984, + "grad_norm": 0.21883962300591744, + "learning_rate": 2.0350872119154873e-06, + "loss": 0.8217, + "num_tokens": 68431216430.0, + "step": 16372 + }, + { + "epoch": 1.9456922162804515, + "grad_norm": 0.2110635816855799, + "learning_rate": 2.0349345918788594e-06, + "loss": 0.7968, + "num_tokens": 68435404303.0, + "step": 16373 + }, + { + "epoch": 1.9458110516934046, + "grad_norm": 0.22494971039245099, + "learning_rate": 2.034782303848915e-06, + "loss": 0.7766, + "num_tokens": 68439569241.0, + "step": 16374 + }, + { + "epoch": 1.9459298871063577, + "grad_norm": 0.22614655789895563, + "learning_rate": 2.034630347831295e-06, + "loss": 0.7948, + "num_tokens": 68443731737.0, + "step": 16375 + }, + { + "epoch": 1.9460487225193108, + "grad_norm": 0.2419688333500764, + "learning_rate": 2.0344787238316256e-06, + "loss": 0.7661, + "num_tokens": 68447922156.0, + "step": 16376 + }, + { + "epoch": 1.946167557932264, + "grad_norm": 0.21129262083070596, + "learning_rate": 2.0343274318555205e-06, + "loss": 0.808, + "num_tokens": 68452110871.0, + "step": 16377 + }, + { + "epoch": 1.946286393345217, + "grad_norm": 0.20356355796889217, + "learning_rate": 2.0341764719085854e-06, + "loss": 0.7964, + "num_tokens": 68456270980.0, + "step": 16378 + }, + { + "epoch": 1.94640522875817, + "grad_norm": 0.22787784758531163, + "learning_rate": 2.03402584399641e-06, + "loss": 0.8167, + "num_tokens": 68460458896.0, + "step": 16379 + }, + { + "epoch": 1.946524064171123, + "grad_norm": 0.2079488461438058, + "learning_rate": 2.033875548124572e-06, + "loss": 0.8197, + "num_tokens": 68464618238.0, + "step": 16380 + }, + { + "epoch": 1.946642899584076, + "grad_norm": 0.2233708699030249, + "learning_rate": 2.0337255842986377e-06, + "loss": 0.8203, + "num_tokens": 68468796747.0, + "step": 16381 + }, + { + "epoch": 1.946761734997029, + "grad_norm": 0.21766152273186218, + "learning_rate": 2.0335759525241613e-06, + "loss": 0.8211, + "num_tokens": 68472985842.0, + "step": 16382 + }, + { + "epoch": 1.946880570409982, + "grad_norm": 0.21907315824252535, + "learning_rate": 2.033426652806683e-06, + "loss": 0.8558, + "num_tokens": 68477160125.0, + "step": 16383 + }, + { + "epoch": 1.9469994058229352, + "grad_norm": 0.20594338173950114, + "learning_rate": 2.0332776851517323e-06, + "loss": 0.8253, + "num_tokens": 68481348461.0, + "step": 16384 + }, + { + "epoch": 1.9471182412358883, + "grad_norm": 0.22792875878056773, + "learning_rate": 2.033129049564826e-06, + "loss": 0.8059, + "num_tokens": 68485536309.0, + "step": 16385 + }, + { + "epoch": 1.9472370766488414, + "grad_norm": 0.21705052369154876, + "learning_rate": 2.032980746051469e-06, + "loss": 0.7926, + "num_tokens": 68489724475.0, + "step": 16386 + }, + { + "epoch": 1.9473559120617945, + "grad_norm": 0.21481793253075188, + "learning_rate": 2.0328327746171536e-06, + "loss": 0.8066, + "num_tokens": 68493913439.0, + "step": 16387 + }, + { + "epoch": 1.9474747474747476, + "grad_norm": 0.23362028849650182, + "learning_rate": 2.0326851352673588e-06, + "loss": 0.8176, + "num_tokens": 68498103956.0, + "step": 16388 + }, + { + "epoch": 1.9475935828877007, + "grad_norm": 0.21774735296106468, + "learning_rate": 2.0325378280075526e-06, + "loss": 0.7752, + "num_tokens": 68502294338.0, + "step": 16389 + }, + { + "epoch": 1.9477124183006536, + "grad_norm": 0.2161443234630939, + "learning_rate": 2.032390852843191e-06, + "loss": 0.7907, + "num_tokens": 68506465183.0, + "step": 16390 + }, + { + "epoch": 1.9478312537136067, + "grad_norm": 0.23829368252856334, + "learning_rate": 2.0322442097797154e-06, + "loss": 0.7808, + "num_tokens": 68510655149.0, + "step": 16391 + }, + { + "epoch": 1.9479500891265598, + "grad_norm": 0.22155472175381274, + "learning_rate": 2.0320978988225577e-06, + "loss": 0.781, + "num_tokens": 68514840770.0, + "step": 16392 + }, + { + "epoch": 1.9480689245395126, + "grad_norm": 0.21949786053807113, + "learning_rate": 2.031951919977136e-06, + "loss": 0.8094, + "num_tokens": 68519031451.0, + "step": 16393 + }, + { + "epoch": 1.9481877599524657, + "grad_norm": 0.23415940218519807, + "learning_rate": 2.031806273248856e-06, + "loss": 0.7658, + "num_tokens": 68523211620.0, + "step": 16394 + }, + { + "epoch": 1.9483065953654188, + "grad_norm": 0.21296126179169705, + "learning_rate": 2.0316609586431127e-06, + "loss": 0.7657, + "num_tokens": 68527400665.0, + "step": 16395 + }, + { + "epoch": 1.948425430778372, + "grad_norm": 0.21852800803610445, + "learning_rate": 2.0315159761652865e-06, + "loss": 0.8094, + "num_tokens": 68531590685.0, + "step": 16396 + }, + { + "epoch": 1.948544266191325, + "grad_norm": 0.2263935973531658, + "learning_rate": 2.0313713258207463e-06, + "loss": 0.7909, + "num_tokens": 68535778675.0, + "step": 16397 + }, + { + "epoch": 1.9486631016042781, + "grad_norm": 0.20786560693977968, + "learning_rate": 2.0312270076148506e-06, + "loss": 0.8121, + "num_tokens": 68539941484.0, + "step": 16398 + }, + { + "epoch": 1.9487819370172312, + "grad_norm": 0.21059792055662116, + "learning_rate": 2.031083021552942e-06, + "loss": 0.789, + "num_tokens": 68544112257.0, + "step": 16399 + }, + { + "epoch": 1.9489007724301843, + "grad_norm": 0.23356162584624116, + "learning_rate": 2.0309393676403545e-06, + "loss": 0.786, + "num_tokens": 68548301191.0, + "step": 16400 + }, + { + "epoch": 1.9490196078431372, + "grad_norm": 0.20680208581690418, + "learning_rate": 2.030796045882407e-06, + "loss": 0.8176, + "num_tokens": 68552433684.0, + "step": 16401 + }, + { + "epoch": 1.9491384432560903, + "grad_norm": 0.2210165287169802, + "learning_rate": 2.030653056284407e-06, + "loss": 0.7861, + "num_tokens": 68556622445.0, + "step": 16402 + }, + { + "epoch": 1.9492572786690434, + "grad_norm": 0.22951013580525814, + "learning_rate": 2.030510398851651e-06, + "loss": 0.8167, + "num_tokens": 68560812672.0, + "step": 16403 + }, + { + "epoch": 1.9493761140819963, + "grad_norm": 0.22538209574947607, + "learning_rate": 2.030368073589421e-06, + "loss": 0.8098, + "num_tokens": 68564958907.0, + "step": 16404 + }, + { + "epoch": 1.9494949494949494, + "grad_norm": 0.21750654092536884, + "learning_rate": 2.0302260805029887e-06, + "loss": 0.7725, + "num_tokens": 68569148635.0, + "step": 16405 + }, + { + "epoch": 1.9496137849079025, + "grad_norm": 0.21651409176728992, + "learning_rate": 2.0300844195976124e-06, + "loss": 0.7976, + "num_tokens": 68573324786.0, + "step": 16406 + }, + { + "epoch": 1.9497326203208556, + "grad_norm": 0.22820643685343464, + "learning_rate": 2.029943090878538e-06, + "loss": 0.8129, + "num_tokens": 68577512491.0, + "step": 16407 + }, + { + "epoch": 1.9498514557338087, + "grad_norm": 0.2113740721297134, + "learning_rate": 2.0298020943509992e-06, + "loss": 0.823, + "num_tokens": 68581696148.0, + "step": 16408 + }, + { + "epoch": 1.9499702911467618, + "grad_norm": 0.21843599127042448, + "learning_rate": 2.029661430020217e-06, + "loss": 0.7556, + "num_tokens": 68585886279.0, + "step": 16409 + }, + { + "epoch": 1.950089126559715, + "grad_norm": 0.20496668814986244, + "learning_rate": 2.0295210978914026e-06, + "loss": 0.8121, + "num_tokens": 68590066591.0, + "step": 16410 + }, + { + "epoch": 1.950207961972668, + "grad_norm": 0.2016932757928554, + "learning_rate": 2.029381097969752e-06, + "loss": 0.7877, + "num_tokens": 68594242496.0, + "step": 16411 + }, + { + "epoch": 1.9503267973856209, + "grad_norm": 0.22476427617278258, + "learning_rate": 2.0292414302604492e-06, + "loss": 0.8052, + "num_tokens": 68598408403.0, + "step": 16412 + }, + { + "epoch": 1.950445632798574, + "grad_norm": 0.2189418901869384, + "learning_rate": 2.0291020947686667e-06, + "loss": 0.8304, + "num_tokens": 68602595356.0, + "step": 16413 + }, + { + "epoch": 1.950564468211527, + "grad_norm": 0.2116710258158182, + "learning_rate": 2.0289630914995657e-06, + "loss": 0.7965, + "num_tokens": 68606724098.0, + "step": 16414 + }, + { + "epoch": 1.95068330362448, + "grad_norm": 0.22518482551034824, + "learning_rate": 2.0288244204582927e-06, + "loss": 0.8254, + "num_tokens": 68610912781.0, + "step": 16415 + }, + { + "epoch": 1.950802139037433, + "grad_norm": 0.2131883448701934, + "learning_rate": 2.028686081649984e-06, + "loss": 0.7826, + "num_tokens": 68615086996.0, + "step": 16416 + }, + { + "epoch": 1.9509209744503861, + "grad_norm": 0.21033929986948657, + "learning_rate": 2.0285480750797623e-06, + "loss": 0.768, + "num_tokens": 68619276142.0, + "step": 16417 + }, + { + "epoch": 1.9510398098633392, + "grad_norm": 0.20926174978444798, + "learning_rate": 2.0284104007527387e-06, + "loss": 0.8098, + "num_tokens": 68623441131.0, + "step": 16418 + }, + { + "epoch": 1.9511586452762923, + "grad_norm": 0.21752745262181838, + "learning_rate": 2.0282730586740117e-06, + "loss": 0.7863, + "num_tokens": 68627608868.0, + "step": 16419 + }, + { + "epoch": 1.9512774806892454, + "grad_norm": 0.21553696675453834, + "learning_rate": 2.028136048848667e-06, + "loss": 0.8272, + "num_tokens": 68631797748.0, + "step": 16420 + }, + { + "epoch": 1.9513963161021985, + "grad_norm": 0.21116208919912677, + "learning_rate": 2.0279993712817793e-06, + "loss": 0.8142, + "num_tokens": 68635987718.0, + "step": 16421 + }, + { + "epoch": 1.9515151515151516, + "grad_norm": 0.24249519341359993, + "learning_rate": 2.02786302597841e-06, + "loss": 0.8063, + "num_tokens": 68640177104.0, + "step": 16422 + }, + { + "epoch": 1.9516339869281045, + "grad_norm": 0.20479812427282196, + "learning_rate": 2.0277270129436076e-06, + "loss": 0.8166, + "num_tokens": 68644365616.0, + "step": 16423 + }, + { + "epoch": 1.9517528223410576, + "grad_norm": 0.21734838370001014, + "learning_rate": 2.0275913321824104e-06, + "loss": 0.8063, + "num_tokens": 68648556411.0, + "step": 16424 + }, + { + "epoch": 1.9518716577540107, + "grad_norm": 0.2194333923291828, + "learning_rate": 2.027455983699843e-06, + "loss": 0.8064, + "num_tokens": 68652745055.0, + "step": 16425 + }, + { + "epoch": 1.9519904931669636, + "grad_norm": 0.2107994534440474, + "learning_rate": 2.027320967500918e-06, + "loss": 0.805, + "num_tokens": 68656904020.0, + "step": 16426 + }, + { + "epoch": 1.9521093285799167, + "grad_norm": 0.2123179033364449, + "learning_rate": 2.0271862835906333e-06, + "loss": 0.7946, + "num_tokens": 68661063133.0, + "step": 16427 + }, + { + "epoch": 1.9522281639928698, + "grad_norm": 0.21109040393317224, + "learning_rate": 2.0270519319739797e-06, + "loss": 0.8024, + "num_tokens": 68665237262.0, + "step": 16428 + }, + { + "epoch": 1.952346999405823, + "grad_norm": 0.22232199289932056, + "learning_rate": 2.0269179126559302e-06, + "loss": 0.7965, + "num_tokens": 68669401535.0, + "step": 16429 + }, + { + "epoch": 1.952465834818776, + "grad_norm": 0.2141614766879544, + "learning_rate": 2.0267842256414497e-06, + "loss": 0.7847, + "num_tokens": 68673533568.0, + "step": 16430 + }, + { + "epoch": 1.952584670231729, + "grad_norm": 0.22297313452320341, + "learning_rate": 2.026650870935489e-06, + "loss": 0.8107, + "num_tokens": 68677724307.0, + "step": 16431 + }, + { + "epoch": 1.9527035056446822, + "grad_norm": 0.2078822925496599, + "learning_rate": 2.0265178485429858e-06, + "loss": 0.7943, + "num_tokens": 68681912952.0, + "step": 16432 + }, + { + "epoch": 1.9528223410576353, + "grad_norm": 0.22948167230657718, + "learning_rate": 2.026385158468866e-06, + "loss": 0.8209, + "num_tokens": 68686100211.0, + "step": 16433 + }, + { + "epoch": 1.9529411764705882, + "grad_norm": 0.2133178699998379, + "learning_rate": 2.026252800718045e-06, + "loss": 0.8074, + "num_tokens": 68690287365.0, + "step": 16434 + }, + { + "epoch": 1.9530600118835413, + "grad_norm": 0.2040976585310003, + "learning_rate": 2.0261207752954235e-06, + "loss": 0.7958, + "num_tokens": 68694474626.0, + "step": 16435 + }, + { + "epoch": 1.9531788472964944, + "grad_norm": 0.20928190482725495, + "learning_rate": 2.025989082205891e-06, + "loss": 0.7877, + "num_tokens": 68698665394.0, + "step": 16436 + }, + { + "epoch": 1.9532976827094473, + "grad_norm": 0.21938998944403823, + "learning_rate": 2.0258577214543245e-06, + "loss": 0.7895, + "num_tokens": 68702852854.0, + "step": 16437 + }, + { + "epoch": 1.9534165181224004, + "grad_norm": 0.21151050644115282, + "learning_rate": 2.0257266930455895e-06, + "loss": 0.8253, + "num_tokens": 68707019215.0, + "step": 16438 + }, + { + "epoch": 1.9535353535353535, + "grad_norm": 0.20294322534719658, + "learning_rate": 2.025595996984537e-06, + "loss": 0.8085, + "num_tokens": 68711208247.0, + "step": 16439 + }, + { + "epoch": 1.9536541889483066, + "grad_norm": 0.22113053527024268, + "learning_rate": 2.0254656332760087e-06, + "loss": 0.853, + "num_tokens": 68715397949.0, + "step": 16440 + }, + { + "epoch": 1.9537730243612597, + "grad_norm": 0.20572950984796623, + "learning_rate": 2.0253356019248314e-06, + "loss": 0.7908, + "num_tokens": 68719587841.0, + "step": 16441 + }, + { + "epoch": 1.9538918597742128, + "grad_norm": 0.2091221509848561, + "learning_rate": 2.0252059029358208e-06, + "loss": 0.798, + "num_tokens": 68723768432.0, + "step": 16442 + }, + { + "epoch": 1.9540106951871659, + "grad_norm": 0.22360508818166827, + "learning_rate": 2.02507653631378e-06, + "loss": 0.7855, + "num_tokens": 68727958727.0, + "step": 16443 + }, + { + "epoch": 1.954129530600119, + "grad_norm": 0.21125767907170345, + "learning_rate": 2.0249475020635e-06, + "loss": 0.8072, + "num_tokens": 68732146643.0, + "step": 16444 + }, + { + "epoch": 1.954248366013072, + "grad_norm": 0.20692861564564854, + "learning_rate": 2.0248188001897588e-06, + "loss": 0.7808, + "num_tokens": 68736279408.0, + "step": 16445 + }, + { + "epoch": 1.954367201426025, + "grad_norm": 0.21726312584213398, + "learning_rate": 2.0246904306973238e-06, + "loss": 0.7944, + "num_tokens": 68740442814.0, + "step": 16446 + }, + { + "epoch": 1.954486036838978, + "grad_norm": 0.21472779290358446, + "learning_rate": 2.0245623935909485e-06, + "loss": 0.7443, + "num_tokens": 68744592696.0, + "step": 16447 + }, + { + "epoch": 1.954604872251931, + "grad_norm": 0.21976085014771415, + "learning_rate": 2.0244346888753743e-06, + "loss": 0.796, + "num_tokens": 68748775479.0, + "step": 16448 + }, + { + "epoch": 1.954723707664884, + "grad_norm": 0.22749391850556552, + "learning_rate": 2.024307316555331e-06, + "loss": 0.8483, + "num_tokens": 68752964372.0, + "step": 16449 + }, + { + "epoch": 1.9548425430778371, + "grad_norm": 0.22005566424981704, + "learning_rate": 2.0241802766355352e-06, + "loss": 0.7892, + "num_tokens": 68757126207.0, + "step": 16450 + }, + { + "epoch": 1.9549613784907902, + "grad_norm": 0.23309202157218184, + "learning_rate": 2.024053569120691e-06, + "loss": 0.8149, + "num_tokens": 68761307408.0, + "step": 16451 + }, + { + "epoch": 1.9550802139037433, + "grad_norm": 0.21025928232497437, + "learning_rate": 2.0239271940154926e-06, + "loss": 0.8211, + "num_tokens": 68765468809.0, + "step": 16452 + }, + { + "epoch": 1.9551990493166964, + "grad_norm": 0.22107912667266597, + "learning_rate": 2.0238011513246182e-06, + "loss": 0.7692, + "num_tokens": 68769634228.0, + "step": 16453 + }, + { + "epoch": 1.9553178847296495, + "grad_norm": 0.22764747884803493, + "learning_rate": 2.0236754410527366e-06, + "loss": 0.829, + "num_tokens": 68773822330.0, + "step": 16454 + }, + { + "epoch": 1.9554367201426026, + "grad_norm": 0.21457617366552229, + "learning_rate": 2.023550063204503e-06, + "loss": 0.7781, + "num_tokens": 68778012038.0, + "step": 16455 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.21405901339688493, + "learning_rate": 2.0234250177845617e-06, + "loss": 0.8229, + "num_tokens": 68782172437.0, + "step": 16456 + }, + { + "epoch": 1.9556743909685086, + "grad_norm": 0.2120393021715913, + "learning_rate": 2.0233003047975412e-06, + "loss": 0.8098, + "num_tokens": 68786359864.0, + "step": 16457 + }, + { + "epoch": 1.9557932263814617, + "grad_norm": 0.21911738094609612, + "learning_rate": 2.0231759242480624e-06, + "loss": 0.814, + "num_tokens": 68790538450.0, + "step": 16458 + }, + { + "epoch": 1.9559120617944146, + "grad_norm": 0.23862753395897518, + "learning_rate": 2.023051876140729e-06, + "loss": 0.7789, + "num_tokens": 68794716993.0, + "step": 16459 + }, + { + "epoch": 1.9560308972073677, + "grad_norm": 0.21181171091072798, + "learning_rate": 2.0229281604801383e-06, + "loss": 0.8068, + "num_tokens": 68798904378.0, + "step": 16460 + }, + { + "epoch": 1.9561497326203208, + "grad_norm": 0.22194314812723762, + "learning_rate": 2.0228047772708686e-06, + "loss": 0.8184, + "num_tokens": 68803070792.0, + "step": 16461 + }, + { + "epoch": 1.9562685680332739, + "grad_norm": 0.23738524546641904, + "learning_rate": 2.0226817265174907e-06, + "loss": 0.8329, + "num_tokens": 68807260297.0, + "step": 16462 + }, + { + "epoch": 1.956387403446227, + "grad_norm": 0.22901348917181544, + "learning_rate": 2.0225590082245623e-06, + "loss": 0.7943, + "num_tokens": 68811449151.0, + "step": 16463 + }, + { + "epoch": 1.95650623885918, + "grad_norm": 0.22713691066193228, + "learning_rate": 2.022436622396626e-06, + "loss": 0.8064, + "num_tokens": 68815639032.0, + "step": 16464 + }, + { + "epoch": 1.9566250742721332, + "grad_norm": 0.22668857492720268, + "learning_rate": 2.022314569038217e-06, + "loss": 0.7924, + "num_tokens": 68819827986.0, + "step": 16465 + }, + { + "epoch": 1.9567439096850863, + "grad_norm": 0.2288107679339518, + "learning_rate": 2.0221928481538523e-06, + "loss": 0.8006, + "num_tokens": 68824007584.0, + "step": 16466 + }, + { + "epoch": 1.9568627450980394, + "grad_norm": 0.22260200410109396, + "learning_rate": 2.0220714597480423e-06, + "loss": 0.7901, + "num_tokens": 68828175071.0, + "step": 16467 + }, + { + "epoch": 1.9569815805109922, + "grad_norm": 0.20373778161931666, + "learning_rate": 2.021950403825281e-06, + "loss": 0.8178, + "num_tokens": 68832349793.0, + "step": 16468 + }, + { + "epoch": 1.9571004159239453, + "grad_norm": 0.20843712017468496, + "learning_rate": 2.021829680390052e-06, + "loss": 0.7536, + "num_tokens": 68836539847.0, + "step": 16469 + }, + { + "epoch": 1.9572192513368984, + "grad_norm": 0.22162895505364089, + "learning_rate": 2.0217092894468253e-06, + "loss": 0.8124, + "num_tokens": 68840705872.0, + "step": 16470 + }, + { + "epoch": 1.9573380867498513, + "grad_norm": 0.21602955918628597, + "learning_rate": 2.02158923100006e-06, + "loss": 0.7944, + "num_tokens": 68844895605.0, + "step": 16471 + }, + { + "epoch": 1.9574569221628044, + "grad_norm": 0.20928810625865238, + "learning_rate": 2.0214695050542015e-06, + "loss": 0.791, + "num_tokens": 68849073308.0, + "step": 16472 + }, + { + "epoch": 1.9575757575757575, + "grad_norm": 0.2134103335170243, + "learning_rate": 2.0213501116136853e-06, + "loss": 0.8083, + "num_tokens": 68853263346.0, + "step": 16473 + }, + { + "epoch": 1.9576945929887106, + "grad_norm": 0.2210842096526354, + "learning_rate": 2.021231050682932e-06, + "loss": 0.8365, + "num_tokens": 68857454004.0, + "step": 16474 + }, + { + "epoch": 1.9578134284016637, + "grad_norm": 0.21376704982355843, + "learning_rate": 2.021112322266351e-06, + "loss": 0.8454, + "num_tokens": 68861643303.0, + "step": 16475 + }, + { + "epoch": 1.9579322638146168, + "grad_norm": 0.2120204075692337, + "learning_rate": 2.020993926368338e-06, + "loss": 0.8223, + "num_tokens": 68865799561.0, + "step": 16476 + }, + { + "epoch": 1.95805109922757, + "grad_norm": 0.22486711336394394, + "learning_rate": 2.0208758629932803e-06, + "loss": 0.7903, + "num_tokens": 68869989461.0, + "step": 16477 + }, + { + "epoch": 1.958169934640523, + "grad_norm": 0.22067561780309575, + "learning_rate": 2.0207581321455473e-06, + "loss": 0.7994, + "num_tokens": 68874177498.0, + "step": 16478 + }, + { + "epoch": 1.958288770053476, + "grad_norm": 0.24212499564601023, + "learning_rate": 2.0206407338295007e-06, + "loss": 0.8204, + "num_tokens": 68878308885.0, + "step": 16479 + }, + { + "epoch": 1.958407605466429, + "grad_norm": 0.22080817260944077, + "learning_rate": 2.020523668049487e-06, + "loss": 0.8053, + "num_tokens": 68882472778.0, + "step": 16480 + }, + { + "epoch": 1.958526440879382, + "grad_norm": 0.2333738584376269, + "learning_rate": 2.0204069348098424e-06, + "loss": 0.8097, + "num_tokens": 68886641244.0, + "step": 16481 + }, + { + "epoch": 1.958645276292335, + "grad_norm": 0.22451759553987322, + "learning_rate": 2.020290534114891e-06, + "loss": 0.81, + "num_tokens": 68890830096.0, + "step": 16482 + }, + { + "epoch": 1.958764111705288, + "grad_norm": 0.22650173083503597, + "learning_rate": 2.0201744659689402e-06, + "loss": 0.7923, + "num_tokens": 68894993665.0, + "step": 16483 + }, + { + "epoch": 1.9588829471182412, + "grad_norm": 0.22068612854895897, + "learning_rate": 2.020058730376292e-06, + "loss": 0.8031, + "num_tokens": 68899182949.0, + "step": 16484 + }, + { + "epoch": 1.9590017825311943, + "grad_norm": 0.22295236781738131, + "learning_rate": 2.0199433273412294e-06, + "loss": 0.771, + "num_tokens": 68903371327.0, + "step": 16485 + }, + { + "epoch": 1.9591206179441474, + "grad_norm": 0.21139757891816738, + "learning_rate": 2.0198282568680293e-06, + "loss": 0.8232, + "num_tokens": 68907558955.0, + "step": 16486 + }, + { + "epoch": 1.9592394533571005, + "grad_norm": 0.22018474326321194, + "learning_rate": 2.0197135189609513e-06, + "loss": 0.7828, + "num_tokens": 68911692740.0, + "step": 16487 + }, + { + "epoch": 1.9593582887700536, + "grad_norm": 0.21444232153524498, + "learning_rate": 2.019599113624244e-06, + "loss": 0.8635, + "num_tokens": 68915855582.0, + "step": 16488 + }, + { + "epoch": 1.9594771241830067, + "grad_norm": 0.2233725732682246, + "learning_rate": 2.0194850408621447e-06, + "loss": 0.8099, + "num_tokens": 68920045275.0, + "step": 16489 + }, + { + "epoch": 1.9595959595959596, + "grad_norm": 0.2229985809260282, + "learning_rate": 2.019371300678879e-06, + "loss": 0.8192, + "num_tokens": 68924211228.0, + "step": 16490 + }, + { + "epoch": 1.9597147950089127, + "grad_norm": 0.22932152633971298, + "learning_rate": 2.0192578930786584e-06, + "loss": 0.7981, + "num_tokens": 68928400710.0, + "step": 16491 + }, + { + "epoch": 1.9598336304218658, + "grad_norm": 0.22337904989502463, + "learning_rate": 2.0191448180656823e-06, + "loss": 0.819, + "num_tokens": 68932551165.0, + "step": 16492 + }, + { + "epoch": 1.9599524658348186, + "grad_norm": 0.22128190213709942, + "learning_rate": 2.0190320756441385e-06, + "loss": 0.8107, + "num_tokens": 68936719791.0, + "step": 16493 + }, + { + "epoch": 1.9600713012477717, + "grad_norm": 0.22010059257949835, + "learning_rate": 2.018919665818202e-06, + "loss": 0.8061, + "num_tokens": 68940909038.0, + "step": 16494 + }, + { + "epoch": 1.9601901366607248, + "grad_norm": 0.23921014393430962, + "learning_rate": 2.0188075885920363e-06, + "loss": 0.8092, + "num_tokens": 68945084814.0, + "step": 16495 + }, + { + "epoch": 1.960308972073678, + "grad_norm": 0.21515934593291286, + "learning_rate": 2.018695843969792e-06, + "loss": 0.8119, + "num_tokens": 68949245451.0, + "step": 16496 + }, + { + "epoch": 1.960427807486631, + "grad_norm": 0.21594515377787327, + "learning_rate": 2.0185844319556065e-06, + "loss": 0.8378, + "num_tokens": 68953434214.0, + "step": 16497 + }, + { + "epoch": 1.9605466428995841, + "grad_norm": 0.21206753722945237, + "learning_rate": 2.018473352553607e-06, + "loss": 0.8254, + "num_tokens": 68957623582.0, + "step": 16498 + }, + { + "epoch": 1.9606654783125372, + "grad_norm": 0.21417627343701773, + "learning_rate": 2.0183626057679063e-06, + "loss": 0.8034, + "num_tokens": 68961812859.0, + "step": 16499 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.21746228582640023, + "learning_rate": 2.0182521916026065e-06, + "loss": 0.8623, + "num_tokens": 68965999543.0, + "step": 16500 + }, + { + "epoch": 1.9609031491384432, + "grad_norm": 0.22333181611827685, + "learning_rate": 2.0181421100617956e-06, + "loss": 0.8218, + "num_tokens": 68970169443.0, + "step": 16501 + }, + { + "epoch": 1.9610219845513963, + "grad_norm": 0.23036861100868414, + "learning_rate": 2.0180323611495504e-06, + "loss": 0.8036, + "num_tokens": 68974360652.0, + "step": 16502 + }, + { + "epoch": 1.9611408199643494, + "grad_norm": 0.20825645235767534, + "learning_rate": 2.0179229448699354e-06, + "loss": 0.7864, + "num_tokens": 68978550138.0, + "step": 16503 + }, + { + "epoch": 1.9612596553773023, + "grad_norm": 0.2102452481835026, + "learning_rate": 2.0178138612270033e-06, + "loss": 0.8506, + "num_tokens": 68982725726.0, + "step": 16504 + }, + { + "epoch": 1.9613784907902554, + "grad_norm": 0.20459311701010413, + "learning_rate": 2.0177051102247935e-06, + "loss": 0.8124, + "num_tokens": 68986914442.0, + "step": 16505 + }, + { + "epoch": 1.9614973262032085, + "grad_norm": 0.21595214906037946, + "learning_rate": 2.017596691867334e-06, + "loss": 0.8233, + "num_tokens": 68991080156.0, + "step": 16506 + }, + { + "epoch": 1.9616161616161616, + "grad_norm": 0.2107916785108105, + "learning_rate": 2.0174886061586375e-06, + "loss": 0.7974, + "num_tokens": 68995269068.0, + "step": 16507 + }, + { + "epoch": 1.9617349970291147, + "grad_norm": 0.21905662568073403, + "learning_rate": 2.0173808531027098e-06, + "loss": 0.8398, + "num_tokens": 68999453766.0, + "step": 16508 + }, + { + "epoch": 1.9618538324420678, + "grad_norm": 0.21987809617652454, + "learning_rate": 2.01727343270354e-06, + "loss": 0.8159, + "num_tokens": 69003642007.0, + "step": 16509 + }, + { + "epoch": 1.961972667855021, + "grad_norm": 0.21188144463682945, + "learning_rate": 2.017166344965106e-06, + "loss": 0.793, + "num_tokens": 69007830063.0, + "step": 16510 + }, + { + "epoch": 1.962091503267974, + "grad_norm": 0.21800214267132403, + "learning_rate": 2.0170595898913743e-06, + "loss": 0.8458, + "num_tokens": 69012017545.0, + "step": 16511 + }, + { + "epoch": 1.9622103386809269, + "grad_norm": 0.20493446609494545, + "learning_rate": 2.0169531674862977e-06, + "loss": 0.7801, + "num_tokens": 69016199493.0, + "step": 16512 + }, + { + "epoch": 1.96232917409388, + "grad_norm": 0.21795796754039942, + "learning_rate": 2.0168470777538177e-06, + "loss": 0.8249, + "num_tokens": 69020389530.0, + "step": 16513 + }, + { + "epoch": 1.962448009506833, + "grad_norm": 0.21421768793945956, + "learning_rate": 2.0167413206978633e-06, + "loss": 0.8123, + "num_tokens": 69024577679.0, + "step": 16514 + }, + { + "epoch": 1.962566844919786, + "grad_norm": 0.21316797221175482, + "learning_rate": 2.0166358963223507e-06, + "loss": 0.804, + "num_tokens": 69028764945.0, + "step": 16515 + }, + { + "epoch": 1.962685680332739, + "grad_norm": 0.22350494896725098, + "learning_rate": 2.0165308046311844e-06, + "loss": 0.8231, + "num_tokens": 69032952175.0, + "step": 16516 + }, + { + "epoch": 1.9628045157456921, + "grad_norm": 0.2148474703759911, + "learning_rate": 2.016426045628256e-06, + "loss": 0.7964, + "num_tokens": 69037141372.0, + "step": 16517 + }, + { + "epoch": 1.9629233511586452, + "grad_norm": 0.22333205919274612, + "learning_rate": 2.0163216193174453e-06, + "loss": 0.8212, + "num_tokens": 69041330969.0, + "step": 16518 + }, + { + "epoch": 1.9630421865715983, + "grad_norm": 0.22237623877369067, + "learning_rate": 2.016217525702621e-06, + "loss": 0.7829, + "num_tokens": 69045521592.0, + "step": 16519 + }, + { + "epoch": 1.9631610219845514, + "grad_norm": 0.2256981986784939, + "learning_rate": 2.0161137647876344e-06, + "loss": 0.7981, + "num_tokens": 69049709416.0, + "step": 16520 + }, + { + "epoch": 1.9632798573975045, + "grad_norm": 0.22126895197226115, + "learning_rate": 2.016010336576332e-06, + "loss": 0.8231, + "num_tokens": 69053891862.0, + "step": 16521 + }, + { + "epoch": 1.9633986928104576, + "grad_norm": 0.21178050529734238, + "learning_rate": 2.0159072410725424e-06, + "loss": 0.8325, + "num_tokens": 69058080460.0, + "step": 16522 + }, + { + "epoch": 1.9635175282234105, + "grad_norm": 0.21049308821169913, + "learning_rate": 2.015804478280082e-06, + "loss": 0.8224, + "num_tokens": 69062268648.0, + "step": 16523 + }, + { + "epoch": 1.9636363636363636, + "grad_norm": 0.22576105433855564, + "learning_rate": 2.0157020482027596e-06, + "loss": 0.835, + "num_tokens": 69066457965.0, + "step": 16524 + }, + { + "epoch": 1.9637551990493167, + "grad_norm": 0.21155194837455094, + "learning_rate": 2.015599950844366e-06, + "loss": 0.759, + "num_tokens": 69070647154.0, + "step": 16525 + }, + { + "epoch": 1.9638740344622696, + "grad_norm": 0.23075023821941576, + "learning_rate": 2.0154981862086835e-06, + "loss": 0.7947, + "num_tokens": 69074826393.0, + "step": 16526 + }, + { + "epoch": 1.9639928698752227, + "grad_norm": 0.2258480197187313, + "learning_rate": 2.015396754299481e-06, + "loss": 0.8045, + "num_tokens": 69079015818.0, + "step": 16527 + }, + { + "epoch": 1.9641117052881758, + "grad_norm": 0.22034354064729839, + "learning_rate": 2.0152956551205146e-06, + "loss": 0.7856, + "num_tokens": 69083181834.0, + "step": 16528 + }, + { + "epoch": 1.964230540701129, + "grad_norm": 0.20772615784396692, + "learning_rate": 2.0151948886755273e-06, + "loss": 0.7977, + "num_tokens": 69087371580.0, + "step": 16529 + }, + { + "epoch": 1.964349376114082, + "grad_norm": 0.22852786100209743, + "learning_rate": 2.015094454968252e-06, + "loss": 0.7974, + "num_tokens": 69091554240.0, + "step": 16530 + }, + { + "epoch": 1.964468211527035, + "grad_norm": 0.22193659044832903, + "learning_rate": 2.014994354002407e-06, + "loss": 0.8249, + "num_tokens": 69095707358.0, + "step": 16531 + }, + { + "epoch": 1.9645870469399882, + "grad_norm": 0.245487737896829, + "learning_rate": 2.0148945857817006e-06, + "loss": 0.7753, + "num_tokens": 69099896692.0, + "step": 16532 + }, + { + "epoch": 1.9647058823529413, + "grad_norm": 0.22031620813595873, + "learning_rate": 2.0147951503098266e-06, + "loss": 0.852, + "num_tokens": 69104085837.0, + "step": 16533 + }, + { + "epoch": 1.9648247177658944, + "grad_norm": 0.2205308389298561, + "learning_rate": 2.014696047590469e-06, + "loss": 0.8133, + "num_tokens": 69108274243.0, + "step": 16534 + }, + { + "epoch": 1.9649435531788473, + "grad_norm": 0.2130143091587387, + "learning_rate": 2.0145972776272956e-06, + "loss": 0.7721, + "num_tokens": 69112462678.0, + "step": 16535 + }, + { + "epoch": 1.9650623885918004, + "grad_norm": 0.20924571831918568, + "learning_rate": 2.0144988404239662e-06, + "loss": 0.8184, + "num_tokens": 69116652158.0, + "step": 16536 + }, + { + "epoch": 1.9651812240047533, + "grad_norm": 0.23600530137748132, + "learning_rate": 2.014400735984125e-06, + "loss": 0.8157, + "num_tokens": 69120841200.0, + "step": 16537 + }, + { + "epoch": 1.9653000594177064, + "grad_norm": 0.21768716855964895, + "learning_rate": 2.0143029643114067e-06, + "loss": 0.8081, + "num_tokens": 69125029965.0, + "step": 16538 + }, + { + "epoch": 1.9654188948306595, + "grad_norm": 0.2514718766534886, + "learning_rate": 2.0142055254094296e-06, + "loss": 0.8395, + "num_tokens": 69129219173.0, + "step": 16539 + }, + { + "epoch": 1.9655377302436126, + "grad_norm": 0.2510398499322817, + "learning_rate": 2.0141084192818045e-06, + "loss": 0.8157, + "num_tokens": 69133408439.0, + "step": 16540 + }, + { + "epoch": 1.9656565656565657, + "grad_norm": 0.2193230802477537, + "learning_rate": 2.0140116459321258e-06, + "loss": 0.8363, + "num_tokens": 69137596917.0, + "step": 16541 + }, + { + "epoch": 1.9657754010695188, + "grad_norm": 0.21099291015633267, + "learning_rate": 2.0139152053639784e-06, + "loss": 0.7769, + "num_tokens": 69141757619.0, + "step": 16542 + }, + { + "epoch": 1.9658942364824719, + "grad_norm": 0.22834614289597166, + "learning_rate": 2.0138190975809335e-06, + "loss": 0.7831, + "num_tokens": 69145945715.0, + "step": 16543 + }, + { + "epoch": 1.966013071895425, + "grad_norm": 0.21754832365944893, + "learning_rate": 2.013723322586551e-06, + "loss": 0.8074, + "num_tokens": 69150133888.0, + "step": 16544 + }, + { + "epoch": 1.966131907308378, + "grad_norm": 0.22874033208436684, + "learning_rate": 2.0136278803843768e-06, + "loss": 0.7968, + "num_tokens": 69154300829.0, + "step": 16545 + }, + { + "epoch": 1.966250742721331, + "grad_norm": 0.22347384203685963, + "learning_rate": 2.013532770977946e-06, + "loss": 0.7845, + "num_tokens": 69158477238.0, + "step": 16546 + }, + { + "epoch": 1.966369578134284, + "grad_norm": 0.23583427108314983, + "learning_rate": 2.0134379943707807e-06, + "loss": 0.803, + "num_tokens": 69162653755.0, + "step": 16547 + }, + { + "epoch": 1.966488413547237, + "grad_norm": 0.22229433943969035, + "learning_rate": 2.0133435505663907e-06, + "loss": 0.8027, + "num_tokens": 69166843609.0, + "step": 16548 + }, + { + "epoch": 1.96660724896019, + "grad_norm": 0.20996869955201347, + "learning_rate": 2.013249439568273e-06, + "loss": 0.8121, + "num_tokens": 69171031978.0, + "step": 16549 + }, + { + "epoch": 1.9667260843731431, + "grad_norm": 0.23248277802803904, + "learning_rate": 2.0131556613799142e-06, + "loss": 0.7939, + "num_tokens": 69175221219.0, + "step": 16550 + }, + { + "epoch": 1.9668449197860962, + "grad_norm": 0.2233530053276564, + "learning_rate": 2.0130622160047866e-06, + "loss": 0.8127, + "num_tokens": 69179392789.0, + "step": 16551 + }, + { + "epoch": 1.9669637551990493, + "grad_norm": 0.21948733656570935, + "learning_rate": 2.012969103446351e-06, + "loss": 0.7738, + "num_tokens": 69183582175.0, + "step": 16552 + }, + { + "epoch": 1.9670825906120024, + "grad_norm": 0.22396913141538136, + "learning_rate": 2.0128763237080548e-06, + "loss": 0.7784, + "num_tokens": 69187770421.0, + "step": 16553 + }, + { + "epoch": 1.9672014260249555, + "grad_norm": 0.22921160881628663, + "learning_rate": 2.0127838767933344e-06, + "loss": 0.7915, + "num_tokens": 69191960245.0, + "step": 16554 + }, + { + "epoch": 1.9673202614379086, + "grad_norm": 0.22836747177043365, + "learning_rate": 2.012691762705614e-06, + "loss": 0.7809, + "num_tokens": 69196149145.0, + "step": 16555 + }, + { + "epoch": 1.9674390968508617, + "grad_norm": 0.21281664108857054, + "learning_rate": 2.012599981448304e-06, + "loss": 0.7667, + "num_tokens": 69200297464.0, + "step": 16556 + }, + { + "epoch": 1.9675579322638146, + "grad_norm": 0.22833661551754242, + "learning_rate": 2.012508533024804e-06, + "loss": 0.8303, + "num_tokens": 69204485516.0, + "step": 16557 + }, + { + "epoch": 1.9676767676767677, + "grad_norm": 0.22724768609856835, + "learning_rate": 2.0124174174385e-06, + "loss": 0.765, + "num_tokens": 69208643501.0, + "step": 16558 + }, + { + "epoch": 1.9677956030897208, + "grad_norm": 0.2197090005381691, + "learning_rate": 2.012326634692767e-06, + "loss": 0.8246, + "num_tokens": 69212832808.0, + "step": 16559 + }, + { + "epoch": 1.9679144385026737, + "grad_norm": 0.212325731978197, + "learning_rate": 2.012236184790967e-06, + "loss": 0.8218, + "num_tokens": 69217022038.0, + "step": 16560 + }, + { + "epoch": 1.9680332739156268, + "grad_norm": 0.22795848091603185, + "learning_rate": 2.012146067736449e-06, + "loss": 0.8234, + "num_tokens": 69221210667.0, + "step": 16561 + }, + { + "epoch": 1.9681521093285799, + "grad_norm": 0.2354643470039736, + "learning_rate": 2.0120562835325517e-06, + "loss": 0.7837, + "num_tokens": 69225381518.0, + "step": 16562 + }, + { + "epoch": 1.968270944741533, + "grad_norm": 0.21339228384050365, + "learning_rate": 2.011966832182598e-06, + "loss": 0.7907, + "num_tokens": 69229569658.0, + "step": 16563 + }, + { + "epoch": 1.968389780154486, + "grad_norm": 0.2154711699402677, + "learning_rate": 2.011877713689903e-06, + "loss": 0.7753, + "num_tokens": 69233761268.0, + "step": 16564 + }, + { + "epoch": 1.9685086155674392, + "grad_norm": 0.22011043483368453, + "learning_rate": 2.0117889280577647e-06, + "loss": 0.784, + "num_tokens": 69237950022.0, + "step": 16565 + }, + { + "epoch": 1.9686274509803923, + "grad_norm": 0.22064319207019403, + "learning_rate": 2.011700475289473e-06, + "loss": 0.7883, + "num_tokens": 69242104203.0, + "step": 16566 + }, + { + "epoch": 1.9687462863933454, + "grad_norm": 0.2148160070115619, + "learning_rate": 2.0116123553883027e-06, + "loss": 0.8259, + "num_tokens": 69246272510.0, + "step": 16567 + }, + { + "epoch": 1.9688651218062982, + "grad_norm": 0.23737226510077666, + "learning_rate": 2.0115245683575173e-06, + "loss": 0.8091, + "num_tokens": 69250461905.0, + "step": 16568 + }, + { + "epoch": 1.9689839572192513, + "grad_norm": 0.21659911277913219, + "learning_rate": 2.011437114200368e-06, + "loss": 0.7728, + "num_tokens": 69254620543.0, + "step": 16569 + }, + { + "epoch": 1.9691027926322044, + "grad_norm": 0.21911573236162102, + "learning_rate": 2.011349992920093e-06, + "loss": 0.7935, + "num_tokens": 69258810012.0, + "step": 16570 + }, + { + "epoch": 1.9692216280451573, + "grad_norm": 0.21325355317166758, + "learning_rate": 2.0112632045199195e-06, + "loss": 0.827, + "num_tokens": 69262996694.0, + "step": 16571 + }, + { + "epoch": 1.9693404634581104, + "grad_norm": 0.22140552333900707, + "learning_rate": 2.011176749003061e-06, + "loss": 0.8297, + "num_tokens": 69267183551.0, + "step": 16572 + }, + { + "epoch": 1.9694592988710635, + "grad_norm": 0.21419458603337482, + "learning_rate": 2.0110906263727197e-06, + "loss": 0.8038, + "num_tokens": 69271363338.0, + "step": 16573 + }, + { + "epoch": 1.9695781342840166, + "grad_norm": 0.22710628948709574, + "learning_rate": 2.011004836632084e-06, + "loss": 0.8117, + "num_tokens": 69275551354.0, + "step": 16574 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.21500535771578802, + "learning_rate": 2.010919379784332e-06, + "loss": 0.7501, + "num_tokens": 69279742365.0, + "step": 16575 + }, + { + "epoch": 1.9698158051099228, + "grad_norm": 0.21886353483972584, + "learning_rate": 2.0108342558326285e-06, + "loss": 0.7951, + "num_tokens": 69283930926.0, + "step": 16576 + }, + { + "epoch": 1.969934640522876, + "grad_norm": 0.22233540443418262, + "learning_rate": 2.0107494647801253e-06, + "loss": 0.7853, + "num_tokens": 69288120875.0, + "step": 16577 + }, + { + "epoch": 1.970053475935829, + "grad_norm": 0.22125228525084248, + "learning_rate": 2.0106650066299625e-06, + "loss": 0.817, + "num_tokens": 69292283549.0, + "step": 16578 + }, + { + "epoch": 1.970172311348782, + "grad_norm": 0.2290179262767018, + "learning_rate": 2.0105808813852694e-06, + "loss": 0.8192, + "num_tokens": 69296444597.0, + "step": 16579 + }, + { + "epoch": 1.970291146761735, + "grad_norm": 0.21396379476698416, + "learning_rate": 2.010497089049159e-06, + "loss": 0.8088, + "num_tokens": 69300634033.0, + "step": 16580 + }, + { + "epoch": 1.970409982174688, + "grad_norm": 0.2246849115901631, + "learning_rate": 2.010413629624736e-06, + "loss": 0.8367, + "num_tokens": 69304822071.0, + "step": 16581 + }, + { + "epoch": 1.970528817587641, + "grad_norm": 0.2224094475808087, + "learning_rate": 2.0103305031150904e-06, + "loss": 0.7887, + "num_tokens": 69309000676.0, + "step": 16582 + }, + { + "epoch": 1.970647653000594, + "grad_norm": 0.21227602422165465, + "learning_rate": 2.010247709523302e-06, + "loss": 0.7818, + "num_tokens": 69313163360.0, + "step": 16583 + }, + { + "epoch": 1.9707664884135472, + "grad_norm": 0.20523993816386008, + "learning_rate": 2.0101652488524357e-06, + "loss": 0.7642, + "num_tokens": 69317352744.0, + "step": 16584 + }, + { + "epoch": 1.9708853238265003, + "grad_norm": 0.20885964075003166, + "learning_rate": 2.0100831211055455e-06, + "loss": 0.7589, + "num_tokens": 69321542066.0, + "step": 16585 + }, + { + "epoch": 1.9710041592394534, + "grad_norm": 0.2297513875714076, + "learning_rate": 2.010001326285673e-06, + "loss": 0.7896, + "num_tokens": 69325697271.0, + "step": 16586 + }, + { + "epoch": 1.9711229946524065, + "grad_norm": 0.22081476830803623, + "learning_rate": 2.0099198643958465e-06, + "loss": 0.8248, + "num_tokens": 69329886848.0, + "step": 16587 + }, + { + "epoch": 1.9712418300653596, + "grad_norm": 0.22612507156605113, + "learning_rate": 2.0098387354390847e-06, + "loss": 0.8213, + "num_tokens": 69334047081.0, + "step": 16588 + }, + { + "epoch": 1.9713606654783127, + "grad_norm": 0.22488260854187891, + "learning_rate": 2.009757939418391e-06, + "loss": 0.8303, + "num_tokens": 69338220864.0, + "step": 16589 + }, + { + "epoch": 1.9714795008912656, + "grad_norm": 0.2113455679478319, + "learning_rate": 2.009677476336756e-06, + "loss": 0.8246, + "num_tokens": 69342409064.0, + "step": 16590 + }, + { + "epoch": 1.9715983363042187, + "grad_norm": 0.22580909718896688, + "learning_rate": 2.0095973461971627e-06, + "loss": 0.7965, + "num_tokens": 69346594949.0, + "step": 16591 + }, + { + "epoch": 1.9717171717171718, + "grad_norm": 0.20622792749068453, + "learning_rate": 2.0095175490025767e-06, + "loss": 0.8022, + "num_tokens": 69350782756.0, + "step": 16592 + }, + { + "epoch": 1.9718360071301246, + "grad_norm": 0.21091667317360865, + "learning_rate": 2.0094380847559526e-06, + "loss": 0.7855, + "num_tokens": 69354971877.0, + "step": 16593 + }, + { + "epoch": 1.9719548425430777, + "grad_norm": 0.22855880356283304, + "learning_rate": 2.0093589534602348e-06, + "loss": 0.8422, + "num_tokens": 69359161244.0, + "step": 16594 + }, + { + "epoch": 1.9720736779560308, + "grad_norm": 0.22019440491068135, + "learning_rate": 2.009280155118352e-06, + "loss": 0.8172, + "num_tokens": 69363349791.0, + "step": 16595 + }, + { + "epoch": 1.972192513368984, + "grad_norm": 0.20852576525549216, + "learning_rate": 2.009201689733225e-06, + "loss": 0.8283, + "num_tokens": 69367538050.0, + "step": 16596 + }, + { + "epoch": 1.972311348781937, + "grad_norm": 0.22370752087546278, + "learning_rate": 2.0091235573077565e-06, + "loss": 0.8108, + "num_tokens": 69371727601.0, + "step": 16597 + }, + { + "epoch": 1.9724301841948901, + "grad_norm": 0.2217628141131938, + "learning_rate": 2.0090457578448425e-06, + "loss": 0.8195, + "num_tokens": 69375916405.0, + "step": 16598 + }, + { + "epoch": 1.9725490196078432, + "grad_norm": 0.21529760923223895, + "learning_rate": 2.0089682913473627e-06, + "loss": 0.7542, + "num_tokens": 69380097084.0, + "step": 16599 + }, + { + "epoch": 1.9726678550207963, + "grad_norm": 0.21112614590086856, + "learning_rate": 2.0088911578181868e-06, + "loss": 0.7992, + "num_tokens": 69384287071.0, + "step": 16600 + }, + { + "epoch": 1.9727866904337492, + "grad_norm": 0.21996914279529817, + "learning_rate": 2.008814357260171e-06, + "loss": 0.7705, + "num_tokens": 69388475671.0, + "step": 16601 + }, + { + "epoch": 1.9729055258467023, + "grad_norm": 0.21159703911082517, + "learning_rate": 2.008737889676159e-06, + "loss": 0.8106, + "num_tokens": 69392664327.0, + "step": 16602 + }, + { + "epoch": 1.9730243612596554, + "grad_norm": 0.21203237621846974, + "learning_rate": 2.0086617550689834e-06, + "loss": 0.8003, + "num_tokens": 69396830973.0, + "step": 16603 + }, + { + "epoch": 1.9731431966726083, + "grad_norm": 0.23196409185428332, + "learning_rate": 2.0085859534414633e-06, + "loss": 0.7864, + "num_tokens": 69401013296.0, + "step": 16604 + }, + { + "epoch": 1.9732620320855614, + "grad_norm": 0.20426757873176127, + "learning_rate": 2.008510484796407e-06, + "loss": 0.793, + "num_tokens": 69405194163.0, + "step": 16605 + }, + { + "epoch": 1.9733808674985145, + "grad_norm": 0.21877274260315538, + "learning_rate": 2.0084353491366073e-06, + "loss": 0.8041, + "num_tokens": 69409383408.0, + "step": 16606 + }, + { + "epoch": 1.9734997029114676, + "grad_norm": 0.2149811465017925, + "learning_rate": 2.0083605464648482e-06, + "loss": 0.8074, + "num_tokens": 69413556418.0, + "step": 16607 + }, + { + "epoch": 1.9736185383244207, + "grad_norm": 0.21283061417551058, + "learning_rate": 2.0082860767838996e-06, + "loss": 0.7829, + "num_tokens": 69417746703.0, + "step": 16608 + }, + { + "epoch": 1.9737373737373738, + "grad_norm": 0.2167812442246187, + "learning_rate": 2.0082119400965188e-06, + "loss": 0.7729, + "num_tokens": 69421937838.0, + "step": 16609 + }, + { + "epoch": 1.973856209150327, + "grad_norm": 0.2153485505617194, + "learning_rate": 2.0081381364054532e-06, + "loss": 0.8288, + "num_tokens": 69426102817.0, + "step": 16610 + }, + { + "epoch": 1.97397504456328, + "grad_norm": 0.21827952386366054, + "learning_rate": 2.008064665713433e-06, + "loss": 0.7946, + "num_tokens": 69430258132.0, + "step": 16611 + }, + { + "epoch": 1.9740938799762329, + "grad_norm": 0.21050622571430738, + "learning_rate": 2.0079915280231806e-06, + "loss": 0.7729, + "num_tokens": 69434412666.0, + "step": 16612 + }, + { + "epoch": 1.974212715389186, + "grad_norm": 0.2198623226853477, + "learning_rate": 2.0079187233374055e-06, + "loss": 0.7987, + "num_tokens": 69438587965.0, + "step": 16613 + }, + { + "epoch": 1.974331550802139, + "grad_norm": 0.20917545618313776, + "learning_rate": 2.0078462516588034e-06, + "loss": 0.8067, + "num_tokens": 69442767996.0, + "step": 16614 + }, + { + "epoch": 1.974450386215092, + "grad_norm": 0.21225383980753582, + "learning_rate": 2.0077741129900568e-06, + "loss": 0.8116, + "num_tokens": 69446932350.0, + "step": 16615 + }, + { + "epoch": 1.974569221628045, + "grad_norm": 0.231007800076558, + "learning_rate": 2.007702307333839e-06, + "loss": 0.7636, + "num_tokens": 69451121974.0, + "step": 16616 + }, + { + "epoch": 1.9746880570409981, + "grad_norm": 0.22430138034270763, + "learning_rate": 2.007630834692808e-06, + "loss": 0.8116, + "num_tokens": 69455297891.0, + "step": 16617 + }, + { + "epoch": 1.9748068924539512, + "grad_norm": 0.23378505686303017, + "learning_rate": 2.007559695069611e-06, + "loss": 0.8121, + "num_tokens": 69459486975.0, + "step": 16618 + }, + { + "epoch": 1.9749257278669043, + "grad_norm": 0.21780789410707935, + "learning_rate": 2.0074888884668826e-06, + "loss": 0.7851, + "num_tokens": 69463676461.0, + "step": 16619 + }, + { + "epoch": 1.9750445632798574, + "grad_norm": 0.21987555608246478, + "learning_rate": 2.007418414887246e-06, + "loss": 0.818, + "num_tokens": 69467849671.0, + "step": 16620 + }, + { + "epoch": 1.9751633986928105, + "grad_norm": 0.23940492143932168, + "learning_rate": 2.007348274333309e-06, + "loss": 0.8138, + "num_tokens": 69472039106.0, + "step": 16621 + }, + { + "epoch": 1.9752822341057636, + "grad_norm": 0.23095941964172742, + "learning_rate": 2.0072784668076707e-06, + "loss": 0.795, + "num_tokens": 69476228765.0, + "step": 16622 + }, + { + "epoch": 1.9754010695187165, + "grad_norm": 0.2451844632563199, + "learning_rate": 2.0072089923129156e-06, + "loss": 0.767, + "num_tokens": 69480404369.0, + "step": 16623 + }, + { + "epoch": 1.9755199049316696, + "grad_norm": 0.21308722764751406, + "learning_rate": 2.0071398508516172e-06, + "loss": 0.7802, + "num_tokens": 69484595196.0, + "step": 16624 + }, + { + "epoch": 1.9756387403446227, + "grad_norm": 0.21186697192337609, + "learning_rate": 2.007071042426336e-06, + "loss": 0.7839, + "num_tokens": 69488784484.0, + "step": 16625 + }, + { + "epoch": 1.9757575757575756, + "grad_norm": 0.21590639955567664, + "learning_rate": 2.00700256703962e-06, + "loss": 0.7668, + "num_tokens": 69492972869.0, + "step": 16626 + }, + { + "epoch": 1.9758764111705287, + "grad_norm": 0.21187849060095917, + "learning_rate": 2.006934424694004e-06, + "loss": 0.8888, + "num_tokens": 69497155872.0, + "step": 16627 + }, + { + "epoch": 1.9759952465834818, + "grad_norm": 0.20907612913538368, + "learning_rate": 2.0068666153920135e-06, + "loss": 0.7833, + "num_tokens": 69501344914.0, + "step": 16628 + }, + { + "epoch": 1.976114081996435, + "grad_norm": 0.2292110045891196, + "learning_rate": 2.0067991391361584e-06, + "loss": 0.8085, + "num_tokens": 69505533607.0, + "step": 16629 + }, + { + "epoch": 1.976232917409388, + "grad_norm": 0.21654907286844657, + "learning_rate": 2.0067319959289375e-06, + "loss": 0.7653, + "num_tokens": 69509694259.0, + "step": 16630 + }, + { + "epoch": 1.976351752822341, + "grad_norm": 0.21118097884101406, + "learning_rate": 2.006665185772838e-06, + "loss": 0.808, + "num_tokens": 69513880220.0, + "step": 16631 + }, + { + "epoch": 1.9764705882352942, + "grad_norm": 0.22582267916773685, + "learning_rate": 2.006598708670334e-06, + "loss": 0.8045, + "num_tokens": 69518069941.0, + "step": 16632 + }, + { + "epoch": 1.9765894236482473, + "grad_norm": 0.21956965982017154, + "learning_rate": 2.0065325646238874e-06, + "loss": 0.8137, + "num_tokens": 69522258308.0, + "step": 16633 + }, + { + "epoch": 1.9767082590612004, + "grad_norm": 0.21260252233545723, + "learning_rate": 2.0064667536359474e-06, + "loss": 0.7593, + "num_tokens": 69526446762.0, + "step": 16634 + }, + { + "epoch": 1.9768270944741533, + "grad_norm": 0.2241940464102941, + "learning_rate": 2.006401275708952e-06, + "loss": 0.8478, + "num_tokens": 69530593237.0, + "step": 16635 + }, + { + "epoch": 1.9769459298871064, + "grad_norm": 0.21039262125163274, + "learning_rate": 2.0063361308453242e-06, + "loss": 0.8194, + "num_tokens": 69534782681.0, + "step": 16636 + }, + { + "epoch": 1.9770647653000593, + "grad_norm": 0.21078336120645114, + "learning_rate": 2.0062713190474787e-06, + "loss": 0.8351, + "num_tokens": 69538972020.0, + "step": 16637 + }, + { + "epoch": 1.9771836007130124, + "grad_norm": 0.23799142765099643, + "learning_rate": 2.006206840317814e-06, + "loss": 0.7971, + "num_tokens": 69543160854.0, + "step": 16638 + }, + { + "epoch": 1.9773024361259655, + "grad_norm": 0.20956097339334487, + "learning_rate": 2.0061426946587187e-06, + "loss": 0.8468, + "num_tokens": 69547349470.0, + "step": 16639 + }, + { + "epoch": 1.9774212715389186, + "grad_norm": 0.22032548313099595, + "learning_rate": 2.0060788820725695e-06, + "loss": 0.773, + "num_tokens": 69551506837.0, + "step": 16640 + }, + { + "epoch": 1.9775401069518717, + "grad_norm": 0.22153265838082875, + "learning_rate": 2.006015402561728e-06, + "loss": 0.8138, + "num_tokens": 69555695475.0, + "step": 16641 + }, + { + "epoch": 1.9776589423648248, + "grad_norm": 0.22723146335185468, + "learning_rate": 2.0059522561285453e-06, + "loss": 0.7894, + "num_tokens": 69559884236.0, + "step": 16642 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.22581771135274495, + "learning_rate": 2.0058894427753593e-06, + "loss": 0.826, + "num_tokens": 69564049604.0, + "step": 16643 + }, + { + "epoch": 1.977896613190731, + "grad_norm": 0.2198848664290604, + "learning_rate": 2.0058269625044984e-06, + "loss": 0.8201, + "num_tokens": 69568213613.0, + "step": 16644 + }, + { + "epoch": 1.978015448603684, + "grad_norm": 0.24281447069411102, + "learning_rate": 2.005764815318274e-06, + "loss": 0.8191, + "num_tokens": 69572377306.0, + "step": 16645 + }, + { + "epoch": 1.978134284016637, + "grad_norm": 0.23460745795568835, + "learning_rate": 2.0057030012189897e-06, + "loss": 0.7971, + "num_tokens": 69576566831.0, + "step": 16646 + }, + { + "epoch": 1.97825311942959, + "grad_norm": 0.23153552491354362, + "learning_rate": 2.005641520208933e-06, + "loss": 0.8191, + "num_tokens": 69580746187.0, + "step": 16647 + }, + { + "epoch": 1.978371954842543, + "grad_norm": 0.23660284171120624, + "learning_rate": 2.0055803722903822e-06, + "loss": 0.8064, + "num_tokens": 69584910775.0, + "step": 16648 + }, + { + "epoch": 1.978490790255496, + "grad_norm": 0.22667572648911463, + "learning_rate": 2.0055195574656003e-06, + "loss": 0.8472, + "num_tokens": 69589085120.0, + "step": 16649 + }, + { + "epoch": 1.9786096256684491, + "grad_norm": 0.2341577695532328, + "learning_rate": 2.005459075736841e-06, + "loss": 0.7819, + "num_tokens": 69593275167.0, + "step": 16650 + }, + { + "epoch": 1.9787284610814022, + "grad_norm": 0.2313374507718182, + "learning_rate": 2.0053989271063428e-06, + "loss": 0.7977, + "num_tokens": 69597464574.0, + "step": 16651 + }, + { + "epoch": 1.9788472964943553, + "grad_norm": 0.21703531326951625, + "learning_rate": 2.0053391115763337e-06, + "loss": 0.8469, + "num_tokens": 69601653694.0, + "step": 16652 + }, + { + "epoch": 1.9789661319073084, + "grad_norm": 0.2225791699523318, + "learning_rate": 2.00527962914903e-06, + "loss": 0.8068, + "num_tokens": 69605842245.0, + "step": 16653 + }, + { + "epoch": 1.9790849673202615, + "grad_norm": 0.20854817209666926, + "learning_rate": 2.0052204798266327e-06, + "loss": 0.7951, + "num_tokens": 69610000913.0, + "step": 16654 + }, + { + "epoch": 1.9792038027332146, + "grad_norm": 0.21368043833185926, + "learning_rate": 2.0051616636113327e-06, + "loss": 0.7471, + "num_tokens": 69614158334.0, + "step": 16655 + }, + { + "epoch": 1.9793226381461677, + "grad_norm": 0.22033164778390205, + "learning_rate": 2.005103180505309e-06, + "loss": 0.7947, + "num_tokens": 69618347786.0, + "step": 16656 + }, + { + "epoch": 1.9794414735591206, + "grad_norm": 0.21599398306675024, + "learning_rate": 2.0050450305107274e-06, + "loss": 0.7643, + "num_tokens": 69622535877.0, + "step": 16657 + }, + { + "epoch": 1.9795603089720737, + "grad_norm": 0.20636986484554362, + "learning_rate": 2.0049872136297406e-06, + "loss": 0.8007, + "num_tokens": 69626724644.0, + "step": 16658 + }, + { + "epoch": 1.9796791443850268, + "grad_norm": 0.21191487648580784, + "learning_rate": 2.0049297298644897e-06, + "loss": 0.8706, + "num_tokens": 69630853379.0, + "step": 16659 + }, + { + "epoch": 1.9797979797979797, + "grad_norm": 0.22116488837042483, + "learning_rate": 2.004872579217105e-06, + "loss": 0.8123, + "num_tokens": 69635015142.0, + "step": 16660 + }, + { + "epoch": 1.9799168152109328, + "grad_norm": 0.21799278818109388, + "learning_rate": 2.0048157616897014e-06, + "loss": 0.8014, + "num_tokens": 69639170783.0, + "step": 16661 + }, + { + "epoch": 1.9800356506238859, + "grad_norm": 0.23246044285932155, + "learning_rate": 2.004759277284383e-06, + "loss": 0.8113, + "num_tokens": 69643336883.0, + "step": 16662 + }, + { + "epoch": 1.980154486036839, + "grad_norm": 0.21547374735745325, + "learning_rate": 2.004703126003242e-06, + "loss": 0.826, + "num_tokens": 69647478653.0, + "step": 16663 + }, + { + "epoch": 1.980273321449792, + "grad_norm": 0.21715841331699087, + "learning_rate": 2.004647307848359e-06, + "loss": 0.7914, + "num_tokens": 69651649199.0, + "step": 16664 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 0.2204235493429083, + "learning_rate": 2.0045918228218007e-06, + "loss": 0.7794, + "num_tokens": 69655837935.0, + "step": 16665 + }, + { + "epoch": 1.9805109922756983, + "grad_norm": 0.227569530905316, + "learning_rate": 2.004536670925621e-06, + "loss": 0.7985, + "num_tokens": 69660020431.0, + "step": 16666 + }, + { + "epoch": 1.9806298276886514, + "grad_norm": 0.21855591198288582, + "learning_rate": 2.0044818521618615e-06, + "loss": 0.8294, + "num_tokens": 69664208075.0, + "step": 16667 + }, + { + "epoch": 1.9807486631016042, + "grad_norm": 0.22417223346608522, + "learning_rate": 2.0044273665325552e-06, + "loss": 0.8124, + "num_tokens": 69668372891.0, + "step": 16668 + }, + { + "epoch": 1.9808674985145573, + "grad_norm": 0.22104421515812192, + "learning_rate": 2.0043732140397172e-06, + "loss": 0.8177, + "num_tokens": 69672519140.0, + "step": 16669 + }, + { + "epoch": 1.9809863339275104, + "grad_norm": 0.2231951685522216, + "learning_rate": 2.004319394685354e-06, + "loss": 0.8145, + "num_tokens": 69676679918.0, + "step": 16670 + }, + { + "epoch": 1.9811051693404633, + "grad_norm": 0.22696906289976873, + "learning_rate": 2.0042659084714594e-06, + "loss": 0.7948, + "num_tokens": 69680860019.0, + "step": 16671 + }, + { + "epoch": 1.9812240047534164, + "grad_norm": 0.21153985971933498, + "learning_rate": 2.004212755400014e-06, + "loss": 0.7864, + "num_tokens": 69685031720.0, + "step": 16672 + }, + { + "epoch": 1.9813428401663695, + "grad_norm": 0.2288241572272946, + "learning_rate": 2.0041599354729847e-06, + "loss": 0.8219, + "num_tokens": 69689221784.0, + "step": 16673 + }, + { + "epoch": 1.9814616755793226, + "grad_norm": 0.21726574687624028, + "learning_rate": 2.004107448692329e-06, + "loss": 0.8249, + "num_tokens": 69693404997.0, + "step": 16674 + }, + { + "epoch": 1.9815805109922757, + "grad_norm": 0.230468309388228, + "learning_rate": 2.0040552950599904e-06, + "loss": 0.8089, + "num_tokens": 69697594688.0, + "step": 16675 + }, + { + "epoch": 1.9816993464052288, + "grad_norm": 0.2367086458557461, + "learning_rate": 2.0040034745779e-06, + "loss": 0.8086, + "num_tokens": 69701775541.0, + "step": 16676 + }, + { + "epoch": 1.981818181818182, + "grad_norm": 0.21230951431893075, + "learning_rate": 2.0039519872479763e-06, + "loss": 0.8019, + "num_tokens": 69705965094.0, + "step": 16677 + }, + { + "epoch": 1.981937017231135, + "grad_norm": 0.23596244837572233, + "learning_rate": 2.003900833072128e-06, + "loss": 0.8136, + "num_tokens": 69710155750.0, + "step": 16678 + }, + { + "epoch": 1.982055852644088, + "grad_norm": 0.21093369271454937, + "learning_rate": 2.0038500120522487e-06, + "loss": 0.7773, + "num_tokens": 69714345434.0, + "step": 16679 + }, + { + "epoch": 1.982174688057041, + "grad_norm": 0.21135985711373684, + "learning_rate": 2.0037995241902196e-06, + "loss": 0.8096, + "num_tokens": 69718534552.0, + "step": 16680 + }, + { + "epoch": 1.982293523469994, + "grad_norm": 0.21735934353354305, + "learning_rate": 2.0037493694879103e-06, + "loss": 0.8137, + "num_tokens": 69722710846.0, + "step": 16681 + }, + { + "epoch": 1.982412358882947, + "grad_norm": 0.22816923785143803, + "learning_rate": 2.0036995479471803e-06, + "loss": 0.7918, + "num_tokens": 69726861686.0, + "step": 16682 + }, + { + "epoch": 1.9825311942959, + "grad_norm": 0.22344012521466036, + "learning_rate": 2.003650059569872e-06, + "loss": 0.7856, + "num_tokens": 69731051379.0, + "step": 16683 + }, + { + "epoch": 1.9826500297088532, + "grad_norm": 0.20423354482463071, + "learning_rate": 2.0036009043578207e-06, + "loss": 0.8133, + "num_tokens": 69735239386.0, + "step": 16684 + }, + { + "epoch": 1.9827688651218063, + "grad_norm": 0.21908582535742158, + "learning_rate": 2.0035520823128438e-06, + "loss": 0.8193, + "num_tokens": 69739422527.0, + "step": 16685 + }, + { + "epoch": 1.9828877005347594, + "grad_norm": 0.2100732668293261, + "learning_rate": 2.0035035934367524e-06, + "loss": 0.8086, + "num_tokens": 69743612818.0, + "step": 16686 + }, + { + "epoch": 1.9830065359477125, + "grad_norm": 0.20665992760295415, + "learning_rate": 2.00345543773134e-06, + "loss": 0.7915, + "num_tokens": 69747796671.0, + "step": 16687 + }, + { + "epoch": 1.9831253713606656, + "grad_norm": 0.21868228760241298, + "learning_rate": 2.0034076151983916e-06, + "loss": 0.8242, + "num_tokens": 69751954975.0, + "step": 16688 + }, + { + "epoch": 1.9832442067736187, + "grad_norm": 0.21108964963056737, + "learning_rate": 2.003360125839676e-06, + "loss": 0.8141, + "num_tokens": 69756123296.0, + "step": 16689 + }, + { + "epoch": 1.9833630421865716, + "grad_norm": 0.1993987940894611, + "learning_rate": 2.0033129696569544e-06, + "loss": 0.7648, + "num_tokens": 69760312557.0, + "step": 16690 + }, + { + "epoch": 1.9834818775995247, + "grad_norm": 0.21280356175260506, + "learning_rate": 2.003266146651972e-06, + "loss": 0.8175, + "num_tokens": 69764503675.0, + "step": 16691 + }, + { + "epoch": 1.9836007130124778, + "grad_norm": 0.21814101487238813, + "learning_rate": 2.0032196568264626e-06, + "loss": 0.8257, + "num_tokens": 69768692214.0, + "step": 16692 + }, + { + "epoch": 1.9837195484254306, + "grad_norm": 0.2205620703172781, + "learning_rate": 2.0031735001821482e-06, + "loss": 0.8, + "num_tokens": 69772849309.0, + "step": 16693 + }, + { + "epoch": 1.9838383838383837, + "grad_norm": 0.20969041954579967, + "learning_rate": 2.003127676720739e-06, + "loss": 0.8102, + "num_tokens": 69777028917.0, + "step": 16694 + }, + { + "epoch": 1.9839572192513368, + "grad_norm": 0.2129330697029505, + "learning_rate": 2.0030821864439294e-06, + "loss": 0.8057, + "num_tokens": 69781201830.0, + "step": 16695 + }, + { + "epoch": 1.98407605466429, + "grad_norm": 0.21435111044563204, + "learning_rate": 2.0030370293534064e-06, + "loss": 0.791, + "num_tokens": 69785391540.0, + "step": 16696 + }, + { + "epoch": 1.984194890077243, + "grad_norm": 0.21127092234865127, + "learning_rate": 2.002992205450842e-06, + "loss": 0.8069, + "num_tokens": 69789582098.0, + "step": 16697 + }, + { + "epoch": 1.9843137254901961, + "grad_norm": 0.2125808099844043, + "learning_rate": 2.002947714737896e-06, + "loss": 0.7828, + "num_tokens": 69793771229.0, + "step": 16698 + }, + { + "epoch": 1.9844325609031492, + "grad_norm": 0.22567067703212698, + "learning_rate": 2.002903557216215e-06, + "loss": 0.8139, + "num_tokens": 69797945231.0, + "step": 16699 + }, + { + "epoch": 1.9845513963161023, + "grad_norm": 0.21362059873300046, + "learning_rate": 2.002859732887436e-06, + "loss": 0.7727, + "num_tokens": 69802106372.0, + "step": 16700 + }, + { + "epoch": 1.9846702317290552, + "grad_norm": 0.21890504977104547, + "learning_rate": 2.002816241753181e-06, + "loss": 0.8075, + "num_tokens": 69806271914.0, + "step": 16701 + }, + { + "epoch": 1.9847890671420083, + "grad_norm": 0.22635184138819536, + "learning_rate": 2.002773083815061e-06, + "loss": 0.8183, + "num_tokens": 69810453484.0, + "step": 16702 + }, + { + "epoch": 1.9849079025549614, + "grad_norm": 0.21819221728136193, + "learning_rate": 2.002730259074673e-06, + "loss": 0.8171, + "num_tokens": 69814594998.0, + "step": 16703 + }, + { + "epoch": 1.9850267379679143, + "grad_norm": 0.21109092002166197, + "learning_rate": 2.002687767533604e-06, + "loss": 0.8091, + "num_tokens": 69818782465.0, + "step": 16704 + }, + { + "epoch": 1.9851455733808674, + "grad_norm": 0.21420944660977334, + "learning_rate": 2.002645609193429e-06, + "loss": 0.7889, + "num_tokens": 69822956569.0, + "step": 16705 + }, + { + "epoch": 1.9852644087938205, + "grad_norm": 0.21461376064979887, + "learning_rate": 2.002603784055707e-06, + "loss": 0.8091, + "num_tokens": 69827144644.0, + "step": 16706 + }, + { + "epoch": 1.9853832442067736, + "grad_norm": 0.2119137479788753, + "learning_rate": 2.0025622921219885e-06, + "loss": 0.808, + "num_tokens": 69831333443.0, + "step": 16707 + }, + { + "epoch": 1.9855020796197267, + "grad_norm": 0.21665433893974156, + "learning_rate": 2.0025211333938087e-06, + "loss": 0.8239, + "num_tokens": 69835521913.0, + "step": 16708 + }, + { + "epoch": 1.9856209150326798, + "grad_norm": 0.215393670827768, + "learning_rate": 2.002480307872692e-06, + "loss": 0.8087, + "num_tokens": 69839710327.0, + "step": 16709 + }, + { + "epoch": 1.985739750445633, + "grad_norm": 0.21905630534925222, + "learning_rate": 2.002439815560152e-06, + "loss": 0.809, + "num_tokens": 69843897090.0, + "step": 16710 + }, + { + "epoch": 1.985858585858586, + "grad_norm": 0.2108656349370262, + "learning_rate": 2.0023996564576863e-06, + "loss": 0.7962, + "num_tokens": 69848084069.0, + "step": 16711 + }, + { + "epoch": 1.9859774212715389, + "grad_norm": 0.2231569667766039, + "learning_rate": 2.002359830566783e-06, + "loss": 0.8249, + "num_tokens": 69852271412.0, + "step": 16712 + }, + { + "epoch": 1.986096256684492, + "grad_norm": 0.2403848160998012, + "learning_rate": 2.0023203378889173e-06, + "loss": 0.8182, + "num_tokens": 69856460190.0, + "step": 16713 + }, + { + "epoch": 1.986215092097445, + "grad_norm": 0.20412479401759656, + "learning_rate": 2.0022811784255515e-06, + "loss": 0.8005, + "num_tokens": 69860593663.0, + "step": 16714 + }, + { + "epoch": 1.986333927510398, + "grad_norm": 0.2314308654727534, + "learning_rate": 2.0022423521781352e-06, + "loss": 0.8193, + "num_tokens": 69864783627.0, + "step": 16715 + }, + { + "epoch": 1.986452762923351, + "grad_norm": 0.21548948211050173, + "learning_rate": 2.002203859148107e-06, + "loss": 0.8301, + "num_tokens": 69868973405.0, + "step": 16716 + }, + { + "epoch": 1.9865715983363041, + "grad_norm": 0.20170199971120842, + "learning_rate": 2.002165699336892e-06, + "loss": 0.8218, + "num_tokens": 69873163868.0, + "step": 16717 + }, + { + "epoch": 1.9866904337492572, + "grad_norm": 0.21089821211402823, + "learning_rate": 2.0021278727459033e-06, + "loss": 0.8157, + "num_tokens": 69877307080.0, + "step": 16718 + }, + { + "epoch": 1.9868092691622103, + "grad_norm": 0.21796865937066207, + "learning_rate": 2.0020903793765424e-06, + "loss": 0.813, + "num_tokens": 69881483540.0, + "step": 16719 + }, + { + "epoch": 1.9869281045751634, + "grad_norm": 0.21888755666706633, + "learning_rate": 2.0020532192301974e-06, + "loss": 0.7949, + "num_tokens": 69885649207.0, + "step": 16720 + }, + { + "epoch": 1.9870469399881165, + "grad_norm": 0.21638846832605124, + "learning_rate": 2.002016392308244e-06, + "loss": 0.8059, + "num_tokens": 69889838606.0, + "step": 16721 + }, + { + "epoch": 1.9871657754010696, + "grad_norm": 0.2197451328558999, + "learning_rate": 2.001979898612047e-06, + "loss": 0.8177, + "num_tokens": 69894026917.0, + "step": 16722 + }, + { + "epoch": 1.9872846108140227, + "grad_norm": 0.24222248078978548, + "learning_rate": 2.001943738142957e-06, + "loss": 0.8192, + "num_tokens": 69898198493.0, + "step": 16723 + }, + { + "epoch": 1.9874034462269756, + "grad_norm": 0.21112233461449303, + "learning_rate": 2.001907910902314e-06, + "loss": 0.8045, + "num_tokens": 69902388620.0, + "step": 16724 + }, + { + "epoch": 1.9875222816399287, + "grad_norm": 0.21785229589915442, + "learning_rate": 2.0018724168914443e-06, + "loss": 0.7909, + "num_tokens": 69906576402.0, + "step": 16725 + }, + { + "epoch": 1.9876411170528816, + "grad_norm": 0.2242349785105731, + "learning_rate": 2.0018372561116615e-06, + "loss": 0.8304, + "num_tokens": 69910763777.0, + "step": 16726 + }, + { + "epoch": 1.9877599524658347, + "grad_norm": 0.22181093283798473, + "learning_rate": 2.00180242856427e-06, + "loss": 0.8153, + "num_tokens": 69914954177.0, + "step": 16727 + }, + { + "epoch": 1.9878787878787878, + "grad_norm": 0.23055204813978858, + "learning_rate": 2.0017679342505574e-06, + "loss": 0.7966, + "num_tokens": 69919141659.0, + "step": 16728 + }, + { + "epoch": 1.987997623291741, + "grad_norm": 0.2124769999506605, + "learning_rate": 2.001733773171802e-06, + "loss": 0.7857, + "num_tokens": 69923331421.0, + "step": 16729 + }, + { + "epoch": 1.988116458704694, + "grad_norm": 0.22336526371208798, + "learning_rate": 2.0016999453292683e-06, + "loss": 0.8132, + "num_tokens": 69927520765.0, + "step": 16730 + }, + { + "epoch": 1.988235294117647, + "grad_norm": 0.22884503816361174, + "learning_rate": 2.001666450724211e-06, + "loss": 0.8308, + "num_tokens": 69931710914.0, + "step": 16731 + }, + { + "epoch": 1.9883541295306002, + "grad_norm": 0.22058439578141098, + "learning_rate": 2.001633289357868e-06, + "loss": 0.7608, + "num_tokens": 69935888979.0, + "step": 16732 + }, + { + "epoch": 1.9884729649435533, + "grad_norm": 0.26374759910504897, + "learning_rate": 2.0016004612314686e-06, + "loss": 0.8, + "num_tokens": 69940076701.0, + "step": 16733 + }, + { + "epoch": 1.9885918003565064, + "grad_norm": 0.22341221098252514, + "learning_rate": 2.001567966346229e-06, + "loss": 0.8069, + "num_tokens": 69944264310.0, + "step": 16734 + }, + { + "epoch": 1.9887106357694593, + "grad_norm": 0.21378651426809292, + "learning_rate": 2.0015358047033513e-06, + "loss": 0.8088, + "num_tokens": 69948435370.0, + "step": 16735 + }, + { + "epoch": 1.9888294711824124, + "grad_norm": 0.21538905022845156, + "learning_rate": 2.001503976304028e-06, + "loss": 0.8004, + "num_tokens": 69952624413.0, + "step": 16736 + }, + { + "epoch": 1.9889483065953653, + "grad_norm": 0.21773244744592687, + "learning_rate": 2.0014724811494368e-06, + "loss": 0.8387, + "num_tokens": 69956812567.0, + "step": 16737 + }, + { + "epoch": 1.9890671420083184, + "grad_norm": 0.21773536411890593, + "learning_rate": 2.0014413192407446e-06, + "loss": 0.8134, + "num_tokens": 69960982601.0, + "step": 16738 + }, + { + "epoch": 1.9891859774212715, + "grad_norm": 0.2146126365345186, + "learning_rate": 2.001410490579104e-06, + "loss": 0.8176, + "num_tokens": 69965167774.0, + "step": 16739 + }, + { + "epoch": 1.9893048128342246, + "grad_norm": 0.21812462043748693, + "learning_rate": 2.0013799951656593e-06, + "loss": 0.7607, + "num_tokens": 69969356851.0, + "step": 16740 + }, + { + "epoch": 1.9894236482471777, + "grad_norm": 0.219810985552296, + "learning_rate": 2.001349833001538e-06, + "loss": 0.8022, + "num_tokens": 69973519947.0, + "step": 16741 + }, + { + "epoch": 1.9895424836601308, + "grad_norm": 0.21457510411151334, + "learning_rate": 2.0013200040878567e-06, + "loss": 0.7977, + "num_tokens": 69977708569.0, + "step": 16742 + }, + { + "epoch": 1.9896613190730839, + "grad_norm": 0.21322451409783688, + "learning_rate": 2.0012905084257222e-06, + "loss": 0.8097, + "num_tokens": 69981895219.0, + "step": 16743 + }, + { + "epoch": 1.989780154486037, + "grad_norm": 0.21510983975414694, + "learning_rate": 2.001261346016225e-06, + "loss": 0.8097, + "num_tokens": 69986067089.0, + "step": 16744 + }, + { + "epoch": 1.98989898989899, + "grad_norm": 0.22471433360543938, + "learning_rate": 2.0012325168604455e-06, + "loss": 0.8012, + "num_tokens": 69990246068.0, + "step": 16745 + }, + { + "epoch": 1.990017825311943, + "grad_norm": 0.20995254109826472, + "learning_rate": 2.0012040209594518e-06, + "loss": 0.8219, + "num_tokens": 69994415673.0, + "step": 16746 + }, + { + "epoch": 1.990136660724896, + "grad_norm": 0.2067471516723269, + "learning_rate": 2.0011758583142987e-06, + "loss": 0.7758, + "num_tokens": 69998604151.0, + "step": 16747 + }, + { + "epoch": 1.9902554961378491, + "grad_norm": 0.22169821369251327, + "learning_rate": 2.001148028926029e-06, + "loss": 0.8154, + "num_tokens": 70002787286.0, + "step": 16748 + }, + { + "epoch": 1.990374331550802, + "grad_norm": 0.21857454012113545, + "learning_rate": 2.001120532795674e-06, + "loss": 0.8163, + "num_tokens": 70006976957.0, + "step": 16749 + }, + { + "epoch": 1.9904931669637551, + "grad_norm": 0.22533570083570834, + "learning_rate": 2.001093369924251e-06, + "loss": 0.7875, + "num_tokens": 70011149249.0, + "step": 16750 + }, + { + "epoch": 1.9906120023767082, + "grad_norm": 0.22145448428961112, + "learning_rate": 2.0010665403127677e-06, + "loss": 0.8201, + "num_tokens": 70015320391.0, + "step": 16751 + }, + { + "epoch": 1.9907308377896613, + "grad_norm": 0.21023923964777746, + "learning_rate": 2.001040043962215e-06, + "loss": 0.7984, + "num_tokens": 70019480872.0, + "step": 16752 + }, + { + "epoch": 1.9908496732026144, + "grad_norm": 0.22506841911487607, + "learning_rate": 2.0010138808735764e-06, + "loss": 0.8123, + "num_tokens": 70023649172.0, + "step": 16753 + }, + { + "epoch": 1.9909685086155675, + "grad_norm": 0.22297181459438548, + "learning_rate": 2.000988051047821e-06, + "loss": 0.8001, + "num_tokens": 70027839328.0, + "step": 16754 + }, + { + "epoch": 1.9910873440285206, + "grad_norm": 0.28761536976248137, + "learning_rate": 2.0009625544859033e-06, + "loss": 0.8149, + "num_tokens": 70032027862.0, + "step": 16755 + }, + { + "epoch": 1.9912061794414737, + "grad_norm": 0.24613185823110456, + "learning_rate": 2.0009373911887693e-06, + "loss": 0.7944, + "num_tokens": 70036217283.0, + "step": 16756 + }, + { + "epoch": 1.9913250148544266, + "grad_norm": 0.21077362347826084, + "learning_rate": 2.0009125611573494e-06, + "loss": 0.8361, + "num_tokens": 70040396913.0, + "step": 16757 + }, + { + "epoch": 1.9914438502673797, + "grad_norm": 0.2179310628713989, + "learning_rate": 2.0008880643925647e-06, + "loss": 0.7795, + "num_tokens": 70044584963.0, + "step": 16758 + }, + { + "epoch": 1.9915626856803328, + "grad_norm": 0.20954847009604438, + "learning_rate": 2.000863900895322e-06, + "loss": 0.7881, + "num_tokens": 70048774729.0, + "step": 16759 + }, + { + "epoch": 1.9916815210932857, + "grad_norm": 0.2169801712223982, + "learning_rate": 2.0008400706665152e-06, + "loss": 0.8026, + "num_tokens": 70052963598.0, + "step": 16760 + }, + { + "epoch": 1.9918003565062388, + "grad_norm": 0.22083467322962005, + "learning_rate": 2.000816573707027e-06, + "loss": 0.8171, + "num_tokens": 70057150599.0, + "step": 16761 + }, + { + "epoch": 1.9919191919191919, + "grad_norm": 0.2141638941490202, + "learning_rate": 2.0007934100177297e-06, + "loss": 0.83, + "num_tokens": 70061334122.0, + "step": 16762 + }, + { + "epoch": 1.992038027332145, + "grad_norm": 0.21519161549759175, + "learning_rate": 2.000770579599478e-06, + "loss": 0.8117, + "num_tokens": 70065523372.0, + "step": 16763 + }, + { + "epoch": 1.992156862745098, + "grad_norm": 0.2175926766254424, + "learning_rate": 2.00074808245312e-06, + "loss": 0.8043, + "num_tokens": 70069701702.0, + "step": 16764 + }, + { + "epoch": 1.9922756981580512, + "grad_norm": 0.2097421500507155, + "learning_rate": 2.0007259185794865e-06, + "loss": 0.8057, + "num_tokens": 70073890052.0, + "step": 16765 + }, + { + "epoch": 1.9923945335710043, + "grad_norm": 0.21345277613812497, + "learning_rate": 2.0007040879794e-06, + "loss": 0.8202, + "num_tokens": 70078078517.0, + "step": 16766 + }, + { + "epoch": 1.9925133689839574, + "grad_norm": 0.218729347592178, + "learning_rate": 2.0006825906536684e-06, + "loss": 0.8335, + "num_tokens": 70082199502.0, + "step": 16767 + }, + { + "epoch": 1.9926322043969102, + "grad_norm": 0.20821286855225604, + "learning_rate": 2.000661426603088e-06, + "loss": 0.7998, + "num_tokens": 70086369592.0, + "step": 16768 + }, + { + "epoch": 1.9927510398098633, + "grad_norm": 0.21778256409149568, + "learning_rate": 2.0006405958284418e-06, + "loss": 0.7972, + "num_tokens": 70090518647.0, + "step": 16769 + }, + { + "epoch": 1.9928698752228164, + "grad_norm": 0.22252100702602415, + "learning_rate": 2.000620098330503e-06, + "loss": 0.7908, + "num_tokens": 70094706677.0, + "step": 16770 + }, + { + "epoch": 1.9929887106357693, + "grad_norm": 0.2136205032368395, + "learning_rate": 2.0005999341100295e-06, + "loss": 0.8183, + "num_tokens": 70098896000.0, + "step": 16771 + }, + { + "epoch": 1.9931075460487224, + "grad_norm": 0.2248690942653033, + "learning_rate": 2.0005801031677675e-06, + "loss": 0.797, + "num_tokens": 70103055244.0, + "step": 16772 + }, + { + "epoch": 1.9932263814616755, + "grad_norm": 0.2244619213281392, + "learning_rate": 2.0005606055044523e-06, + "loss": 0.7672, + "num_tokens": 70107243521.0, + "step": 16773 + }, + { + "epoch": 1.9933452168746286, + "grad_norm": 0.22237363632907373, + "learning_rate": 2.0005414411208056e-06, + "loss": 0.8268, + "num_tokens": 70111432164.0, + "step": 16774 + }, + { + "epoch": 1.9934640522875817, + "grad_norm": 0.26674057386259203, + "learning_rate": 2.000522610017538e-06, + "loss": 0.7926, + "num_tokens": 70115608967.0, + "step": 16775 + }, + { + "epoch": 1.9935828877005348, + "grad_norm": 0.21808249577651642, + "learning_rate": 2.0005041121953447e-06, + "loss": 0.8288, + "num_tokens": 70119796002.0, + "step": 16776 + }, + { + "epoch": 1.993701723113488, + "grad_norm": 0.2222361936095027, + "learning_rate": 2.000485947654913e-06, + "loss": 0.7867, + "num_tokens": 70123960909.0, + "step": 16777 + }, + { + "epoch": 1.993820558526441, + "grad_norm": 0.23354616294493696, + "learning_rate": 2.0004681163969147e-06, + "loss": 0.8483, + "num_tokens": 70128121258.0, + "step": 16778 + }, + { + "epoch": 1.993939393939394, + "grad_norm": 0.22272573094379938, + "learning_rate": 2.00045061842201e-06, + "loss": 0.8264, + "num_tokens": 70132310766.0, + "step": 16779 + }, + { + "epoch": 1.994058229352347, + "grad_norm": 0.2281171711903412, + "learning_rate": 2.000433453730847e-06, + "loss": 0.7858, + "num_tokens": 70136448202.0, + "step": 16780 + }, + { + "epoch": 1.9941770647653, + "grad_norm": 0.22315119919327508, + "learning_rate": 2.000416622324062e-06, + "loss": 0.766, + "num_tokens": 70140623206.0, + "step": 16781 + }, + { + "epoch": 1.994295900178253, + "grad_norm": 0.22013205225055205, + "learning_rate": 2.0004001242022774e-06, + "loss": 0.7842, + "num_tokens": 70144814254.0, + "step": 16782 + }, + { + "epoch": 1.994414735591206, + "grad_norm": 0.22896322413389708, + "learning_rate": 2.000383959366105e-06, + "loss": 0.8307, + "num_tokens": 70149002578.0, + "step": 16783 + }, + { + "epoch": 1.9945335710041592, + "grad_norm": 0.2108710929312794, + "learning_rate": 2.0003681278161417e-06, + "loss": 0.8213, + "num_tokens": 70153191592.0, + "step": 16784 + }, + { + "epoch": 1.9946524064171123, + "grad_norm": 0.21012354050292845, + "learning_rate": 2.000352629552976e-06, + "loss": 0.7962, + "num_tokens": 70157381644.0, + "step": 16785 + }, + { + "epoch": 1.9947712418300654, + "grad_norm": 0.218534635064151, + "learning_rate": 2.000337464577182e-06, + "loss": 0.761, + "num_tokens": 70161571135.0, + "step": 16786 + }, + { + "epoch": 1.9948900772430185, + "grad_norm": 0.21821304104505318, + "learning_rate": 2.000322632889319e-06, + "loss": 0.7717, + "num_tokens": 70165760115.0, + "step": 16787 + }, + { + "epoch": 1.9950089126559716, + "grad_norm": 0.2155063732193492, + "learning_rate": 2.0003081344899376e-06, + "loss": 0.7735, + "num_tokens": 70169929950.0, + "step": 16788 + }, + { + "epoch": 1.9951277480689247, + "grad_norm": 0.22617531387308937, + "learning_rate": 2.0002939693795743e-06, + "loss": 0.7962, + "num_tokens": 70174108693.0, + "step": 16789 + }, + { + "epoch": 1.9952465834818776, + "grad_norm": 0.22360229442338525, + "learning_rate": 2.0002801375587546e-06, + "loss": 0.8095, + "num_tokens": 70178297745.0, + "step": 16790 + }, + { + "epoch": 1.9953654188948307, + "grad_norm": 0.2358740175167543, + "learning_rate": 2.0002666390279906e-06, + "loss": 0.8157, + "num_tokens": 70182487266.0, + "step": 16791 + }, + { + "epoch": 1.9954842543077838, + "grad_norm": 0.22642075355835334, + "learning_rate": 2.000253473787781e-06, + "loss": 0.7982, + "num_tokens": 70186642842.0, + "step": 16792 + }, + { + "epoch": 1.9956030897207366, + "grad_norm": 0.24124798562293076, + "learning_rate": 2.000240641838615e-06, + "loss": 0.8037, + "num_tokens": 70190824073.0, + "step": 16793 + }, + { + "epoch": 1.9957219251336897, + "grad_norm": 0.22069637746801832, + "learning_rate": 2.000228143180966e-06, + "loss": 0.7635, + "num_tokens": 70195013958.0, + "step": 16794 + }, + { + "epoch": 1.9958407605466428, + "grad_norm": 0.2270164846643172, + "learning_rate": 2.0002159778152986e-06, + "loss": 0.782, + "num_tokens": 70199203698.0, + "step": 16795 + }, + { + "epoch": 1.995959595959596, + "grad_norm": 0.23874288395901686, + "learning_rate": 2.0002041457420623e-06, + "loss": 0.8569, + "num_tokens": 70203393419.0, + "step": 16796 + }, + { + "epoch": 1.996078431372549, + "grad_norm": 0.2326732710893824, + "learning_rate": 2.0001926469616947e-06, + "loss": 0.8228, + "num_tokens": 70207583866.0, + "step": 16797 + }, + { + "epoch": 1.9961972667855021, + "grad_norm": 0.23202517428773178, + "learning_rate": 2.0001814814746235e-06, + "loss": 0.795, + "num_tokens": 70211772393.0, + "step": 16798 + }, + { + "epoch": 1.9963161021984552, + "grad_norm": 0.233507842967697, + "learning_rate": 2.00017064928126e-06, + "loss": 0.8305, + "num_tokens": 70215961529.0, + "step": 16799 + }, + { + "epoch": 1.9964349376114083, + "grad_norm": 0.24070423354950907, + "learning_rate": 2.000160150382008e-06, + "loss": 0.8164, + "num_tokens": 70220150494.0, + "step": 16800 + }, + { + "epoch": 1.9965537730243612, + "grad_norm": 0.24322631775384249, + "learning_rate": 2.000149984777254e-06, + "loss": 0.7897, + "num_tokens": 70224340919.0, + "step": 16801 + }, + { + "epoch": 1.9966726084373143, + "grad_norm": 0.2596386116690553, + "learning_rate": 2.000140152467375e-06, + "loss": 0.8232, + "num_tokens": 70228468127.0, + "step": 16802 + }, + { + "epoch": 1.9967914438502674, + "grad_norm": 0.2388323156908728, + "learning_rate": 2.000130653452736e-06, + "loss": 0.8136, + "num_tokens": 70232638255.0, + "step": 16803 + }, + { + "epoch": 1.9969102792632203, + "grad_norm": 0.21967783698825907, + "learning_rate": 2.0001214877336883e-06, + "loss": 0.7905, + "num_tokens": 70236826828.0, + "step": 16804 + }, + { + "epoch": 1.9970291146761734, + "grad_norm": 0.2377036108110131, + "learning_rate": 2.0001126553105705e-06, + "loss": 0.7931, + "num_tokens": 70241017984.0, + "step": 16805 + }, + { + "epoch": 1.9971479500891265, + "grad_norm": 0.23709914618541006, + "learning_rate": 2.0001041561837112e-06, + "loss": 0.7774, + "num_tokens": 70245197661.0, + "step": 16806 + }, + { + "epoch": 1.9972667855020796, + "grad_norm": 0.22146471906380316, + "learning_rate": 2.000095990353423e-06, + "loss": 0.7739, + "num_tokens": 70249360021.0, + "step": 16807 + }, + { + "epoch": 1.9973856209150327, + "grad_norm": 0.22305441965982925, + "learning_rate": 2.000088157820011e-06, + "loss": 0.8042, + "num_tokens": 70253547020.0, + "step": 16808 + }, + { + "epoch": 1.9975044563279858, + "grad_norm": 0.23769819735035896, + "learning_rate": 2.0000806585837633e-06, + "loss": 0.7886, + "num_tokens": 70257695047.0, + "step": 16809 + }, + { + "epoch": 1.9976232917409389, + "grad_norm": 0.23195719278896254, + "learning_rate": 2.000073492644959e-06, + "loss": 0.7591, + "num_tokens": 70261884014.0, + "step": 16810 + }, + { + "epoch": 1.997742127153892, + "grad_norm": 0.20704076864939389, + "learning_rate": 2.000066660003862e-06, + "loss": 0.8338, + "num_tokens": 70266050262.0, + "step": 16811 + }, + { + "epoch": 1.9978609625668449, + "grad_norm": 0.22481345669636155, + "learning_rate": 2.0000601606607264e-06, + "loss": 0.7964, + "num_tokens": 70270188786.0, + "step": 16812 + }, + { + "epoch": 1.997979797979798, + "grad_norm": 0.2165471670129782, + "learning_rate": 2.000053994615793e-06, + "loss": 0.825, + "num_tokens": 70274376716.0, + "step": 16813 + }, + { + "epoch": 1.998098633392751, + "grad_norm": 0.2116537026325789, + "learning_rate": 2.000048161869289e-06, + "loss": 0.8137, + "num_tokens": 70278566401.0, + "step": 16814 + }, + { + "epoch": 1.998217468805704, + "grad_norm": 0.23177966691916593, + "learning_rate": 2.000042662421431e-06, + "loss": 0.7967, + "num_tokens": 70282727980.0, + "step": 16815 + }, + { + "epoch": 1.998336304218657, + "grad_norm": 0.21246188802954544, + "learning_rate": 2.0000374962724236e-06, + "loss": 0.8282, + "num_tokens": 70286916089.0, + "step": 16816 + }, + { + "epoch": 1.9984551396316101, + "grad_norm": 0.21095366318416114, + "learning_rate": 2.000032663422456e-06, + "loss": 0.8314, + "num_tokens": 70291105155.0, + "step": 16817 + }, + { + "epoch": 1.9985739750445632, + "grad_norm": 0.2189670436074746, + "learning_rate": 2.00002816387171e-06, + "loss": 0.8101, + "num_tokens": 70295294731.0, + "step": 16818 + }, + { + "epoch": 1.9986928104575163, + "grad_norm": 0.22819703086532966, + "learning_rate": 2.00002399762035e-06, + "loss": 0.777, + "num_tokens": 70299433682.0, + "step": 16819 + }, + { + "epoch": 1.9988116458704694, + "grad_norm": 0.2145621576649253, + "learning_rate": 2.000020164668531e-06, + "loss": 0.7898, + "num_tokens": 70303623770.0, + "step": 16820 + }, + { + "epoch": 1.9989304812834225, + "grad_norm": 0.2167357768160367, + "learning_rate": 2.0000166650163953e-06, + "loss": 0.8123, + "num_tokens": 70307811732.0, + "step": 16821 + }, + { + "epoch": 1.9990493166963756, + "grad_norm": 0.2353868720616398, + "learning_rate": 2.0000134986640717e-06, + "loss": 0.8202, + "num_tokens": 70311980701.0, + "step": 16822 + }, + { + "epoch": 1.9991681521093287, + "grad_norm": 0.23335888042029507, + "learning_rate": 2.000010665611678e-06, + "loss": 0.7945, + "num_tokens": 70316158725.0, + "step": 16823 + }, + { + "epoch": 1.9992869875222816, + "grad_norm": 0.21634228243982323, + "learning_rate": 2.000008165859319e-06, + "loss": 0.804, + "num_tokens": 70320347860.0, + "step": 16824 + }, + { + "epoch": 1.9994058229352347, + "grad_norm": 0.2126240583070109, + "learning_rate": 2.0000059994070876e-06, + "loss": 0.7835, + "num_tokens": 70324533576.0, + "step": 16825 + }, + { + "epoch": 1.9995246583481876, + "grad_norm": 0.2131451756463517, + "learning_rate": 2.000004166255063e-06, + "loss": 0.7973, + "num_tokens": 70328721732.0, + "step": 16826 + }, + { + "epoch": 1.9996434937611407, + "grad_norm": 0.21681129279743472, + "learning_rate": 2.0000026664033144e-06, + "loss": 0.7771, + "num_tokens": 70332888212.0, + "step": 16827 + }, + { + "epoch": 1.9997623291740938, + "grad_norm": 0.2225259050994952, + "learning_rate": 2.000001499851897e-06, + "loss": 0.7761, + "num_tokens": 70337078439.0, + "step": 16828 + }, + { + "epoch": 1.999881164587047, + "grad_norm": 0.21636846139671886, + "learning_rate": 2.000000666600854e-06, + "loss": 0.8158, + "num_tokens": 70341247972.0, + "step": 16829 + }, + { + "epoch": 2.0, + "grad_norm": 0.21040463635159465, + "learning_rate": 2.000000166650215e-06, + "loss": 0.8291, + "num_tokens": 70345437135.0, + "step": 16830 + }, + { + "epoch": 2.0, + "step": 16830, + "total_flos": 2.85249316814848e+16, + "train_loss": 0.8500257841618516, + "train_runtime": 594460.0391, + "train_samples_per_second": 3.624, + "train_steps_per_second": 0.028 + } + ], + "logging_steps": 1, + "max_steps": 16830, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 842, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.85249316814848e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..7115df6 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e19e0e4c4c10cdcc3f0b94d55363762f578e871d9d1760f13eac69df89af6799 +size 7288